KV snapshots now save and load the last logits for correctness. Added some text for the music UI; updated docs.

This commit is contained in:
Concedo 2026-03-04 21:57:28 +08:00
parent 54cf43ae64
commit 4f1b22c415
4 changed files with 178 additions and 14 deletions

View file

@ -1644,6 +1644,144 @@
},
}
},
"/api/extra/music/prepare": {
"post": {
"summary": "Creates song generation parameters such as caption, lyrics, BPM and duration",
"description": "Creates song generation parameters such as caption, lyrics, BPM and duration. This should be called to produce the generation input for /api/extra/music/generate",
"requestBody": {
"content": {
"application/json": {
"example": {
"caption": "An emotional rap song about the kobold war."
},
"schema": {
"properties": {
"caption": {
"type": "string",
"description": "A short description of the song to create"
}
},
"type": "object"
}
}
},
"required": true
},
"tags": [
"api/extra"
],
"responses": {
"200": {
"content": {
"application/json": {
"example":
{
"caption": "A melancholic and narrative-driven rap track built around a heavy beat",
"lyrics": "\n[Verse 1]\nKobold is love, yeah, kobold is life\nKobold is the only way we survive\nShe said okay, yo, keep it that way\nIn kobold we trust, just kobold today\n\n[Chorus]\nKobold, kobold, we carry on now\nKobold, kobold, each way and how\nKobold, kobold, right here we go\nKobold, kobold, all that I know.\n\n[Outro]",
"bpm": 120,
"duration": 64.0,
"keyscale": "G minor",
"timesignature": "2",
"vocal_language": "en",
"task_type": "text2music",
"seed": 622315,
"thinking": false,
"lm_temperature": 0.85,
"lm_cfg_scale": 2.0,
"lm_top_p": 0.9,
"lm_negative_prompt": "",
"inference_steps": 8,
"guidance_scale": 1.0,
"shift": 3.0,
"audio_codes": ""
},
"schema": {
"properties": {},
"type": "object"
}
}
},
"description": "Successful request"
}
}
}
},
"/api/extra/music/generate": {
"post": {
"summary": "Generates music based on provided captions, lyrics and configurations",
"description": "Generates music based on provided captions, lyrics and configurations. The config can be generated using /api/extra/music/prepare or crafted manually",
"requestBody": {
"content": {
"application/json": {
"example": {
"caption": "A melancholic and narrative-driven rap track built around a heavy beat",
"lyrics": "\n[Verse 1]\nKobold is love, yeah, kobold is life\nKobold is the only way we survive\nShe said okay, yo, keep it that way\nIn kobold we trust, just kobold today\n\n[Chorus]\nKobold, kobold, we carry on now\nKobold, kobold, each way and how\nKobold, kobold, right here we go\nKobold, kobold, all that I know.\n\n[Outro]",
"bpm": 120,
"duration": 64.0,
"keyscale": "G minor",
"timesignature": "2",
"vocal_language": "en",
"inference_steps": 8
},
"schema": {
"properties": {
"caption": {
"type": "string",
"description": "A short description of the song to create"
},
"lyrics": {
"type": "string",
"description": "The full lyrics of the song to generate"
},
"bpm": {
"type": "number",
"description": "The song Beats Per Minute"
},
"duration": {
"type": "number",
"description": "The length of the song, in seconds."
},
"keyscale": {
"type": "string",
"description": "The musical key of the song."
},
"timesignature": {
"type": "string",
"description": "The musical time signature of the song."
},
"vocal_language": {
"type": "string",
"description": "The language of the song lyrics."
},
"inference_steps": {
"type": "number",
"description": "How many diffusion steps to use."
}
},
"type": "object"
}
}
},
"required": true
},
"tags": [
"api/extra"
],
"responses": {
"200": {
"content": {
"audio/wav": {
"schema": {
"type": "string",
"format": "binary"
}
}
},
"description": "Successful request"
}
}
}
},
"/api/extra/json_to_grammar": {
"post": {
"summary": "Converts a provided JSON schema into GBNF grammar.",

View file

@ -23,7 +23,7 @@ body{
color:var(--text);
}
header{
padding:16px 20px;
padding:16px 12px;
font-size:20px;
font-weight:600;
background:rgba(0,0,0,0.3);
@ -32,8 +32,8 @@ header{
.wrapper{
display:grid;
grid-template-columns:minmax(340px,500px) 1fr;
gap:20px;
padding:20px;
gap:14px;
padding:10px;
}
@media(max-width:1100px){
.wrapper{grid-template-columns:1fr;}
@ -45,7 +45,7 @@ header{
box-shadow:0 10px 40px rgba(0,0,0,.4);
}
h2{
margin:0 0 14px 0;
margin:0 0 10px 0;
font-size:16px;
color:var(--accent2);
}
@ -112,7 +112,7 @@ button{
audio{width:100%;margin-top:6px;}
.advanced-toggle{
margin-top:8px;
font-size:12px;
font-size:14px;
cursor:pointer;
color:var(--accent2);
}
@ -174,11 +174,11 @@ input[type="checkbox"] {
<h2>Song Setup</h2>
<label>Caption</label>
<input id="caption">
<input id="caption" placeholder="Describe the song">
<div style="margin-top:10px">
<label>Lyrics</label>
<textarea id="lyrics"></textarea>
<textarea id="lyrics" placeholder="Enter song lyrics, or press 'Plan' to generate them."></textarea>
</div>
<div class="form-grid" style="margin-top:12px">
@ -217,7 +217,7 @@ input[type="checkbox"] {
<div class="actions" id="actionContainer">
<div id="normalActions" style="display:flex; gap:10px; flex-wrap:wrap;">
<button class="secondary" onclick="planSong()">Plan</button>
<button class="primary" onclick="planSong()">Plan</button>
<button class="primary" onclick="generateSong()">Generate</button>
<button class="danger" onclick="clearFields()">Clear</button>
<button onclick="exportPlan()">Export JSON</button>
@ -229,6 +229,10 @@ input[type="checkbox"] {
<input type="file" id="importFile" hidden accept="application/json" onchange="importPlan(event)">
</div>
<div>
<p style="font-size:14px">Click 'Plan' first to generate lyrics, BPM and duration. Edit as needed.
<br>When satisfied, click 'Generate' to create the music.</p>
</div>
</div>

View file

@ -127,6 +127,7 @@ static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
static bool is_quiet = false;
static std::vector<gpt_vocab::id> last_n_tokens;
static std::vector<gpt_vocab::id> current_context_tokens;
static std::vector<float> loaded_latest_logits; //do not use normally, this is only required when loading state happens and we need to override logits
static size_t mem_per_token = 0;
static std::vector<float> logits;
static std::vector<int> smartcontext;
@ -4668,12 +4669,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
}
while(logits_sampled<logits_to_sample && remaining_tokens>0 && !abort_draft && !early_abort)
{
if(!firstdecodedone && current_context_tokens.size()>0)
{
embd.clear();
embd.push_back(current_context_tokens[current_context_tokens.size()-1]);
break;
}
if(logits_sampled>0)
{
//this is not the first loop, so we need to increment some things
@ -4708,6 +4703,28 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
lowestLogit = LowestLogit(logits);
}
if(!firstdecodedone && current_context_tokens.size()>0)
{
if(loaded_latest_logits.size()>0)
{
if(debugmode==1 && !is_quiet)
{
printf("\nLoading %d saved logits...\n",loaded_latest_logits.size());
}
//first decode was not done. this can happen when reloading from a perfectly matched state.
//to prevent a catastrophic failure, we must prepare emergency logits for usage
logitsPtr = loaded_latest_logits.data();
lowestLogit = LowestLogit(logitsPtr,n_vocab);
}
else
{
printf("\nNo cached logits and we need them, emergency fallback with degraded quality...\n");
embd.clear();
embd.push_back(current_context_tokens[current_context_tokens.size()-1]);
break;
}
}
//if adaptive p sampling is used, we need to cache the original probabilities
std::vector<llama_token_data> original_candidates;
if(adaptive_target > 0.0f)
@ -5237,6 +5254,7 @@ size_t gpttype_save_state_kv(int slot)
savestates[slot].current_savestate_buffer.clear();
savestates[slot].current_draft_savestate_buffer.clear();
savestates[slot].savestate_context_tokens.clear();
savestates[slot].latest_logits.clear();
savestates[slot].current_savestate_size = 0;
savestates[slot].current_draft_savestate_size = 0;
savestates[slot].media_signature = "";
@ -5258,6 +5276,8 @@ size_t gpttype_save_state_kv(int slot)
savestates[slot].current_savestate_size = newsize;
savestates[slot].savestate_context_tokens = current_context_tokens;
savestates[slot].media_signature = media_composite_image_signature;
float * lgptr = llama_get_logits(llama_ctx_v4);
savestates[slot].latest_logits.assign(lgptr,lgptr+n_vocab);
int maxedpos = llama_memory_seq_pos_max(llama_get_memory(llama_ctx_v4),0);
//kcpp: so maxedpos appears to always be equal to ctx tokens - 2, if savestate_ctx_tokens > maxedpos + 2 then trim excess
if(maxedpos > 0 && savestates[slot].savestate_context_tokens.size() > maxedpos + 2)
@ -5316,6 +5336,7 @@ bool gpttype_load_state_kv(int slot)
if(res > 0)
{
current_context_tokens = savestates[slot].savestate_context_tokens;
loaded_latest_logits = savestates[slot].latest_logits;
printf("\nKV Load SaveState %d: Restored KV with %zu tokens.\n", slot,current_context_tokens.size());
if(draft_ctx && savestates[slot].current_draft_savestate_size>0)
{

View file

@ -537,6 +537,7 @@ struct savestate_data
size_t current_draft_savestate_size = 0;
std::vector<uint8_t> current_draft_savestate_buffer;
std::vector<gpt_vocab::id> savestate_context_tokens; //for context clones
std::vector<float> latest_logits;
int64_t last_used = 0; //unix timestamp, updated on save or load
std::string media_signature = "";
};