fixed qwen2 audio issues, works fine now (+3 squashed commits)

Squashed commits:

[b3053a1ba] updated lite

[5071630d6] fixed mtmd issues, audio works

[06efa5af4] fix mtmd compile
Author: Concedo
Date: 2025-07-12 12:46:58 +08:00
Parent: 5a3b2e3921
Commit: dca49de059
6 changed files with 218 additions and 90 deletions

View file

@@ -714,7 +714,7 @@ ttsmain: tools/tts/tts.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-op
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 gguf-split: tools/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-mtmd-cli: tools/mtmd/mtmd-cli.cpp tools/mtmd/mtmd.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+mtmd-cli: tools/mtmd/mtmd-cli.cpp tools/mtmd/mtmd.cpp tools/mtmd/mtmd-helper.cpp tools/mtmd/clip.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)

View file

@@ -3089,7 +3089,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep
 } else {
 if(debugmode==1 && !is_quiet)
 {
-printf("\nAudio Clip Embed Chunk %i used Tokens: %d",i,chunk.clp_image_tokens);
+printf("\nAudio Clip %i Embed Chunk used Tokens: %d",i,chunk.clp_image_tokens);
 }
 total_chunk_tokens += chunk.clp_image_tokens;
 media_objects[i].mediachunks.push_back(chunk);
@@ -3480,7 +3480,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
 TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
 bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
 TokenizeString("\n\n", media_sep, file_format, false);
-TokenizeString("\nImages:\n", media_intro, file_format, false);
+TokenizeString("\nAttached Media:\n", media_intro, file_format, false);
 if(media_composite_image_signature=="")
 {

View file

@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->
 <script id="init-config">
-const LITEVER = 259;
+const LITEVER = 260;
 const urlParams = new URLSearchParams(window.location.search);
 var localflag = urlParams.get('local'); //this will be replaced automatically in embedded kcpp
 const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -1261,28 +1261,28 @@ Current version indicated by LITEVER below.
 .zoomedimg
 {
 border-radius: 6%;
-width:462px;
-height:462px;
+width:420px;
+height:420px;
 }
 .zoomedimg.portrait
 {
-width:308px;
-height:462px;
+width:280px;
+height:420px;
 }
 .zoomedimg.portrait_long
 {
-width:231px;
-height:462px;
+width:210px;
+height:420px;
 }
 .zoomedimg.landscape
 {
-width:462px;
-height:308px;
+width:420px;
+height:280px;
 }
 .zoomedimg.landscape_long
 {
-width:462px;
-height:231px;
+width:420px;
+height:210px;
 }
 @media (max-width: 620px) {
 .zoomedimg {
@@ -3670,17 +3670,20 @@ Current version indicated by LITEVER below.
 //truncate to first 3 bytes
 return hsh.substring(0, hashBytes*2);
 };
-function b64_to_persistent_blob(data)
+function b64_to_persistent_blob(data, refhash="") //refhash will be calculated if not provided
 {
 if(!data || !data.startsWith("data:"))
 {
 return null;
 }
-let audiohash = cyrb_hash(data);
-let fetchedblob = data_hash_to_blob_lookup[audiohash];
+if(refhash=="")
+{
+refhash = cyrb_hash(data);
+}
+let fetchedblob = data_hash_to_blob_lookup[refhash];
 if(fetchedblob)
 {
-return fetchedblob;
+return fetchedblob.blob;
 }
 let splits = data.split(";base64,");
 let dtype = splits[0];
@@ -3695,7 +3698,7 @@ Current version indicated by LITEVER below.
 // Create Blob and URL
 const audioBlob = new Blob([bytes], {type: dtype});
 const audioUrl = URL.createObjectURL(audioBlob);
-data_hash_to_blob_lookup[audiohash] = audioUrl;
+data_hash_to_blob_lookup[refhash] = {"id":audioUrl,"original":data,"blob":audioUrl};
 return audioUrl;
 }
 function basic_lcg(seed) { // simple RNG for reproducible dice rolls
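For orientation, a minimal sketch (not part of the commit; the input value is hypothetical) of how the updated cache behaves per the two hunks above: data_hash_to_blob_lookup is now keyed by the media hash and each entry stores the object URL plus the original data URI, so other code can map a blob URL back to its source.

// Sketch only, assuming the shapes shown in the hunks above.
let exampleDataUri = "data:audio/mp3;base64,AAAA";      // hypothetical input
let hash = cyrb_hash(exampleDataUri);
let bloburl = b64_to_persistent_blob(exampleDataUri, hash); // object URL; cache hits return entry.blob
let entry = data_hash_to_blob_lookup[hash];
// entry is {"id": objectUrl, "original": exampleDataUri, "blob": objectUrl}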
@@ -4002,10 +4005,11 @@ Current version indicated by LITEVER below.
 e.preventDefault();
 e.stopPropagation();
 const file = files[0];
+const fname = files[0].name;
 const reader = new FileReader();
 reader.onload = function(img) {
 let origImg = img.target.result;
-self_upload_file_dispatch(origImg);
+self_upload_file_dispatch(origImg,fname);
 }
 reader.readAsDataURL(file);
 }
@@ -14663,21 +14667,21 @@ Current version indicated by LITEVER below.
 document.getElementById("addmediacontainer").classList.add("hidden");
 }
-function self_upload_file_dispatch(data)
+function self_upload_file_dispatch(data,filename)
 {
 if(data.startsWith("data:audio"))
 {
-self_upload_audio(data);
+self_upload_audio(data,filename);
 }
 else if(data.startsWith("data:image"))
 {
-self_upload_img(data);
+self_upload_img(data,filename);
 }else{
 msgbox("Unsupported File Format!\nOnly Image and Audio files are supported!","Unsupported File Format");
 }
 }
-function self_upload_img(origImg)
+function self_upload_img(origImg,filename)
 {
 let imgid = "selfuploadimg"+(Math.floor(10000 + Math.random() * 90000)).toString();
 let nimgtag = "[<|p|" + imgid + "|p|>]";
@@ -14719,7 +14723,7 @@ Current version indicated by LITEVER below.
 }, false, imgres,0.35,true);
 }
-function self_upload_audio(origAudio)
+function self_upload_audio(origAudio,filename)
 {
 let imgid = "selfuploadaudio"+(Math.floor(10000 + Math.random() * 90000)).toString();
 let nimgtag = "[<|p|" + imgid + "|p|>]";
@@ -14737,10 +14741,12 @@ Current version indicated by LITEVER below.
 image_db[imgid] = { done: false, queue: "Processing", result: "", prompt:"", poll_category:0 };
 image_db[imgid].aspect = 0;
 image_db[imgid].imsource = 1; //0=generated,1=uploaded
-image_db[imgid].imrefid = "";
-convertAudioToCompressedBase64(origAudio,(newAudio)=>{
+image_db[imgid].imrefid = filename;
+image_db[imgid].len = 0;
+convertAudioToCompressedBase64(origAudio,(newAudio,duration)=>{
 image_db[imgid].done = true;
 image_db[imgid].result = newAudio;
+image_db[imgid].len = duration;
 });
 }
@@ -14760,7 +14766,7 @@ Current version indicated by LITEVER below.
 var reader = new FileReader();
 reader.onload = function(event){
 let origImg = event.target.result;
-self_upload_file_dispatch(origImg);
+self_upload_file_dispatch(origImg,"");
 };
 reader.readAsDataURL(blob);
 founditem = true;
@@ -14789,10 +14795,11 @@ Current version indicated by LITEVER below.
 console.log(files);
 if (files.length > 0 && files[0] != null && files[0].name && files[0].name != "") {
 const file = files[0];
+const fname = files[0].name;
 const reader = new FileReader();
 reader.onload = function(img) {
 let origImg = img.target.result;
-self_upload_file_dispatch(origImg);
+self_upload_file_dispatch(origImg, fname);
 }
 reader.readAsDataURL(file);
 document.getElementById("pasteimgcontainer").classList.add("hidden");
@@ -14875,7 +14882,7 @@ Current version indicated by LITEVER below.
 const sy = (videoHeight - sideLength) / 2;
 context.drawImage(video, sx, sy, sideLength, sideLength, 0, 0, 512, 512);
 const dataURL = canvas.toDataURL('image/png');
-self_upload_file_dispatch(dataURL); // Call your upload function
+self_upload_file_dispatch(dataURL,""); // Call your upload function
 hide_popups();
 }
@@ -14886,10 +14893,11 @@ Current version indicated by LITEVER below.
 finput.onchange = (event) => {
 if (event.target.files.length > 0 && event.target.files[0]) {
 const file = event.target.files[0];
+const fname = file.name;
 const reader = new FileReader();
 reader.onload = function(img) {
 let origImg = img.target.result;
-self_upload_file_dispatch(origImg);
+self_upload_file_dispatch(origImg,fname);
 }
 reader.readAsDataURL(file);
 }
@@ -16345,6 +16353,10 @@ Current version indicated by LITEVER below.
 {
 submit_payload.params.images = insertAIVisionImages;
 }
+if(is_using_kcpp_with_llava() && insertAIAudioSounds.length>0)
+{
+submit_payload.params.audio = insertAIAudioSounds;
+}
 if(localsettings.sampler_seed>=1)
 {
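A rough sketch of what the generation payload can now carry when audio is attached; the field names come from the hunk above, everything else (the object itself, the prompt value) is hypothetical illustration.

// Hypothetical illustration of the submit_payload shape after this change.
// params.images and params.audio hold raw base64 payloads (no "data:..." prefix),
// collected by concat_gametext into insertAIVisionImages / insertAIAudioSounds.
let submit_payload_example = {
    prompt: "...",
    params: {
        images: ["<base64 image data>"],
        audio: ["<base64 mp3 data>"]
    }
};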
@@ -17689,6 +17701,51 @@ Current version indicated by LITEVER below.
 }
 }
+function zoomed_transcribe_btn(audiohash,onDone)
+{
+let fetchedblob = data_hash_to_blob_lookup[audiohash];
+if(!fetchedblob)
+{
+return;
+}
+fetch(fetchedblob.blob)
+.then(x => x.blob())
+.then(completeRecording => {
+audioBlobToDecodedAudioBuffer(completeRecording,(buffer)=>{
+resampleAudioBuffer(buffer,16000,(rsBuffer)=>{
+let wavblob = audioBufferToWavBlob(rsBuffer);
+const reader = new FileReader();
+reader.onload = function(audiodata) {
+let dataurl = audiodata.target.result;
+let payload = {
+"audio_data": dataurl,
+"prompt": "",
+"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false),
+"langcode": document.getElementById("voice_langcode").value
+};
+fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
+method: 'POST',
+headers: {
+'Content-Type': 'application/json',
+},
+body: JSON.stringify(payload),
+})
+.then(x => x.json())
+.then(resp => {
+console.log(resp);
+if(resp && resp.text && resp.text!="")
+{
+msgbox(resp.text,"Transcribed Audio");
+}
+}).catch((error) => {
+console.log("Transcribe Error: " + error);
+});
+}
+reader.readAsDataURL(wavblob);
+});
+});
+});
+}
 function toggle_ai_vision(imghash)
 {
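As an aside, a condensed summary of the round-trip the new helper performs; the field names and identifiers are the ones visible in the hunk above, and the endpoint constant is defined elsewhere in Lite, so nothing new is specified here.

// Comment-only summary of the flow implemented above (no new behaviour implied):
// 1. look up the cached blob URL for the clicked audio hash
// 2. fetch and decode it, resample to 16000 Hz, re-encode as WAV
// 3. POST JSON {audio_data, prompt, suppress_non_speech, langcode}
//    to custom_kobold_endpoint + koboldcpp_transcribe_endpoint
// 4. show resp.text in a msgbox when transcription succeeds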
@@ -17700,7 +17757,7 @@ Current version indicated by LITEVER below.
 {
 //request a new interrogation
 var alreadysent = Object.values(interrogation_db).some(item => item.imghash === imghash);
-if(!alreadysent)
+if(!alreadysent && document.getElementById("zoomedimg"))
 {
 let b64 = document.getElementById("zoomedimg").src;
 interrogate_new_image(b64,imghash,(savedmeta.visionmode==1));
@@ -17719,10 +17776,33 @@ Current version indicated by LITEVER below.
 let savedmeta = completed_imgs_meta[imghash];
 if(!savedmeta && imghash!="")
 {
-savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0};
+savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0, ref:"", len:0};
+}
+if(!savedmeta.visionmode)
+{
+savedmeta.visionmode = 0;
+}
+let hasllava = is_using_kcpp_with_llava();
+let visionstatus = "";
+if(savedmeta.visionmode==3)
+{
+if(custom_kobold_endpoint!="") //on a kobo endpoint
+{
+visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(hasllava?`<span class="color_green">Active</span>`:`<span class="color_yellow">Unsupported</span>`));
+}
+else
+{
+let isoai = (custom_oai_key!="" && document.getElementById("useoaichatcompl").checked);
+let isgemini = (custom_gemini_key!="");
+visionstatus = (isoai?`<span class="color_green">OpenAI API (Conditional)</span>`:(isgemini?`<span class="color_green">Gemini API (Conditional)</span>`:`<span class="color_yellow">Unsupported</span>`));
+}
+}
+else
+{
+visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(savedmeta.desc?`<span class="color_green">Active</span>`:`<span class="color_yellow">Analyzing</span>`));
 }
-if(savedmeta)
+if(savedmeta && document.getElementById("zoomedimg"))
 {
 document.getElementById("zoomedimg").classList.remove("portrait");
 document.getElementById("zoomedimg").classList.remove("landscape");
@@ -17745,32 +17825,8 @@ Current version indicated by LITEVER below.
 document.getElementById("zoomedimg").classList.add("landscape_long");
 }
-if(!savedmeta.visionmode)
-{
-savedmeta.visionmode = 0;
-}
 let origprompt = (savedmeta.prompt?replaceAll(savedmeta.prompt,"\n"," ") : "No Saved Description");
 latest_orig_prompt = origprompt;
-let hasllava = is_using_kcpp_with_llava();
-let visionstatus = "";
-if(savedmeta.visionmode==3)
-{
-if(custom_kobold_endpoint!="") //on a kobo endpoint
-{
-visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(hasllava?`<span class="color_green">Active</span>`:`<span class="color_yellow">Unsupported</span>`));
-}
-else
-{
-let isoai = (custom_oai_key!="" && document.getElementById("useoaichatcompl").checked);
-let isgemini = (custom_gemini_key!="");
-visionstatus = (isoai?`<span class="color_green">OpenAI API (Conditional)</span>`:(isgemini?`<span class="color_green">Gemini API (Conditional)</span>`:`<span class="color_yellow">Unsupported</span>`));
-}
-}
-else
-{
-visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(savedmeta.desc?`<span class="color_green">Active</span>`:`<span class="color_yellow">Analyzing</span>`));
-}
 let togglebtn = `<select class="form-control" id="aivisionmode" style="display:inline;height:24px;width: 140px; padding: 2px; margin: 3px; font-size:12px;" onchange="toggle_ai_vision(\'`+imghash+`\')">
 <option value="0">Disabled</option>
@@ -17786,6 +17842,25 @@ Current version indicated by LITEVER below.
 `;
 document.getElementById("aivisionmode").value = savedmeta.visionmode;
 }
+else if(savedmeta && document.getElementById("zoomedaudio"))
+{
+let transcribebtn = "";
+if(is_using_kcpp_with_whisper())
+{
+transcribebtn = `<button type="button" class="btn btn-primary" style="width: 140px; padding: 2px; margin: 3px; font-size:12px;" onclick="zoomed_transcribe_btn(\'`+imghash+`\')">Transcribe Audio</button>`;
+}
+let togglebtn = `<select class="form-control" id="aivisionmode" style="display:inline;height:24px;width: 140px; padding: 2px; margin: 3px; font-size:12px;" onchange="toggle_ai_vision(\'`+imghash+`\')">
+<option value="0">Disabled</option>
+<option value="3">Multimodal Audio</option>
+</select>`;
+document.getElementById("zoomedimgdesc").innerHTML = `
+AI Embed Audio: `+visionstatus+` <span class="helpicon">?<span class="helptext">Allows the AI to hear and react to this audio (on supported models). Transcribe tries to replace the audio file with detected speech.</span></span>
+${togglebtn}
+<br>
+${transcribebtn}
+`;
+document.getElementById("aivisionmode").value = savedmeta.visionmode;
+}
 else
 {
 document.getElementById("zoomedimgdesc").innerText = "No Saved Data";
@@ -17801,7 +17876,7 @@ Current version indicated by LITEVER below.
 {
 inputBox("Enter prompt to create a new image, based on this source image.","Create Img2Img","","Enter Img2Img Prompt",()=>{
 let userinput = getInputBoxValue();
-if(userinput.trim()!="")
+if(userinput.trim()!="" && document.getElementById("zoomedimg"))
 {
 var sentence = userinput.trim().substring(0, 380);
 let b64 = document.getElementById("zoomedimg").src;
@@ -17812,26 +17887,60 @@ Current version indicated by LITEVER below.
 }
 function click_image(target,imghash)
 {
+clear_zoomed_img_and_audio();
 if(target)
 {
-if(localsettings.invert_colors)
-{
-document.getElementById("zoomedimg").classList.add("invert_colors");
-}else{
-document.getElementById("zoomedimg").classList.remove("invert_colors");
-}
 document.getElementById("zoomedimgcontainer").classList.remove("hidden");
-document.getElementById("zoomedimg").src = target.src;
+let src = `<img class="zoomedimg ${localsettings.invert_colors?"invert_colors":""}" id="zoomedimg" src="${target.src}">`;
+document.getElementById("zoomedimgdiv").innerHTML = src;
+document.getElementById("zoomedimgdiv").classList.remove("hidden");
+document.getElementById("zoomedaudiodiv").classList.add("hidden");
 update_clicked_image(imghash);
 }
 }
-function delete_curr_image()
+function click_audio(target,audiohash,audioblob)
 {
-let removesrc = document.getElementById("zoomedimg").src;
-if (removesrc && removesrc != "") {
-var matchingStr = ("[<|d|" + removesrc + "|d|>]")
+clear_zoomed_img_and_audio();
+if(target)
+{
+document.getElementById("zoomedimgcontainer").classList.remove("hidden");
+document.getElementById("zoomedimgdiv").classList.add("hidden");
+document.getElementById("zoomedaudiodiv").classList.remove("hidden");
+let src = `<div><audio controls title="AudioPlayer"><source src="${audioblob}" id="zoomedaudio" type="audio/mp3"></audio></div>`;
+document.getElementById("zoomedaudiodiv").innerHTML = src;
+update_clicked_image(audiohash);
+}
+}
+function clear_zoomed_img_and_audio()
+{
+document.getElementById("zoomedimgdiv").innerHTML = "";
+document.getElementById("zoomedaudiodiv").innerHTML = "";
+}
+function delete_curr_media()
+{
+let zoomedimg = document.getElementById("zoomedimg");
+let zoomedaudio = document.getElementById("zoomedaudio");
+let targettoremove = "";
+if (zoomedimg && zoomedimg.src && zoomedimg.src !="") {
+targettoremove = zoomedimg.src;
+}
+else if(zoomedaudio && zoomedaudio.src && zoomedaudio.src !="")
+{
+let blobid = zoomedaudio.src;
+for(v in data_hash_to_blob_lookup)
+{
+let itm = data_hash_to_blob_lookup[v];
+if(itm.id==blobid)
+{
+targettoremove = itm.original;
+break;
+}
+}
+}
+if(targettoremove)
+{
+var matchingStr = ("[<|d|" + targettoremove + "|d|>]")
 for (let i = 0; i < gametext_arr.length; ++i) {
 if (gametext_arr[i].includes(matchingStr)) {
 gametext_arr[i] = gametext_arr[i].replace(matchingStr, "");
@@ -17883,9 +17992,17 @@ Current version indicated by LITEVER below.
 function render_audio_html(data)
 {
-let audioblob = b64_to_persistent_blob(data);
-const reinvertcolor = localsettings.invert_colors?"invert_colors":"";
-const str = `<div style="display:flex" class="${reinvertcolor}" contenteditable="false"><audio style="display:flex" controls title="Audio Player"><source src="${audioblob}" type="audio/mp3"></audio></div>`;
+let audiohash = cyrb_hash(data).trim();
+let audioblob = b64_to_persistent_blob(data,audiohash);
+let filename = "";
+let len = 0;
+if (completed_imgs_meta[audiohash] != null) {
+filename = completed_imgs_meta[audiohash].ref;
+len = completed_imgs_meta[audiohash].len;
+}
+let fndisp = filename!=""?`(${filename.substring(0,50)}) `:"";
+fndisp = len?(`: ${Math.floor(len)}s ${fndisp}`):fndisp;
+const str = `<span><br><button type="button" title="Attached Audio" class="btn btn-primary" style="font-size:12px; padding:8px 8px; border-radius: 16px" onclick="return click_audio(this,\'${audiohash}\',\'${audioblob}\');">Attached Audio ${fndisp}🔊</button><br></span>`;
 return str;
 }
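For context, a rough walkthrough (values are hypothetical) of the wiring established above: the same cyrb_hash of the inline data URI keys both the blob cache and the saved metadata, which is what lets the rendered button recover the filename, length, and playback source.

// Hypothetical walkthrough: one hash keys both tables.
let data = "data:audio/mp3;base64,AAAA";          // stored inline in gametext as [<|d|...|d|>]
let hash = cyrb_hash(data).trim();                 // used as the media id
let blob = b64_to_persistent_blob(data, hash);     // blob URL for playback, cached in data_hash_to_blob_lookup
let meta = completed_imgs_meta[hash];              // {ref: filename, len: seconds, visionmode, ...} if previously saved
// The rendered button calls click_audio(this, hash, blob), which shows the player
// and lets update_clicked_image(hash) display the saved metadata.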
@@ -18472,7 +18589,7 @@ Current version indicated by LITEVER below.
 gametext_arr[i] = gametext_arr[i].replace(matchstr, newstr);
 let metaid = cyrb_hash(img.result);
 //default to llava if supported, and image is self uploaded
-completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:((image_db[key].imsource==1 && is_using_kcpp_with_llava())?3:0), aspect:image_db[key].aspect};
+completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:((image_db[key].imsource==1 && is_using_kcpp_with_llava())?3:0), aspect:image_db[key].aspect, ref:image_db[key].imrefid, len:image_db[key].len};
 delete image_db[key];
 }
 }
@@ -18686,13 +18803,13 @@ Current version indicated by LITEVER below.
 }
 // AUDIO MANIPULATION FUNCTIONS
-//convert any audio to a webm blob (high compression), returns a promise
+//convert any audio to a webm blob (high compression)
 function convertAudioToCompressedBase64(inputBase64, onDone) {
 // Step 1: Convert base64 string to Blob
 const matches = inputBase64.match(/^data:(audio\/[a-zA-Z0-9-]+);base64,(.+)$/);
 if (!matches) {
 console.log("Convert Audio: Invalid base64 input");
-onDone(null);
+onDone(null,null);
 }
 const mimeType = matches[1];
@@ -18712,6 +18829,7 @@ Current version indicated by LITEVER below.
 audioContext.decodeAudioData(arrayBuffer, function (buffer) {
 const samplefreq = buffer.sampleRate;
 const samples = buffer.getChannelData(0); // mono
+const durationInSeconds = buffer.duration;
 const mp3encoder = new lamejs.Mp3Encoder(1, samplefreq, 40); // mono, 16kHz, 40kbps
 const sampleBlockSize = 1152; //can be anything but make it a multiple of 576 to make encoders life easier
 let mp3Data = [];
@@ -18737,12 +18855,12 @@ Current version indicated by LITEVER below.
 const fileReader = new FileReader();
 fileReader.onloadend = function () {
 const mp3Base64 = fileReader.result;
-onDone(mp3Base64);
+onDone(mp3Base64,durationInSeconds);
 };
 fileReader.readAsDataURL(mp3Blob);
 }, function (err) {
 console.log("Audio decode failed.");
-onDone(null);
+onDone(null,null);
 });
 };
 reader.readAsArrayBuffer(inputBlob);
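A brief usage sketch of the updated callback signature, which now reports the decoded clip length alongside the compressed MP3 data URL; the caller shown here is hypothetical, though imgid mirrors the self_upload_audio hunk earlier.

// Hypothetical caller: the second argument is the duration in seconds (or null on failure).
convertAudioToCompressedBase64(uploadedDataUrl, (newAudio, duration) => {
    if (!newAudio) { return; }         // conversion failed
    image_db[imgid].result = newAudio;  // compressed base64 mp3
    image_db[imgid].len = duration;     // later shown as the "Ns" label in render_audio_html
});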
@@ -19574,6 +19692,7 @@ Current version indicated by LITEVER below.
 }
 var insertAIVisionImages = []; //concat gametext will populate this
+var insertAIAudioSounds = [];
 function concat_gametext(stripimg = false, stripimg_replace_str = "", append_before_segment="",append_after_segment="",escapeTxt=false,insertAIVision=false) {
 let fulltxt = "";
 for (let i = 0; i < gametext_arr.length; ++i) {
@@ -19637,6 +19756,7 @@ Current version indicated by LITEVER below.
 if(insertAIVision)
 {
 insertAIVisionImages = []; //a bit hacky
+insertAIAudioSounds = [];
 fulltxt = fulltxt.replace(/\[<\|d\|.+?\|d\|>\]/g, function (m) {
 // m here means the whole matched string
 let inner = m.substring(5, m.length - 5);
@@ -19649,11 +19769,18 @@ Current version indicated by LITEVER below.
 }
 else if(foundmeta.visionmode==3)
 {
+let placeholder = "";
 let parts = inner.split(',');
 if (parts.length === 2 && parts[0].startsWith('data:image')) {
 insertAIVisionImages.push(parts[1]);
+placeholder = "\n(Attached Image)\n";
 }
-return "\n(Attached Image)\n";
+else if(parts.length === 2 && parts[0].startsWith('data:audio'))
+{
+insertAIAudioSounds.push(parts[1]);
+placeholder = "\n(Attached Audio)\n";
+}
+return placeholder;
 }
 }
 return "";
@@ -25798,11 +25925,12 @@ Current version indicated by LITEVER below.
 <div class="popupbg flex"></div>
 <div class="nspopup flexsize highest">
 <div class="popuptitlebar">
-<div class="popuptitletext">Image Information</div>
+<div class="popuptitletext">Media Information</div>
 </div>
-<div class="zoomedimgdiv">
-<img class="zoomedimg" id="zoomedimg" src="">
+<div id="zoomedimgdiv" class="zoomedimgdiv">
+</div>
+<div id="zoomedaudiodiv" class="zoomedimgdiv">
 </div>
 <div class="menutext zoomedimgdesc" id="zoomedimgdesc" style="word-wrap: break-word;">
@@ -25810,8 +25938,8 @@ Current version indicated by LITEVER below.
 </div>
 <br>
 <div class="popupfooter">
-<button type="button" class="bg_red btn btn-primary" style="width: 124px;" onclick="delete_curr_image();hide_popups();">Delete Image</button>
-<button type="button" class="btn btn-primary" onclick="hide_popups()">Close</button>
+<button type="button" class="bg_red btn btn-primary" style="width: 124px;" onclick="delete_curr_media();clear_zoomed_img_and_audio();hide_popups();">Delete Media</button>
+<button type="button" class="btn btn-primary" onclick="clear_zoomed_img_and_audio();hide_popups()">Close</button>
 </div>
 </div>
 </div>

View file

@@ -454,7 +454,7 @@ bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const whi
 batch_f32.entries.push_back(std::move(mel_f32));
 int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
-float * audio_embd = (float *)malloc(n_tokens * n_mmproj_embd);
+float * audio_embd = (float *)malloc(n_tokens * n_mmproj_embd * sizeof(float));
 bool ok = clip_image_batch_encode(
 ctx_clip,
 n_threads,
@@ -462,5 +462,5 @@ bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const whi
 audio_embd);
 *image_embd_out = audio_embd;
 *n_img_pos_out = n_tokens;
-return ok ? 0 : 1;
+return ok;
 }

View file

@@ -29,7 +29,7 @@
 #define MA_API static
 #include "miniaudio/miniaudio.h"
-#define STB_IMAGE_IMPLEMENTATION
+// #define STB_IMAGE_IMPLEMENTATION
 #include "stb/stb_image.h"
 #define LOG_INF(...) fprintf(stdout, __VA_ARGS__)