diff --git a/Makefile b/Makefile
index dd93602e0..9f912d812 100644
--- a/Makefile
+++ b/Makefile
@@ -714,7 +714,7 @@ ttsmain: tools/tts/tts.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-op
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 gguf-split: tools/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-mtmd-cli: tools/mtmd/mtmd-cli.cpp tools/mtmd/mtmd.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+mtmd-cli: tools/mtmd/mtmd-cli.cpp tools/mtmd/mtmd.cpp tools/mtmd/mtmd-helper.cpp tools/mtmd/clip.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index d66edd4d1..7bf9fb358 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -3089,7 +3089,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep
                     } else {
                         if(debugmode==1 && !is_quiet)
                         {
-                            printf("\nAudio Clip Embed Chunk %i used Tokens: %d",i,chunk.clp_image_tokens);
+                            printf("\nAudio Clip %i Embed Chunk used Tokens: %d",i,chunk.clp_image_tokens);
                         }
                         total_chunk_tokens += chunk.clp_image_tokens;
                         media_objects[i].mediachunks.push_back(chunk);
@@ -3480,7 +3480,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
     bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
     TokenizeString("\n\n", media_sep, file_format, false);
-    TokenizeString("\nImages:\n", media_intro, file_format, false);
+    TokenizeString("\nAttached Media:\n", media_intro, file_format, false);
 
     if(media_composite_image_signature=="")
     {
diff --git a/klite.embd b/klite.embd
index 36c8239ed..822194c02 100644
--- a/klite.embd
+++ b/klite.embd
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->
 
 <script id="init-config">
-	const LITEVER = 259;
+	const LITEVER = 260;
 	const urlParams = new URLSearchParams(window.location.search);
 	var localflag = urlParams.get('local'); //this will be replaced automatically in embedded kcpp
 	const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -1261,28 +1261,28 @@ Current version indicated by LITEVER below.
 	.zoomedimg
 	{
 		border-radius: 6%;
-		width:462px;
-		height:462px;
+		width:420px;
+		height:420px;
 	}
 	.zoomedimg.portrait
 	{
-		width:308px;
-		height:462px;
+		width:280px;
+		height:420px;
 	}
 	.zoomedimg.portrait_long
 	{
-		width:231px;
-		height:462px;
+		width:210px;
+		height:420px;
 	}
 	.zoomedimg.landscape
 	{
-		width:462px;
-		height:308px;
+		width:420px;
+		height:280px;
 	}
 	.zoomedimg.landscape_long
 	{
-		width:462px;
-		height:231px;
+		width:420px;
+		height:210px;
 	}
 	@media (max-width: 620px) {
 		.zoomedimg {
@@ -3670,17 +3670,20 @@ Current version indicated by LITEVER below.
 		//truncate to first 3 bytes
 		return hsh.substring(0, hashBytes*2);
 	};
-	function b64_to_persistent_blob(data)
+	function b64_to_persistent_blob(data, refhash="") //refhash will be calculated if not provided
 	{
 		if(!data || !data.startsWith("data:"))
 		{
 			return null;
 		}
-		let audiohash = cyrb_hash(data);
-		let fetchedblob = data_hash_to_blob_lookup[audiohash];
+		if(refhash=="")
+		{
+			refhash = cyrb_hash(data);
+		}
+		let fetchedblob = data_hash_to_blob_lookup[refhash];
 		if(fetchedblob)
 		{
-			return fetchedblob;
+			return fetchedblob.blob;
 		}
 		let splits = data.split(";base64,");
 		let dtype = splits[0];
@@ -3695,7 +3698,7 @@ Current version indicated by LITEVER below.
 		// Create Blob and URL
 		const audioBlob = new Blob([bytes], {type: dtype});
 		const audioUrl = URL.createObjectURL(audioBlob);
-		data_hash_to_blob_lookup[audiohash] = audioUrl;
+		data_hash_to_blob_lookup[refhash] = {"id":audioUrl,"original":data,"blob":audioUrl};
 		return audioUrl;
 	}
 	function basic_lcg(seed) { // simple RNG for reproducible dice rolls
@@ -4002,10 +4005,11 @@ Current version indicated by LITEVER below.
 				e.preventDefault();
 				e.stopPropagation();
 				const file = files[0];
+				const fname = files[0].name;
 				const reader = new FileReader();
 				reader.onload = function(img) {
 					let origImg = img.target.result;
-					self_upload_file_dispatch(origImg);
+					self_upload_file_dispatch(origImg,fname);
 				}
 				reader.readAsDataURL(file);
 			}
@@ -14663,21 +14667,21 @@ Current version indicated by LITEVER below.
 		document.getElementById("addmediacontainer").classList.add("hidden");
 	}
 
-	function self_upload_file_dispatch(data)
+	function self_upload_file_dispatch(data,filename="") //routes a data-URL upload by its mime prefix; filename defaults to "" so callers without a name never pass undefined downstream
 	{
 		if(data.startsWith("data:audio"))
 		{
-			self_upload_audio(data);
+			self_upload_audio(data,filename); //audio keeps the filename as its reference label
 		}
 		else if(data.startsWith("data:image"))
 		{
-			self_upload_img(data);
+			self_upload_img(data,filename);
 		}else{
 			msgbox("Unsupported File Format!\nOnly Image and Audio files are supported!","Unsupported File Format");
 		}
 	}
 
-	function self_upload_img(origImg)
+	function self_upload_img(origImg,filename)
 	{
 		let imgid = "selfuploadimg"+(Math.floor(10000 + Math.random() * 90000)).toString();
 		let nimgtag = "[<|p|" + imgid + "|p|>]";
@@ -14719,7 +14723,7 @@ Current version indicated by LITEVER below.
 		}, false, imgres,0.35,true);
 	}
 
-	function self_upload_audio(origAudio)
+	function self_upload_audio(origAudio,filename)
 	{
 		let imgid = "selfuploadaudio"+(Math.floor(10000 + Math.random() * 90000)).toString();
 		let nimgtag = "[<|p|" + imgid + "|p|>]";
@@ -14737,10 +14741,12 @@ Current version indicated by LITEVER below.
 		image_db[imgid] = { done: false, queue: "Processing", result: "", prompt:"", poll_category:0 };
 		image_db[imgid].aspect = 0;
 		image_db[imgid].imsource = 1; //0=generated,1=uploaded
-		image_db[imgid].imrefid = "";
-		convertAudioToCompressedBase64(origAudio,(newAudio)=>{
+		image_db[imgid].imrefid = filename;
+		image_db[imgid].len = 0;
+		convertAudioToCompressedBase64(origAudio,(newAudio,duration)=>{
 			image_db[imgid].done = true;
 			image_db[imgid].result = newAudio;
+			image_db[imgid].len = duration;
 		});
 	}
 
@@ -14760,7 +14766,7 @@ Current version indicated by LITEVER below.
 				var reader = new FileReader();
 				reader.onload = function(event){
 					let origImg = event.target.result;
-					self_upload_file_dispatch(origImg);
+					self_upload_file_dispatch(origImg,"");
 				};
 				reader.readAsDataURL(blob);
 				founditem = true;
@@ -14789,10 +14795,11 @@ Current version indicated by LITEVER below.
 				console.log(files);
 				if (files.length > 0 && files[0] != null && files[0].name && files[0].name != "") {
 					const file = files[0];
+					const fname = files[0].name;
 					const reader = new FileReader();
 					reader.onload = function(img) {
 						let origImg = img.target.result;
-						self_upload_file_dispatch(origImg);
+						self_upload_file_dispatch(origImg, fname);
 					}
 					reader.readAsDataURL(file);
 					document.getElementById("pasteimgcontainer").classList.add("hidden");
@@ -14875,7 +14882,7 @@ Current version indicated by LITEVER below.
 		const sy = (videoHeight - sideLength) / 2;
 		context.drawImage(video, sx, sy, sideLength, sideLength, 0, 0, 512, 512);
 		const dataURL = canvas.toDataURL('image/png');
-		self_upload_file_dispatch(dataURL);  // Call your upload function
+		self_upload_file_dispatch(dataURL,"");  // Call your upload function
 		hide_popups();
 	}
 
@@ -14886,10 +14893,11 @@ Current version indicated by LITEVER below.
 		finput.onchange = (event) => {
 			if (event.target.files.length > 0 && event.target.files[0]) {
 				const file = event.target.files[0];
+				const fname = file.name;
 				const reader = new FileReader();
 				reader.onload = function(img) {
 					let origImg = img.target.result;
-					self_upload_file_dispatch(origImg);
+					self_upload_file_dispatch(origImg,fname);
 				}
 				reader.readAsDataURL(file);
 			}
@@ -16345,6 +16353,10 @@ Current version indicated by LITEVER below.
 			{
 				submit_payload.params.images = insertAIVisionImages;
 			}
+			if(is_using_kcpp_with_llava() && insertAIAudioSounds.length>0)
+			{
+				submit_payload.params.audio = insertAIAudioSounds;
+			}
 
 			if(localsettings.sampler_seed>=1)
 			{
@@ -17689,6 +17701,51 @@ Current version indicated by LITEVER below.
 		}
 
 	}
+	//Transcribes a previously rendered audio clip: fetches its blob url, resamples it with a 16000 target rate, wraps it as a wav blob, posts it to the transcribe endpoint and shows any returned text in a msgbox.
+	//audiohash: key into data_hash_to_blob_lookup. onDone: accepted for callers, but this implementation never invokes it.
+	function zoomed_transcribe_btn(audiohash,onDone)
+	{
+		let fetchedblob = data_hash_to_blob_lookup[audiohash];
+		//no blob registered under this hash, nothing to transcribe
+		if(!fetchedblob) { return; }
+		fetch(fetchedblob.blob)
+		.then(x => x.blob())
+		.then(completeRecording => {
+			audioBlobToDecodedAudioBuffer(completeRecording,(buffer)=>{
+				resampleAudioBuffer(buffer,16000,(rsBuffer)=>{
+					let wavblob = audioBufferToWavBlob(rsBuffer);
+					const reader = new FileReader();
+					reader.onload = function(audiodata) {
+						let payload = {
+							"audio_data": audiodata.target.result,
+							"prompt": "",
+							"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false),
+							"langcode": document.getElementById("voice_langcode").value
+						};
+						fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
+							method: 'POST',
+							headers: {'Content-Type': 'application/json'},
+							body: JSON.stringify(payload),
+						})
+						.then(x => x.json())
+						.then(resp => {
+							console.log(resp);
+							if(resp && resp.text && resp.text!="")
+							{
+								msgbox(resp.text,"Transcribed Audio");
+							}
+						}).catch((error) => {
+							console.log("Transcribe Error: " + error);
+						});
+					}
+					reader.readAsDataURL(wavblob);
+				});
+			});
+		}).catch((error) => {
+			//fetching the blob url itself can reject; log it rather than leaving an unhandled promise rejection
+			console.log("Transcribe Error: " + error);
+		});
+	}
 
 	function toggle_ai_vision(imghash)
 	{
@@ -17700,7 +17757,7 @@ Current version indicated by LITEVER below.
 			{
 				//request a new interrogation
 				var alreadysent = Object.values(interrogation_db).some(item => item.imghash === imghash);
-				if(!alreadysent)
+				if(!alreadysent && document.getElementById("zoomedimg"))
 				{
 					let b64 = document.getElementById("zoomedimg").src;
 					interrogate_new_image(b64,imghash,(savedmeta.visionmode==1));
@@ -17719,10 +17776,33 @@ Current version indicated by LITEVER below.
 		let savedmeta = completed_imgs_meta[imghash];
 		if(!savedmeta && imghash!="")
 		{
-			savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0};
+			savedmeta = completed_imgs_meta[imghash] = {prompt:"", desc:"", visionmode:0, aspect:0, ref:"", len:0};
+		}
+		if(savedmeta && !savedmeta.visionmode) //savedmeta stays undefined when imghash is "" and nothing was stored, so guard every read
+		{
+			savedmeta.visionmode = 0;
+		}
+		let hasllava = is_using_kcpp_with_llava();
+		let visionstatus = ""; //status badge html shared by the image and audio description panes below
+		if(savedmeta && savedmeta.visionmode==3)
+		{
+			if(custom_kobold_endpoint!="") //on a kobo endpoint
+			{
+				visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(hasllava?`<span class="color_green">Active</span>`:`<span class="color_yellow">Unsupported</span>`));
+			}
+			else
+			{
+				let isoai = (custom_oai_key!="" && document.getElementById("useoaichatcompl").checked);
+				let isgemini = (custom_gemini_key!="");
+				visionstatus = (isoai?`<span class="color_green">OpenAI API (Conditional)</span>`:(isgemini?`<span class="color_green">Gemini API (Conditional)</span>`:`<span class="color_yellow">Unsupported</span>`));
+			}
+		}
+		else if(savedmeta) //without metadata visionstatus stays empty and the "No Saved Data" branch handles the display
+		{
+			visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(savedmeta.desc?`<span class="color_green">Active</span>`:`<span class="color_yellow">Analyzing</span>`));
 		}
 
-		if(savedmeta)
+		if(savedmeta && document.getElementById("zoomedimg"))
 		{
 			document.getElementById("zoomedimg").classList.remove("portrait");
 			document.getElementById("zoomedimg").classList.remove("landscape");
@@ -17745,32 +17825,8 @@ Current version indicated by LITEVER below.
 				document.getElementById("zoomedimg").classList.add("landscape_long");
 			}
 
-			if(!savedmeta.visionmode)
-			{
-				savedmeta.visionmode = 0;
-			}
-
 			let origprompt = (savedmeta.prompt?replaceAll(savedmeta.prompt,"\n"," ") : "No Saved Description");
 			latest_orig_prompt = origprompt;
-			let hasllava = is_using_kcpp_with_llava();
-			let visionstatus = "";
-			if(savedmeta.visionmode==3)
-			{
-				if(custom_kobold_endpoint!="") //on a kobo endpoint
-				{
-					visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(hasllava?`<span class="color_green">Active</span>`:`<span class="color_yellow">Unsupported</span>`));
-				}
-				else
-				{
-					let isoai = (custom_oai_key!="" && document.getElementById("useoaichatcompl").checked);
-					let isgemini = (custom_gemini_key!="");
-					visionstatus = (isoai?`<span class="color_green">OpenAI API (Conditional)</span>`:(isgemini?`<span class="color_green">Gemini API (Conditional)</span>`:`<span class="color_yellow">Unsupported</span>`));
-				}
-			}
-			else
-			{
-				visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(savedmeta.desc?`<span class="color_green">Active</span>`:`<span class="color_yellow">Analyzing</span>`));
-			}
 
 			let togglebtn = `<select class="form-control" id="aivisionmode" style="display:inline;height:24px;width: 140px; padding: 2px; margin: 3px; font-size:12px;" onchange="toggle_ai_vision(\'`+imghash+`\')">
 								<option value="0">Disabled</option>
@@ -17786,6 +17842,25 @@ Current version indicated by LITEVER below.
 			`;
 			document.getElementById("aivisionmode").value = savedmeta.visionmode;
 		}
+		else if(savedmeta && document.getElementById("zoomedaudio"))
+		{
+			let transcribebtn = "";
+			if(is_using_kcpp_with_whisper())
+			{
+				transcribebtn = `<button type="button" class="btn btn-primary" style="width: 140px; padding: 2px; margin: 3px; font-size:12px;" onclick="zoomed_transcribe_btn(\'`+imghash+`\')">Transcribe Audio</button>`;
+			}
+			let togglebtn = `<select class="form-control" id="aivisionmode" style="display:inline;height:24px;width: 140px; padding: 2px; margin: 3px; font-size:12px;" onchange="toggle_ai_vision(\'`+imghash+`\')">
+								<option value="0">Disabled</option>
+								<option value="3">Multimodal Audio</option>
+							</select>`;
+			document.getElementById("zoomedimgdesc").innerHTML = `
+			AI Embed Audio: `+visionstatus+` <span class="helpicon">?<span class="helptext">Allows the AI to hear and react to this audio (on supported models). Transcribe tries to replace the audio file with detected speech.</span></span>
+			${togglebtn}
+			<br>
+			${transcribebtn}
+			`;
+			document.getElementById("aivisionmode").value = savedmeta.visionmode;
+		}
 		else
 		{
 			document.getElementById("zoomedimgdesc").innerText = "No Saved Data";
@@ -17801,7 +17876,7 @@ Current version indicated by LITEVER below.
 	{
 		inputBox("Enter prompt to create a new image, based on this source image.","Create Img2Img","","Enter Img2Img Prompt",()=>{
 			let userinput = getInputBoxValue();
-			if(userinput.trim()!="")
+			if(userinput.trim()!="" && document.getElementById("zoomedimg"))
 			{
 				var sentence = userinput.trim().substring(0, 380);
 				let b64 = document.getElementById("zoomedimg").src;
@@ -17812,26 +17887,60 @@ Current version indicated by LITEVER below.
 	}
 	function click_image(target,imghash)
 	{
+		clear_zoomed_img_and_audio();
 		if(target)
 		{
-			if(localsettings.invert_colors)
-			{
-				document.getElementById("zoomedimg").classList.add("invert_colors");
-			}else{
-				document.getElementById("zoomedimg").classList.remove("invert_colors");
-			}
+			//create the <img> through the DOM so target.src is assigned as a property and never parsed as markup
 			document.getElementById("zoomedimgcontainer").classList.remove("hidden");
-			document.getElementById("zoomedimg").src = target.src;
-
+			const zimg = Object.assign(document.createElement("img"),{id:"zoomedimg",className:"zoomedimg"+(localsettings.invert_colors?" invert_colors":""),src:target.src});
+			document.getElementById("zoomedimgdiv").appendChild(zimg); //the pane was emptied by clear_zoomed_img_and_audio above
+			document.getElementById("zoomedimgdiv").classList.remove("hidden");
+			document.getElementById("zoomedaudiodiv").classList.add("hidden");
 			update_clicked_image(imghash);
-
 		}
 	}
-	function delete_curr_image()
+	function click_audio(target,audiohash,audioblob)
 	{
-		let removesrc = document.getElementById("zoomedimg").src;
-		if (removesrc && removesrc != "") {
-			var matchingStr = ("[<|d|" + removesrc + "|d|>]")
+		clear_zoomed_img_and_audio();
+		if(!target) { return; }
+		//switch the popup over to its audio pane before injecting the player
+		const audiopane = document.getElementById("zoomedaudiodiv");
+		document.getElementById("zoomedimgcontainer").classList.remove("hidden");
+		document.getElementById("zoomedimgdiv").classList.add("hidden");
+		audiopane.classList.remove("hidden");
+		//the <source> tag carries id=zoomedaudio; update_clicked_image and delete_curr_media look that id up
+		audiopane.innerHTML = `<div><audio controls title="AudioPlayer"><source src="${audioblob}" id="zoomedaudio" type="audio/mp3"></audio></div>`;
+		update_clicked_image(audiohash);
+	}
+	function clear_zoomed_img_and_audio()
+	{
+		//empty both zoom panes so no zoomedimg / zoomedaudio element is left behind from the previous view
+		for(const paneid of ["zoomedimgdiv","zoomedaudiodiv"]) { document.getElementById(paneid).innerHTML = ""; }
+	}
+	function delete_curr_media() //removes whichever media item is currently zoomed (image or audio) from the story text
+	{
+		let zoomedimg = document.getElementById("zoomedimg");
+		let zoomedaudio = document.getElementById("zoomedaudio");
+		let targettoremove = "";
+		if (zoomedimg && zoomedimg.src && zoomedimg.src !="") {
+			targettoremove = zoomedimg.src;
+		}
+		else if(zoomedaudio && zoomedaudio.src && zoomedaudio.src !="")
+		{
+			let blobid = zoomedaudio.src; //the player only holds a blob url, so map it back to the original data uri embedded in the text
+			for(let v in data_hash_to_blob_lookup) //v must be declared here, otherwise it becomes an implicit global
+			{
+				let itm = data_hash_to_blob_lookup[v];
+				if(itm.id==blobid)
+				{
+					targettoremove = itm.original;
+					break;
+				}
+			}
+		}
+		if(targettoremove)
+		{
+			var matchingStr = ("[<|d|" + targettoremove + "|d|>]")
 			for (let i = 0; i < gametext_arr.length; ++i) {
 				if (gametext_arr[i].includes(matchingStr)) {
 					gametext_arr[i] = gametext_arr[i].replace(matchingStr, "");
@@ -17883,9 +17992,17 @@ Current version indicated by LITEVER below.
 
 	function render_audio_html(data)
 	{
-		let audioblob = b64_to_persistent_blob(data);
-		const reinvertcolor = localsettings.invert_colors?"invert_colors":"";
-		const str = `<div style="display:flex" class="${reinvertcolor}" contenteditable="false"><audio style="display:flex" controls title="Audio Player"><source src="${audioblob}" type="audio/mp3"></audio></div>`;
+		let audiohash = cyrb_hash(data).trim();
+		let audioblob = b64_to_persistent_blob(data,audiohash);
+		//metadata entries created before ref/len were added lack those fields, so fall back to safe defaults instead of reading undefined
+		const meta = completed_imgs_meta[audiohash] || {};
+		const filename = (typeof meta.ref === "string") ? meta.ref : "";
+		const len = Number(meta.len) || 0;
+		//the filename is user supplied and is interpolated into the returned html string: neutralise markup characters first
+		const safename = filename.substring(0,50).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;");
+		let fndisp = filename!=""?`(${safename}) `:"";
+		fndisp = len?(`: ${Math.floor(len)}s ${fndisp}`):fndisp;
+		const str = `<span><br><button type="button" title="Attached Audio" class="btn btn-primary" style="font-size:12px; padding:8px 8px; border-radius: 16px" onclick="return click_audio(this,\'${audiohash}\',\'${audioblob}\');">Attached Audio ${fndisp}🔊</button><br></span>`;
 		return str;
 	}
 
@@ -18472,7 +18589,7 @@ Current version indicated by LITEVER below.
 							gametext_arr[i] = gametext_arr[i].replace(matchstr, newstr);
 							let metaid = cyrb_hash(img.result);
 							//default to llava if supported, and image is self uploaded
-							completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:((image_db[key].imsource==1 && is_using_kcpp_with_llava())?3:0), aspect:image_db[key].aspect};
+							completed_imgs_meta[metaid] = {prompt:image_db[key].prompt, desc:"", visionmode:((image_db[key].imsource==1 && is_using_kcpp_with_llava())?3:0), aspect:image_db[key].aspect, ref:image_db[key].imrefid, len:image_db[key].len};
 							delete image_db[key];
 						}
 					}
@@ -18686,13 +18803,13 @@ Current version indicated by LITEVER below.
 	}
 
 	// AUDIO MANIPULATION FUNCTIONS
-	//convert any audio to a webm blob (high compression), returns a promise
+	//convert any audio data url to a compressed mp3 data url; result is delivered as onDone(base64, durationSeconds), or onDone(null,null) on failure
 	function convertAudioToCompressedBase64(inputBase64, onDone) {
 		// Step 1: Convert base64 string to Blob
 		const matches = inputBase64.match(/^data:(audio\/[a-zA-Z0-9-]+);base64,(.+)$/);
 		if (!matches) {
 			console.log("Convert Audio: Invalid base64 input");
-			onDone(null);
+			onDone(null,null); return; //stop here: matches is null, so reading matches[1] below would throw
 		}
 
 		const mimeType = matches[1];
@@ -18712,6 +18829,7 @@ Current version indicated by LITEVER below.
 			audioContext.decodeAudioData(arrayBuffer, function (buffer) {
 				const samplefreq = buffer.sampleRate;
 				const samples = buffer.getChannelData(0); // mono
+				const durationInSeconds = buffer.duration;
 				const mp3encoder = new lamejs.Mp3Encoder(1, samplefreq, 40); // mono, 16kHz, 40kbps
 				const sampleBlockSize = 1152; //can be anything but make it a multiple of 576 to make encoders life easier
 				let mp3Data = [];
@@ -18737,12 +18855,12 @@ Current version indicated by LITEVER below.
 				const fileReader = new FileReader();
 				fileReader.onloadend = function () {
 					const mp3Base64 = fileReader.result;
-					onDone(mp3Base64);
+					onDone(mp3Base64,durationInSeconds);
 				};
 				fileReader.readAsDataURL(mp3Blob);
 			}, function (err) {
 				console.log("Audio decode failed.");
-				onDone(null);
+				onDone(null,null);
 			});
 		};
 		reader.readAsArrayBuffer(inputBlob);
@@ -19574,6 +19692,7 @@ Current version indicated by LITEVER below.
 	}
 
 	var insertAIVisionImages = []; //concat gametext will populate this
+	var insertAIAudioSounds = [];
 	function concat_gametext(stripimg = false, stripimg_replace_str = "", append_before_segment="",append_after_segment="",escapeTxt=false,insertAIVision=false) {
 		let fulltxt = "";
 		for (let i = 0; i < gametext_arr.length; ++i) {
@@ -19637,6 +19756,7 @@ Current version indicated by LITEVER below.
 			if(insertAIVision)
 			{
 				insertAIVisionImages = []; //a bit hacky
+				insertAIAudioSounds = [];
 				fulltxt = fulltxt.replace(/\[<\|d\|.+?\|d\|>\]/g, function (m) {
 					// m here means the whole matched string
 					let inner = m.substring(5, m.length - 5);
@@ -19649,11 +19769,18 @@ Current version indicated by LITEVER below.
 						}
 						else if(foundmeta.visionmode==3)
 						{
+							let placeholder = "";
 							let parts = inner.split(',');
 							if (parts.length === 2 && parts[0].startsWith('data:image')) {
 								insertAIVisionImages.push(parts[1]);
+								placeholder = "\n(Attached Image)\n";
 							}
-							return "\n(Attached Image)\n";
+							else if(parts.length === 2 && parts[0].startsWith('data:audio'))
+							{
+								insertAIAudioSounds.push(parts[1]);
+								placeholder = "\n(Attached Audio)\n";
+							}
+							return placeholder;
 						}
 					}
 					return "";
@@ -25798,11 +25925,12 @@ Current version indicated by LITEVER below.
 		<div class="popupbg flex"></div>
 		<div class="nspopup flexsize highest">
 			<div class="popuptitlebar">
-				<div class="popuptitletext">Image Information</div>
+				<div class="popuptitletext">Media Information</div>
 			</div>
 
-			<div class="zoomedimgdiv">
-				<img class="zoomedimg" id="zoomedimg" src="">
+			<div id="zoomedimgdiv" class="zoomedimgdiv">
+			</div>
+			<div id="zoomedaudiodiv" class="zoomedimgdiv">
 			</div>
 
 			<div class="menutext zoomedimgdesc" id="zoomedimgdesc" style="word-wrap: break-word;">
@@ -25810,8 +25938,8 @@ Current version indicated by LITEVER below.
 			</div>
 			<br>
 			<div class="popupfooter">
-				<button type="button" class="bg_red btn btn-primary" style="width: 124px;" onclick="delete_curr_image();hide_popups();">Delete Image</button>
-				<button type="button" class="btn btn-primary" onclick="hide_popups()">Close</button>
+				<button type="button" class="bg_red btn btn-primary" style="width: 124px;" onclick="delete_curr_media();clear_zoomed_img_and_audio();hide_popups();">Delete Media</button>
+				<button type="button" class="btn btn-primary" onclick="clear_zoomed_img_and_audio();hide_popups()">Close</button>
 			</div>
 		</div>
 	</div>
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index 7e7ed1ef2..1a836c441 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -548,7 +548,7 @@ kcpp_embd_batch::kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npas
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = (return_all_logits?true:false);
         }
-            for (int j = 0; j < batch.n_tokens * 3; j++) {
+        for (int j = 0; j < batch.n_tokens * 3; j++) {
             batch.pos[j] = npast + (j % batch.n_tokens);
         }
     }
diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp
index dbd7e64a8..f623b98c8 100644
--- a/tools/mtmd/llava.cpp
+++ b/tools/mtmd/llava.cpp
@@ -454,7 +454,7 @@ bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const whi
     batch_f32.entries.push_back(std::move(mel_f32));
 
     int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
-    float * audio_embd = (float *)malloc(n_tokens * n_mmproj_embd);
+    float * audio_embd = (float *)malloc(n_tokens * n_mmproj_embd * sizeof(float));
     bool ok = clip_image_batch_encode(
         ctx_clip,
         n_threads,
@@ -462,5 +462,5 @@ bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const whi
         audio_embd);
     *image_embd_out = audio_embd;
     *n_img_pos_out = n_tokens;
-    return ok ? 0 : 1;
+    return ok;
 }
\ No newline at end of file
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 686f42f39..d83faefe2 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -29,7 +29,7 @@
 #define MA_API static
 #include "miniaudio/miniaudio.h"
 
-#define STB_IMAGE_IMPLEMENTATION
+// #define STB_IMAGE_IMPLEMENTATION
 #include "stb/stb_image.h"
 
 #define LOG_INF(...) fprintf(stdout, __VA_ARGS__)