From d59ec68753c9a2691d74547cd196c18013a6591c Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 11 Mar 2024 18:28:23 +0800
Subject: [PATCH] added interrogate endpoint (+1 squashed commits)
Squashed commits:
[7bf96261] added interrogate endpoint
---
kcpp_docs.embd | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++
klite.embd | 18 ++++++++++++++---
koboldcpp.py | 33 +++++++++++++++++++++++++++++-
3 files changed, 101 insertions(+), 4 deletions(-)
diff --git a/kcpp_docs.embd b/kcpp_docs.embd
index 5b0d2d457..98bf980db 100644
--- a/kcpp_docs.embd
+++ b/kcpp_docs.embd
@@ -997,6 +997,60 @@
]
}
},
+ "/sdapi/v1/interrogate": {
+ "post": {
+ "description": "Generates a short text caption describing an image.",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "example": {
+ "image": "base64_image_data",
+ "model": "clip"
+ },
+ "schema": {
+ "properties": {
+ "image": {
+ "type": "string",
+ "description": "A base64 string containing the encoded PNG of the image."
+ },
+ "model": {
+ "type": "string",
+ "description": "Not used."
+ }
+ },
+ "type": "object"
+ }
+ }
+ },
+ "required": false
+ },
+ "responses": {
+ "200": {
+ "content": {
+ "application/json": {
+ "example":
+ {
+ "caption":"A picture of a white cottage with a flagpole."
+ },
+ "schema": {
+ "properties": {
+ "caption": {
+ "type": "string",
+ "description": "A short text description of the image."
+ }
+ }
+ }
+ }
+ },
+ "description": "Successful request"
+ }
+ },
+ "summary": "Generates a short text caption describing an image",
+ "tags": [
+ "sdapi/v1"
+ ]
+ }
+ },
"/v1/completions": {
"post": {
"summary": "Generates text continuations given a prompt. Please refer to OpenAI documentation",
diff --git a/klite.embd b/klite.embd
index 6d8a1709b..163fbfe3f 100644
--- a/klite.embd
+++ b/klite.embd
@@ -7,7 +7,7 @@ Just copy this single static HTML file anywhere and open it in a browser, or fro
Please go to https://github.com/LostRuins/lite.koboldai.net for updates on Kobold Lite.
If you are submitting a pull request for Lite, PLEASE use the above repo, not the KoboldCpp one.
Kobold Lite is under the AGPL v3.0 License unless otherwise exempted. Please do not remove this line.
-Current version: 122
+Current version: 123
-Concedo
-->
@@ -3497,6 +3497,7 @@ Current version: 122
var localmodehost = "localhost";
var kobold_endpoint_version = ""; //used to track problematic versions to avoid sending extra fields
var koboldcpp_version = ""; //detect if we are using koboldcpp
+ var koboldcpp_has_vision = false;
var last_request_str = "No Requests Available"; //full context of last submitted request
var lastcheckgenkey = ""; //for checking polled-streaming unique id when generating in kcpp
var globalabortcontroller = null;
@@ -7013,6 +7014,7 @@ Current version: 122
{
koboldcpp_version = data.version;
console.log("KoboldCpp Detected: " + koboldcpp_version);
+ koboldcpp_has_vision = (data.vision?true:false);
//also check against kcpp's max true context length
fetch(apply_proxy_url(tmpep + koboldcpp_truemaxctxlen_endpoint))
@@ -10717,7 +10719,17 @@ Current version: 122
let origprompt = (savedmeta.prompt?replaceAll(savedmeta.prompt,"\n"," ") : "No Saved Description");
latest_orig_prompt = origprompt;
- let visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`Inactive`:((savedmeta.desc||savedmeta.visionmode==3)?`Active`:`Analyzing`));
+ let hasllava = (is_using_kcpp_with_llava() && koboldcpp_has_vision);
+ let visionstatus = "";
+ if(savedmeta.visionmode==3)
+ {
+ visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`Inactive`:(hasllava?`Active`:`Unsupported`));
+ }
+ else
+ {
+ visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`Inactive`:(savedmeta.desc?`Active`:`Analyzing`));
+ }
+
let togglebtn = ``;
document.getElementById("zoomedimgdesc").innerHTML = `
- AI Vision: `+visionstatus+` ?This allows the AI to visually recognize this image, to see and react to this image. Uses Horde or Local A1111 for image interrogation if enabled.
+ AI Vision: `+visionstatus+` ?This allows the AI to visually recognize this image, to see and react to this image. On KoboldCpp, LLaVA support can be used with multimodal models. Otherwise, uses Horde or Local A1111 for image interrogation if enabled.
`+togglebtn+`
`;
diff --git a/koboldcpp.py b/koboldcpp.py
index af4260298..9314dc28a 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -333,6 +333,17 @@ def set_backend_props(inputs):
inputs.vulkan_info = "0".encode("UTF-8")
return inputs
+def end_trim_to_sentence(input_text):
+ enders = ['.', '!', '?', '*', '"', ')', '}', '`', ']', ';', '…']
+ last = -1
+ for ender in enders:
+ last = max(last, input_text.rfind(ender))
+ nl = input_text.rfind("\n")
+ last = max(last, nl)
+ if last > 0:
+ return input_text[:last + 1].strip()
+ return input_text.strip()
+
def load_model(model_filename):
global args
inputs = load_model_inputs()
@@ -720,6 +731,12 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
if len(images_added)>0:
genparams["images"] = images_added
+ elif api_format==5:
+ firstimg = genparams.get('image', "")
+ genparams["images"] = [firstimg]
+ genparams["max_length"] = 32
+ genparams["prompt"] = "### Instruction: In one sentence, write a descriptive caption for this image.\n### Response:"
+
return generate(
prompt=genparams.get('prompt', ""),
memory=genparams.get('memory', ""),
@@ -776,6 +793,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
res = {"id": "chatcmpl-1", "object": "chat.completion", "created": 1, "model": friendlymodelname,
"usage": {"prompt_tokens": 100,"completion_tokens": 100,"total_tokens": 200},
"choices": [{"index": 0, "message":{"role": "assistant", "content": recvtxt,}, "finish_reason": "length"}]}
+ elif api_format==5:
+ res = {"caption": end_trim_to_sentence(recvtxt)}
else:
res = {"results": [{"text": recvtxt}]}
@@ -1162,7 +1181,7 @@ Enter Prompt:
try:
sse_stream_flag = False
- api_format = 0 #1=basic,2=kai,3=oai,4=oai-chat
+ api_format = 0 #1=basic,2=kai,3=oai,4=oai-chat,5=interrogate
is_txt2img = False
if self.path.endswith('/request'):
@@ -1181,6 +1200,18 @@ Enter Prompt:
if self.path.endswith('/v1/chat/completions'):
api_format = 4
+ if self.path.endswith('/sdapi/v1/interrogate'):
+ has_vision = (mmprojpath!="")
+ if not has_vision:
+ self.send_response(503)
+ self.end_headers(content_type='application/json')
+ self.wfile.write(json.dumps({"detail": {
+ "msg": "No LLaVA model loaded",
+ "type": "service_unavailable",
+ }}).encode())
+ return
+ api_format = 5
+
if self.path.endswith('/sdapi/v1/txt2img'):
is_txt2img = True