added interrogate endpoint (+1 squashed commits)

Squashed commits:

[7bf96261] added interrogate endpoint
This commit is contained in:
Concedo 2024-03-11 18:28:23 +08:00
parent e4946b96ea
commit d59ec68753
3 changed files with 101 additions and 4 deletions

View file

@ -997,6 +997,60 @@
]
}
},
"/sdapi/v1/interrogate": {
"post": {
"description": "Generates a short text caption describing an image.",
"requestBody": {
"content": {
"application/json": {
"example": {
"image": "base64_image_data",
"model": "clip"
},
"schema": {
"properties": {
"image": {
"type": "string",
"description": "A base64 string containing the encoded PNG of the image."
},
"model": {
"type": "string",
"description": "Not used."
}
},
"type": "object"
}
}
},
"required": false
},
"responses": {
"200": {
"content": {
"application/json": {
"example":
{
"caption":"A picture of a white cottage with a flagpole."
},
"schema": {
"properties": {
"caption": {
"type": "string",
"description": "A short text description of the image."
}
}
}
}
},
"description": "Successful request"
}
},
"summary": "Generates a short text caption describing an image",
"tags": [
"sdapi/v1"
]
}
},
"/v1/completions": {
"post": {
"summary": "Generates text continuations given a prompt. Please refer to OpenAI documentation",

View file

@ -7,7 +7,7 @@ Just copy this single static HTML file anywhere and open it in a browser, or fro
Please go to https://github.com/LostRuins/lite.koboldai.net for updates on Kobold Lite.
If you are submitting a pull request for Lite, PLEASE use the above repo, not the KoboldCpp one.
Kobold Lite is under the AGPL v3.0 License unless otherwise exempted. Please do not remove this line.
Current version: 122
Current version: 123
-Concedo
-->
@ -3497,6 +3497,7 @@ Current version: 122
var localmodehost = "localhost";
var kobold_endpoint_version = ""; //used to track problematic versions to avoid sending extra fields
var koboldcpp_version = ""; //detect if we are using koboldcpp
var koboldcpp_has_vision = false;
var last_request_str = "No Requests Available"; //full context of last submitted request
var lastcheckgenkey = ""; //for checking polled-streaming unique id when generating in kcpp
var globalabortcontroller = null;
@ -7013,6 +7014,7 @@ Current version: 122
{
koboldcpp_version = data.version;
console.log("KoboldCpp Detected: " + koboldcpp_version);
koboldcpp_has_vision = (data.vision?true:false);
//also check against kcpp's max true context length
fetch(apply_proxy_url(tmpep + koboldcpp_truemaxctxlen_endpoint))
@ -10717,7 +10719,17 @@ Current version: 122
let origprompt = (savedmeta.prompt?replaceAll(savedmeta.prompt,"\n"," ") : "No Saved Description");
latest_orig_prompt = origprompt;
let visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:((savedmeta.desc||savedmeta.visionmode==3)?`<span class="color_green">Active</span>`:`<span class="color_yellow">Analyzing</span>`));
let hasllava = (is_using_kcpp_with_llava() && koboldcpp_has_vision);
let visionstatus = "";
if(savedmeta.visionmode==3)
{
visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(hasllava?`<span class="color_green">Active</span>`:`<span class="color_yellow">Unsupported</span>`));
}
else
{
visionstatus = ((!savedmeta.visionmode || savedmeta.visionmode==0)?`<span class="color_red">Inactive</span>`:(savedmeta.desc?`<span class="color_green">Active</span>`:`<span class="color_yellow">Analyzing</span>`));
}
let togglebtn = `<select class="form-control" id="aivisionmode" style="display:inline;height:24px;width: 134px; padding: 2px; margin: 3px; font-size:12px;" onchange="toggle_ai_vision(\'`+imghash+`\')">
<option value="0">Disabled</option>
<option value="1">Interrogate (Horde)</option>
@ -10725,7 +10737,7 @@ Current version: 122
<option value="3">Multimodal (LLaVA)</option>
</select>`;
document.getElementById("zoomedimgdesc").innerHTML = `
AI Vision: `+visionstatus+` <span class="helpicon">?<span class="helptext">This allows the AI to visually recognize this image, to see and react to this image. Uses Horde or Local A1111 for image interrogation if enabled.</span></span>
AI Vision: `+visionstatus+` <span class="helpicon">?<span class="helptext">This allows the AI to visually recognize this image, to see and react to this image. On KoboldCpp, LLaVA support can be used with multimodal models. Otherwise, uses Horde or Local A1111 for image interrogation if enabled.</span></span>
`+togglebtn+`
<br><button type="button" class="btn btn-primary" style="width: 140px; padding: 2px; margin: 3px; font-size:12px;" onclick="show_orig_prompt()">View Original Prompt</button>
`;

View file

@ -333,6 +333,17 @@ def set_backend_props(inputs):
inputs.vulkan_info = "0".encode("UTF-8")
return inputs
def end_trim_to_sentence(input_text):
    """Trim a generated caption back to its last complete sentence.

    Finds the right-most sentence-ending character (or newline) and cuts the
    text just after it, so a length-truncated generation doesn't end with a
    dangling fragment. If no ender is found past index 0, returns the whole
    input stripped of surrounding whitespace.
    """
    # FIX: the list previously ended with an empty string '' (a mangled '…').
    # str.rfind('') always returns len(input_text), which forced `last` to the
    # end of the string and made the function a no-op (it never trimmed).
    enders = ['.', '!', '?', '*', '"', ')', '}', '`', ']', ';', '…']
    last = -1
    for ender in enders:
        last = max(last, input_text.rfind(ender))
    # A newline also counts as a safe cut point.
    nl = input_text.rfind("\n")
    last = max(last, nl)
    if last > 0:
        # Keep the ender itself, drop everything after it.
        return input_text[:last + 1].strip()
    return input_text.strip()
def load_model(model_filename):
global args
inputs = load_model_inputs()
@ -720,6 +731,12 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
if len(images_added)>0:
genparams["images"] = images_added
elif api_format==5:
firstimg = genparams.get('image', "")
genparams["images"] = [firstimg]
genparams["max_length"] = 32
genparams["prompt"] = "### Instruction: In one sentence, write a descriptive caption for this image.\n### Response:"
return generate(
prompt=genparams.get('prompt', ""),
memory=genparams.get('memory', ""),
@ -776,6 +793,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
res = {"id": "chatcmpl-1", "object": "chat.completion", "created": 1, "model": friendlymodelname,
"usage": {"prompt_tokens": 100,"completion_tokens": 100,"total_tokens": 200},
"choices": [{"index": 0, "message":{"role": "assistant", "content": recvtxt,}, "finish_reason": "length"}]}
elif api_format==5:
res = {"caption": end_trim_to_sentence(recvtxt)}
else:
res = {"results": [{"text": recvtxt}]}
@ -1162,7 +1181,7 @@ Enter Prompt:<br>
try:
sse_stream_flag = False
api_format = 0 #1=basic,2=kai,3=oai,4=oai-chat
api_format = 0 #1=basic,2=kai,3=oai,4=oai-chat,5=interrogate
is_txt2img = False
if self.path.endswith('/request'):
@ -1181,6 +1200,18 @@ Enter Prompt:<br>
if self.path.endswith('/v1/chat/completions'):
api_format = 4
if self.path.endswith('/sdapi/v1/interrogate'):
has_vision = (mmprojpath!="")
if not has_vision:
self.send_response(503)
self.end_headers(content_type='application/json')
self.wfile.write(json.dumps({"detail": {
"msg": "No LLaVA model loaded",
"type": "service_unavailable",
}}).encode())
return
api_format = 5
if self.path.endswith('/sdapi/v1/txt2img'):
is_txt2img = True