From 547ab2aebba991138a17b899f0142c1f0776a321 Mon Sep 17 00:00:00 2001
From: kallewoof
Date: Thu, 21 Nov 2024 11:58:32 +0900
Subject: [PATCH] API: add /props route (#1222)

* API: add an /extra/chat_template route

A lot of manual tweaking is done when swapping between models. We can
automate or make better assumptions about some of them by having more
information, such as the chat template. This PR adds an endpoint,
/extra/chat_template, which returns the model's chat template string
as-is in a 'chat_template' key.

The front end can then use this to derive the proper templates or use
it as-is, or at least warn the user when they are trying to use e.g. a
Mistral preset with a Llama 3.1 model.

* switch to pre-established /props endpoint for chat template

* bug-fix (upstream): off-by-one in string juggling

---
 expose.cpp          | 10 ++++++++++
 gpttype_adapter.cpp | 15 +++++++++++++++
 koboldcpp.py        |  8 ++++++++
 model_adapter.h     |  2 ++
 4 files changed, 35 insertions(+)

diff --git a/expose.cpp b/expose.cpp
index efa1c8bb3..ba660e061 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -275,6 +275,16 @@ extern "C"
         return (int)last_stop_reason;
     }
 
+    const char* get_chat_template() {
+        // we need to keep this around
+        static std::string* ct = nullptr;
+        if (ct == nullptr) {
+            ct = new std::string();
+        }
+        *ct = gpttype_get_chat_template();
+        return ct->c_str();
+    }
+
     const char* get_pending_output() {
         return gpttype_get_pending_output().c_str();
     }
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index c014049b8..283fca721 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -2491,6 +2491,21 @@ bool gpttype_generate_abort()
     return true;
 }
 
+std::string gpttype_get_chat_template()
+{
+    // copied from examples/server/utils.hpp::llama_get_chat_template
+    std::string template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(&llama_ctx_v4->model, template_key.c_str(), NULL, 0);
+    if (res < 0) {
+        return "";
+    }
+
+    std::vector<char> model_template(res + 1, 0);
+    llama_model_meta_val_str(&llama_ctx_v4->model, template_key.c_str(), model_template.data(), model_template.size());
+    return std::string(model_template.data(), model_template.size() - 1);
+}
+
 std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
 {
     std::vector<int> toks;
diff --git a/koboldcpp.py b/koboldcpp.py
index f9384de13..6530d68c8 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -459,6 +459,7 @@ def init_library():
     handle.abort_generate.restype = ctypes.c_bool
     handle.token_count.restype = token_count_outputs
     handle.get_pending_output.restype = ctypes.c_char_p
+    handle.get_chat_template.restype = ctypes.c_char_p
     handle.sd_load_model.argtypes = [sd_load_model_inputs]
     handle.sd_load_model.restype = ctypes.c_bool
     handle.sd_generate.argtypes = [sd_generation_inputs]
@@ -1956,6 +1957,13 @@ Enter Prompt:<br>
             self.send_header("location", self.path)
             self.end_headers(content_type='text/html')
             return None
+        elif self.path.endswith('/props'):
+            ctbytes = handle.get_chat_template()
+            chat_template = ctypes.string_at(ctbytes).decode("UTF-8")
+            # TODO: decide whether to add or skip below settings from llama.cpp /props endpoint.
+            # { "default_generation_settings", ctx_server.default_generation_settings_for_props },
+            # { "total_slots", ctx_server.params.n_parallel },
+            response_body = (json.dumps({"chat_template":chat_template}).encode())
 
         if response_body is None:
             self.send_response(404)
diff --git a/model_adapter.h b/model_adapter.h
index 769829a26..5c448320f 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -91,6 +91,8 @@ enum ModelLoadResult
 ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta file_format_meta);
 generation_outputs gpttype_generate(const generation_inputs inputs);
 bool gpttype_generate_abort();
+std::string gpttype_get_chat_template();
+
 const std::string & gpttype_get_pending_output();
 std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos);
 const std::vector<TopPicksData> gpttype_get_top_picks_data();
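
Usage note (not part of the patch): a minimal client sketch for the new route. Only the /props path and the "chat_template" response key come from the change above; the base URL (KoboldCpp's default http://localhost:5001), the helper name, and the fallback behaviour are assumptions for illustration.

    # fetch_chat_template.py -- hypothetical example, not shipped with KoboldCpp
    import json
    import urllib.request

    def fetch_chat_template(base_url="http://localhost:5001"):
        """Return the model's embedded chat template, or "" if none was found."""
        # GET /props returns {"chat_template": "..."} as added by this patch
        with urllib.request.urlopen(f"{base_url}/props") as resp:
            props = json.loads(resp.read().decode("utf-8"))
        return props.get("chat_template", "")

    if __name__ == "__main__":
        template = fetch_chat_template()
        if template:
            print("Model ships a chat template:")
            print(template)
        else:
            print("No chat template in model metadata; fall back to a manually chosen preset.")

The empty-string fallback mirrors gpttype_get_chat_template(), which returns "" when the tokenizer.chat_template key is absent from the model metadata.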