From 547ab2aebba991138a17b899f0142c1f0776a321 Mon Sep 17 00:00:00 2001
From: kallewoof
Date: Thu, 21 Nov 2024 11:58:32 +0900
Subject: [PATCH] API: add /props route (#1222)

* API: add an /extra/chat_template route

A lot of manual tweaking is done when swapping between models. We can
automate or make better assumptions about some of them by having more
information, such as the chat template. This PR adds an endpoint,
/extra/chat_template, which returns the model's chat template string
as-is in a 'chat_template' key.

The front end can then use this to derive the proper templates or use
it as-is, or at least warn the user when they are trying to use e.g. a
Mistral preset with a Llama 3.1 model.

* switch to pre-established /props endpoint for chat template

* bug-fix (upstream): off-by-one in string juggling

---
 expose.cpp          | 10 ++++++++++
 gpttype_adapter.cpp | 15 +++++++++++++++
 koboldcpp.py        |  8 ++++++++
 model_adapter.h     |  2 ++
 4 files changed, 35 insertions(+)

diff --git a/expose.cpp b/expose.cpp
index efa1c8bb3..ba660e061 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -275,6 +275,16 @@ extern "C"
         return (int)last_stop_reason;
     }
 
+    const char* get_chat_template() {
+        // we need to keep this around
+        static std::string* ct = nullptr;
+        if (ct == nullptr) {
+            ct = new std::string();
+        }
+        *ct = gpttype_get_chat_template();
+        return ct->c_str();
+    }
+
     const char* get_pending_output() {
         return gpttype_get_pending_output().c_str();
     }
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index c014049b8..283fca721 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -2491,6 +2491,21 @@ bool gpttype_generate_abort()
     return true;
 }
 
+std::string gpttype_get_chat_template()
+{
+    // copied from examples/server/utils.hpp::llama_get_chat_template
+    std::string template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(&llama_ctx_v4->model, template_key.c_str(), NULL, 0);
+    if (res < 0) {
+        return "";
+    }
+
+    std::vector<char> model_template(res + 1, 0);
+    llama_model_meta_val_str(&llama_ctx_v4->model, template_key.c_str(), model_template.data(), model_template.size());
+    return std::string(model_template.data(), model_template.size() - 1);
+}
+
 std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
 {
     std::vector<int> toks;
diff --git a/koboldcpp.py b/koboldcpp.py
index f9384de13..6530d68c8 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -459,6 +459,7 @@ def init_library():
     handle.abort_generate.restype = ctypes.c_bool
     handle.token_count.restype = token_count_outputs
     handle.get_pending_output.restype = ctypes.c_char_p
+    handle.get_chat_template.restype = ctypes.c_char_p
     handle.sd_load_model.argtypes = [sd_load_model_inputs]
     handle.sd_load_model.restype = ctypes.c_bool
     handle.sd_generate.argtypes = [sd_generation_inputs]
@@ -1956,6 +1957,13 @@ Enter Prompt:<br>
             self.send_header("location", self.path)
             self.end_headers(content_type='text/html')
             return None
+        elif self.path.endswith('/props'):
+            ctbytes = handle.get_chat_template()
+            chat_template = ctypes.string_at(ctbytes).decode("UTF-8")
+            # TODO: decide whether to add or skip below settings from llama.cpp /props endpoint.
+            # { "default_generation_settings", ctx_server.default_generation_settings_for_props },
+            # { "total_slots", ctx_server.params.n_parallel },
+            response_body = (json.dumps({"chat_template":chat_template}).encode())
 
         if response_body is None:
             self.send_response(404)
diff --git a/model_adapter.h b/model_adapter.h
index 769829a26..5c448320f 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -91,6 +91,8 @@ enum ModelLoadResult
 ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta file_format_meta);
 generation_outputs gpttype_generate(const generation_inputs inputs);
 bool gpttype_generate_abort();
+std::string gpttype_get_chat_template();
+
 const std::string & gpttype_get_pending_output();
 std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos);
 const std::vector<TopPicksData> gpttype_get_top_picks_data();
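
Usage note (not part of the patch): a minimal client sketch for the new route. Only the /props path and the "chat_template" response key come from the change above; the base URL (KoboldCpp's default http://localhost:5001), the helper name, and the fallback behaviour are assumptions for illustration.

    # fetch_chat_template.py -- hypothetical example, not shipped with KoboldCpp
    import json
    import urllib.request

    def fetch_chat_template(base_url="http://localhost:5001"):
        """Return the model's embedded chat template, or "" if none was found."""
        # GET /props returns {"chat_template": "..."} as added by this patch
        with urllib.request.urlopen(f"{base_url}/props") as resp:
            props = json.loads(resp.read().decode("utf-8"))
        return props.get("chat_template", "")

    if __name__ == "__main__":
        template = fetch_chat_template()
        if template:
            print("Model ships a chat template:")
            print(template)
        else:
            print("No chat template in model metadata; fall back to a manually chosen preset.")

The empty-string fallback mirrors gpttype_get_chat_template(), which returns "" when the tokenizer.chat_template key is absent from the model metadata.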