Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 09:04:36 +00:00)
API: add /props route (#1222)
* API: add an /extra/chat_template route

  A lot of manual tweaking is done when swapping between models. We can automate, or at least make better assumptions about, some of that by exposing more information, such as the chat template. This PR adds an endpoint /extra/chat_template which returns the model's chat template string as-is under a 'chat_template' key. The front end can then use it to derive the proper templates, use it directly, or at least warn the user when they try to use e.g. a Mistral preset with a Llama 3.1 model (a client-side sketch of this follows below).

* switch to the pre-established /props endpoint for the chat template

* bug-fix (upstream): off-by-one in string juggling
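A minimal client-side sketch (not part of this commit) of the front-end use described above: fetch the chat template from the new /props route and make a rough guess at the matching instruct preset. It assumes a KoboldCpp server is already running on the default http://localhost:5001, and the marker strings are only illustrative heuristics.

import json
import urllib.request

# Query the /props route added by this commit; the response is a JSON object
# with a 'chat_template' key holding the raw template string (may be empty).
with urllib.request.urlopen("http://localhost:5001/props") as resp:
    props = json.loads(resp.read().decode("utf-8"))

chat_template = props.get("chat_template", "")

# Very rough preset detection based on well-known template markers.
if not chat_template:
    print("Model has no tokenizer.chat_template metadata; pick a preset manually.")
elif "<|start_header_id|>" in chat_template:
    print("Looks like a Llama 3.x style template.")
elif "[INST]" in chat_template:
    print("Looks like a Mistral-style [INST] template.")
else:
    print("Unrecognized template, showing it as-is:")
    print(chat_template)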
This commit is contained in:
parent 8ab3eb89a8
commit 547ab2aebb
4 changed files with 35 additions and 0 deletions
expose.cpp (10 additions)

@@ -275,6 +275,16 @@ extern "C"
         return (int)last_stop_reason;
     }
+
+    const char* get_chat_template() {
+        // we need to keep this around
+        static std::string* ct = nullptr;
+        if (ct == nullptr) {
+            ct = new std::string();
+        }
+        *ct = gpttype_get_chat_template();
+        return ct->c_str();
+    }
 
     const char* get_pending_output() {
         return gpttype_get_pending_output().c_str();
     }
@@ -2491,6 +2491,21 @@ bool gpttype_generate_abort()
     return true;
 }
 
+std::string gpttype_get_chat_template()
+{
+    // copied from examples/server/utils.hpp::llama_get_chat_template
+    std::string template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(&llama_ctx_v4->model, template_key.c_str(), NULL, 0);
+    if (res < 0) {
+        return "";
+    }
+
+    std::vector<char> model_template(res + 1, 0);
+    llama_model_meta_val_str(&llama_ctx_v4->model, template_key.c_str(), model_template.data(), model_template.size());
+    return std::string(model_template.data(), model_template.size() - 1);
+}
+
 std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
 {
     std::vector<int> toks;
@@ -459,6 +459,7 @@ def init_library():
     handle.abort_generate.restype = ctypes.c_bool
     handle.token_count.restype = token_count_outputs
     handle.get_pending_output.restype = ctypes.c_char_p
+    handle.get_chat_template.restype = ctypes.c_char_p
     handle.sd_load_model.argtypes = [sd_load_model_inputs]
     handle.sd_load_model.restype = ctypes.c_bool
     handle.sd_generate.argtypes = [sd_generation_inputs]
@@ -1956,6 +1957,13 @@ Enter Prompt:<br>
             self.send_header("location", self.path)
             self.end_headers(content_type='text/html')
             return None
+        elif self.path.endswith('/props'):
+            ctbytes = handle.get_chat_template()
+            chat_template = ctypes.string_at(ctbytes).decode("UTF-8")
+            # TODO: decide whether to add or skip below settings from llama.cpp /props endpoint.
+            # { "default_generation_settings", ctx_server.default_generation_settings_for_props },
+            # { "total_slots", ctx_server.params.n_parallel },
+            response_body = (json.dumps({"chat_template":chat_template}).encode())
 
         if response_body is None:
             self.send_response(404)
@@ -91,6 +91,8 @@ enum ModelLoadResult
 ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta file_format_meta);
 generation_outputs gpttype_generate(const generation_inputs inputs);
 bool gpttype_generate_abort();
+std::string gpttype_get_chat_template();
+
 const std::string & gpttype_get_pending_output();
 std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos);
 const std::vector<TopPicksData> gpttype_get_top_picks_data();