mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/docker.yml
#	CMakeLists.txt
#	CONTRIBUTING.md
#	docs/android.md
#	docs/docker.md
#	examples/embedding/embedding.cpp
#	examples/imatrix/imatrix.cpp
#	examples/infill/infill.cpp
#	examples/llama-bench/llama-bench.cpp
#	examples/main/README.md
#	examples/parallel/parallel.cpp
#	examples/perplexity/perplexity.cpp
#	examples/quantize-stats/quantize-stats.cpp
#	examples/save-load-state/save-load-state.cpp
#	examples/server/README.md
#	examples/simple/CMakeLists.txt
#	examples/speculative/speculative.cpp
#	flake.lock
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-blas.cpp
#	pocs/vdot/q8dot.cpp
#	pocs/vdot/vdot.cpp
#	scripts/debug-test.sh
#	scripts/sync-ggml.last
#	src/llama.cpp
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
#	tests/test-quantize-fns.cpp
#	tests/test-quantize-perf.cpp
#	tests/test-tokenizer-0.cpp
#	tests/test-tokenizer-1-bpe.cpp
#	tests/test-tokenizer-1-spm.cpp
commit e692a79aab

61 changed files with 2579 additions and 1949 deletions
@@ -189,8 +189,8 @@ struct server_slot {
     // sampling
     json json_schema;
 
-    struct gpt_sampler_params sparams;
-    struct gpt_sampler * smpl = nullptr;
+    struct common_sampler_params sparams;
+    struct common_sampler * smpl = nullptr;
 
     llama_token sampled;
 
@@ -232,7 +232,7 @@ struct server_slot {
         generated_token_probs.clear();
     }
 
-    bool has_budget(gpt_params &global_params) {
+    bool has_budget(common_params &global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
@@ -612,9 +612,9 @@ struct server_response {
 struct server_context {
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-    std::vector<llama_lora_adapter_container> loras;
+    std::vector<common_lora_adapter_container> loras;
 
-    gpt_params params;
+    common_params params;
 
     llama_batch batch = {};
 
@@ -656,20 +656,20 @@ struct server_context {
         // Clear any sampling context
         for (server_slot & slot : slots) {
             if (slot.smpl != nullptr) {
-                gpt_sampler_free(slot.smpl);
+                common_sampler_free(slot.smpl);
             }
         }
 
         llama_batch_free(batch);
     }
 
-    bool load_model(const gpt_params & params_) {
+    bool load_model(const common_params & params_) {
         params = params_;
 
         // dedicate one sequence to the system prompt
         params.n_parallel += 1;
 
-        llama_init_result llama_init = llama_init_from_gpt_params(params);
+        common_init_result llama_init = common_init_from_params(params);
 
         model = llama_init.model;
         ctx = llama_init.context;
@@ -772,10 +772,10 @@ struct server_context {
 
                 std::vector<llama_token> p;
                 if (first) {
-                    p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+                    p = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
                     first = false;
                 } else {
-                    p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                    p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                 }
 
                 prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@@ -789,7 +789,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+            prompt_tokens = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;
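
Note: the tokenization hunks above follow one pattern: the free helpers ::llama_tokenize and (later) llama_token_to_piece are renamed to common_tokenize and common_token_to_piece with unchanged arguments. A minimal sketch of the new call shape, assuming the common library header "common.h" and an already-initialized llama_context; the helper name tokenize_prompt is illustrative and not part of this diff.

// Sketch only: mirrors the renamed helper as used in the hunks above.
#include "common.h"

#include <string>
#include <vector>

static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt) {
    // was: ::llama_tokenize(ctx, prompt, add_special, parse_special)
    return common_tokenize(ctx, prompt, /* add_special */ true, /* parse_special */ true);
}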
@@ -1000,7 +1000,7 @@ struct server_context {
                             slot.sparams.logit_bias.push_back({tok, bias});
                         }
                     } else if (el[0].is_string()) {
-                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                        auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                         for (auto tok : toks) {
                             slot.sparams.logit_bias.push_back({tok, bias});
                         }
@@ -1032,7 +1032,7 @@ struct server_context {
                         sampler_names.emplace_back(name);
                     }
                 }
-                slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+                slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
             } else {
                 slot.sparams.samplers = default_sparams.samplers;
             }
@@ -1040,10 +1040,10 @@ struct server_context {
 
         {
             if (slot.smpl != nullptr) {
-                gpt_sampler_free(slot.smpl);
+                common_sampler_free(slot.smpl);
            }
 
-            slot.smpl = gpt_sampler_init(model, slot.sparams);
+            slot.smpl = common_sampler_init(model, slot.sparams);
             if (slot.smpl == nullptr) {
                 // for now, the only error that may happen here is invalid grammar
                 send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
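
The sampler hunks rename the gpt_sampler_* family to common_sampler_* while keeping the call sites intact: init from common_sampler_params, sample, accept, free (plus reset further below). A rough sketch of that lifecycle, assuming the common library's sampling header and leaving model/context setup out; sample_one is an illustrative helper, not code from the diff.

// Sketch only: the renamed sampler lifecycle as it appears across these hunks.
#include "sampling.h" // assumption: the common library's sampling header

static llama_token sample_one(llama_model * model, llama_context * ctx,
                              const common_sampler_params & sparams, int idx) {
    common_sampler * smpl = common_sampler_init(model, sparams);   // was gpt_sampler_init
    const llama_token id  = common_sampler_sample(smpl, ctx, idx); // was gpt_sampler_sample
    common_sampler_accept(smpl, id, /* accept_grammar */ true);    // was gpt_sampler_accept
    common_sampler_free(smpl);                                     // was gpt_sampler_free
    return id;
}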
@@ -1074,7 +1074,7 @@ struct server_context {
         system_tokens.clear();
 
         if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+            system_tokens = common_tokenize(ctx, system_prompt, true);
 
             const int32_t n_batch = llama_n_batch(ctx);
             const int32_t n_tokens_prompt = system_tokens.size();
@@ -1082,10 +1082,10 @@ struct server_context {
             for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
                 const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
 
-                llama_batch_clear(batch);
+                common_batch_clear(batch);
 
                 for (int32_t j = 0; j < n_tokens; ++j) {
-                    llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
+                    common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
                 }
 
                 if (llama_decode(ctx, batch) != 0) {
@@ -1107,19 +1107,14 @@ struct server_context {
         SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
 
         system_prompt = sys_prompt;
 
-        // release all slots
-        for (server_slot & slot : slots) {
-            slot.release();
-        }
-
         // update system_tokens and KV cache as soon as all slots are idle
         system_need_update = true;
         return true;
     }
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
+        const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
         slot.sampled = result.tok;
 
         // search stop word and delete it
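
The system-prompt hunk above also shows the renamed batch helpers (common_batch_clear / common_batch_add) feeding llama_decode in n_batch-sized chunks. A condensed sketch of that loop, assuming "common.h" and a batch created elsewhere; decode_tokens is an illustrative name, not code from the diff.

// Sketch only: the chunked decode loop from the hunk above, reduced to its essentials.
#include "common.h"

#include <algorithm>
#include <vector>

static bool decode_tokens(llama_context * ctx, llama_batch & batch, const std::vector<llama_token> & tokens) {
    const int32_t n_batch         = llama_n_batch(ctx);
    const int32_t n_tokens_prompt = (int32_t) tokens.size();

    for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
        const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);

        common_batch_clear(batch); // was llama_batch_clear
        for (int32_t j = 0; j < n_tokens; ++j) {
            // token, position, sequence ids, and whether to output logits for this token
            common_batch_add(batch, tokens[i + j], i + j, { 0 }, false); // was llama_batch_add
        }

        if (llama_decode(ctx, batch) != 0) {
            return false; // decode failed
        }
    }
    return true;
}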
@@ -1230,7 +1225,7 @@ struct server_context {
         std::vector<std::string> samplers;
         samplers.reserve(slot.sparams.samplers.size());
         for (const auto & sampler : slot.sparams.samplers) {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+            samplers.emplace_back(common_sampler_type_to_str(sampler));
         }
 
         return json {
@@ -1238,7 +1233,7 @@ struct server_context {
             {"n_predict", slot.n_predict}, // Server configured n_predict
             {"model", params.model_alias},
             {"seed", slot.sparams.seed},
-            {"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
+            {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
             {"temperature", slot.sparams.temp},
             {"dynatemp_range", slot.sparams.dynatemp_range},
             {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@@ -1303,7 +1298,7 @@ struct server_context {
         };
 
         if (slot.sparams.n_probs > 0) {
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
+            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
             const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
 
@@ -1353,7 +1348,7 @@ struct server_context {
         if (slot.sparams.n_probs > 0) {
             std::vector<completion_token_output> probs;
             if (!slot.params.stream && slot.stopped_word) {
-                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
+                const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
 
                 size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
                 probs = std::vector<completion_token_output>(
@@ -1407,7 +1402,7 @@ struct server_context {
                 continue;
             }
 
-            llama_embd_normalize(embd, embd_res.data(), n_embd);
+            common_embd_normalize(embd, embd_res.data(), n_embd);
 
             res.data = json {
                 {"embedding", embd_res},
@@ -1628,16 +1623,6 @@ struct server_context {
                     break;
                 }
 
-                if (task.data.contains("system_prompt")) {
-                    std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
-                    system_prompt_set(sys_prompt);
-
-                    for (server_slot & slot : slots) {
-                        slot.n_past = 0;
-                        slot.n_past_se = 0;
-                    }
-                }
-
                 slot->reset();
 
                 slot->id_task = task.id;
@@ -1851,7 +1836,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SET_LORA:
                 {
-                    llama_lora_adapters_apply(ctx, loras);
+                    common_lora_adapters_apply(ctx, loras);
                     server_task_result result;
                     result.id = task.id;
                     result.stop = true;
@@ -1863,10 +1848,6 @@ struct server_context {
     }
 
     void update_slots() {
-        if (system_need_update) {
-            system_prompt_update();
-        }
-
         // check if all slots are idle
         {
             bool all_idle = true;
@@ -1879,6 +1860,10 @@ struct server_context {
             }
 
             if (all_idle) {
+                if (system_need_update) {
+                    system_prompt_update();
+                }
+
                 SRV_INF("%s", "all slots are idle\n");
                 if (system_prompt.empty() && clean_kv_cache) {
                     kv_cache_clear();
@@ -1937,7 +1922,7 @@ struct server_context {
         }
 
         // start populating the batch for this iteration
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // frist, add sampled tokens from any ongoing sequences
         for (auto & slot : slots) {
@@ -1951,7 +1936,7 @@ struct server_context {
 
             // TODO: we always have to take into account the "system_tokens"
             // this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
 
             slot.n_past += 1;
 
@@ -2108,7 +2093,7 @@ struct server_context {
                         GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                     }
 
-                    gpt_sampler_reset(slot.smpl);
+                    common_sampler_reset(slot.smpl);
 
                     if (!slot.params.cache_prompt) {
                         slot.n_past_se = 0;
@@ -2121,7 +2106,7 @@ struct server_context {
 
                         // push the prompt into the sampling context (do not apply grammar)
                         for (int i = 0; i < slot.n_past; ++i) {
-                            gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
+                            common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
                         }
                     }
                 }
@@ -2175,7 +2160,7 @@ struct server_context {
                         slot.n_past_se = 0;
                         slot.ga_i = 0;
                         // TODO: is the system prompt ever in the sampling context?
-                        gpt_sampler_reset(slot.smpl);
+                        common_sampler_reset(slot.smpl);
                     }
 
                     // remove the non-common part from the cache
@@ -2200,7 +2185,7 @@ struct server_context {
                         }
                     }
 
-                    llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+                    common_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
 
                     if (slot.params.cache_prompt) {
                         slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2338,9 +2323,9 @@ struct server_context {
             }
 
             completion_token_output result;
-            const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+            const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
 
-            gpt_sampler_accept(slot.smpl, id, true);
+            common_sampler_accept(slot.smpl, id, true);
 
             slot.n_decoded += 1;
             if (slot.n_decoded == 1) {
@@ -2351,7 +2336,7 @@ struct server_context {
 
             result.tok = id;
 
-            const auto * cur_p = gpt_sampler_get_candidates(slot.smpl);
+            const auto * cur_p = common_sampler_get_candidates(slot.smpl);
 
             for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                 result.probs.push_back({
@@ -2415,13 +2400,13 @@ inline void signal_handler(int signal) {
 
 int main(int argc, char ** argv) {
     // own arguments required by this example
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     // enabling this will output extra debug information in the HTTP responses from the server
     // see format_final_response_oaicompat()
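
The main() hunk shows the renamed startup path: common_params replaces gpt_params, common_params_parse replaces gpt_params_parse, and common_init replaces gpt_init; combined with the earlier load_model hunk, model creation goes through common_init_from_params. A trimmed sketch of that sequence, assuming "common.h" and with all server wiring omitted.

// Sketch only: the renamed initialization path, error handling and server setup trimmed.
#include "common.h"

int main(int argc, char ** argv) {
    common_params params;                                                 // was gpt_params
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { // was gpt_params_parse
        return 1;
    }

    common_init();                                                        // was gpt_init

    common_init_result llama_init = common_init_from_params(params);      // was llama_init_from_gpt_params
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;

    return (model != nullptr && ctx != nullptr) ? 0 : 1;
}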
@@ -2443,7 +2428,7 @@ int main(int argc, char ** argv) {
 
     LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
     LOG_INF("\n");
-    LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     LOG_INF("\n");
 
     std::unique_ptr<httplib::Server> svr;
@@ -2537,20 +2522,10 @@ int main(int argc, char ** argv) {
     //
 
     auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
-        // TODO: should we apply API key to all endpoints, including "/health" and "/models"?
-        static const std::unordered_set<std::string> protected_endpoints = {
-            "/props",
-            "/completion",
-            "/completions",
-            "/v1/completions",
-            "/chat/completions",
-            "/v1/chat/completions",
-            "/infill",
-            "/tokenize",
-            "/detokenize",
-            "/embedding",
-            "/embeddings",
-            "/v1/embeddings",
+        static const std::unordered_set<std::string> public_endpoints = {
+            "/health",
+            "/models",
+            "/v1/models",
         };
 
         // If API key is not set, skip validation
@@ -2558,8 +2533,8 @@ int main(int argc, char ** argv) {
             return true;
         }
 
-        // If path is not in protected_endpoints list, skip validation
-        if (protected_endpoints.find(req.path) == protected_endpoints.end()) {
+        // If path is public, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end()) {
             return true;
         }
 
@@ -2621,7 +2596,7 @@ int main(int argc, char ** argv) {
 
     const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) {
         if (!params.endpoint_slots) {
-            res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED));
+            res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
 
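
The middleware hunks invert the API-key check: instead of enumerating every protected route, only a short list of public endpoints (/health, /models, /v1/models) skips validation and everything else requires a key. A self-contained sketch of that predicate; needs_api_key and its parameters are illustrative stand-ins for the server's httplib wiring, not code from the diff.

// Sketch only: the inverted endpoint check introduced above.
#include <string>
#include <unordered_set>
#include <vector>

static bool needs_api_key(const std::string & req_path, const std::vector<std::string> & api_keys) {
    static const std::unordered_set<std::string> public_endpoints = {
        "/health",
        "/models",
        "/v1/models",
    };

    if (api_keys.empty()) {
        return false; // no key configured, nothing to validate
    }

    // every endpoint except the public ones requires a valid key
    return public_endpoints.find(req_path) == public_endpoints.end();
}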
@@ -2870,24 +2845,31 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
-        std::string template_key = "tokenizer.chat_template", curr_tmpl;
-        int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
-        if (tlen > 0) {
-            std::vector<char> curr_tmpl_buf(tlen + 1, 0);
-            if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
-                curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
-            }
-        }
         json data = {
-            { "system_prompt", ctx_server.system_prompt.c_str() },
+            { "system_prompt", ctx_server.system_prompt },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params.n_parallel },
-            { "chat_template", curr_tmpl.c_str() },
+            { "chat_template", llama_get_chat_template(ctx_server.model) },
         };
 
         res_ok(res, data);
     };
 
+    const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        if (!ctx_server.params.endpoint_props) {
+            res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        json data = json::parse(req.body);
+        if (data.contains("system_prompt")) {
+            std::string system_prompt = data.at("system_prompt");
+            ctx_server.system_prompt_set(system_prompt);
+        }
+
+        res_ok(res, {{ "success", true }});
+    };
+
     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) {
         if (ctx_server.params.embedding || ctx_server.params.reranking) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
@@ -3033,7 +3015,7 @@ int main(int argc, char ** argv) {
 
         if (with_pieces) {
             for (const auto& token : tokens) {
-                std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                std::string piece = common_token_to_piece(ctx_server.ctx, token);
                 json piece_json;
 
                 // Check if the piece is valid UTF-8
@@ -3266,30 +3248,39 @@ int main(int argc, char ** argv) {
         svr->set_base_dir(params.public_path);
     }
 
-    // using embedded static files
-    svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
-    svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
-
-    // add new-ui files
-    svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
-    svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
-    svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
-    svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+    if (!params.api_keys.empty()) {
+        // for now, if API key is set, web UI is unusable
+        svr->Get("/", [&](const httplib::Request &, httplib::Response & res) {
+            return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
+        });
+    } else {
+        // using embedded static files
+        svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+        svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+
+        // add new-ui files
+        svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
+        svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
+        svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
+        svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+    }
 
     // register API routes
-    svr->Get ("/health", handle_health);
+    svr->Get ("/health", handle_health); // public endpoint (no API key check)
     svr->Get ("/metrics", handle_metrics);
     svr->Get ("/props", handle_props);
-    svr->Get ("/v1/models", handle_models);
+    svr->Post("/props", handle_props_change);
+    svr->Get ("/models", handle_models); // public endpoint (no API key check)
+    svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
     svr->Post("/completion", handle_completions); // legacy
     svr->Post("/completions", handle_completions);
     svr->Post("/v1/completions", handle_completions);
@@ -3367,7 +3358,7 @@ int main(int argc, char ** argv) {
     }
 
     // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str());
+    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
 
     ctx_server.queue_tasks.on_new_task(std::bind(
         &server_context::process_single_task, &ctx_server, std::placeholders::_1));