Merge branch 'master' into concedo_experimental

# Conflicts:
#	.devops/nix/package.nix
#	.github/workflows/build.yml
#	.gitignore
#	CMakeLists.txt
#	Makefile
#	README.md
#	ci/run.sh
#	flake.lock
#	flake.nix
#	scripts/get-flags.mk
#	scripts/get-wikitext-2.sh
#	scripts/sync-ggml.last
#	tests/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-grammar-parser.cpp
#	tests/test-llama-grammar.cpp
This commit is contained in:
Concedo 2024-02-20 16:30:21 +08:00
commit f0a662112b
34 changed files with 2394 additions and 753 deletions

View file

@ -29,6 +29,7 @@
#include <chrono>
#include <condition_variable>
#include <atomic>
#include <signal.h>
using json = nlohmann::json;
@ -41,6 +42,7 @@ struct server_params
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
bool slots_endpoint = true;
};
bool server_verbose = false;
@ -159,6 +161,7 @@ struct llama_client_slot
int32_t n_decoded = 0;
int32_t n_remaining = -1;
int32_t i_batch = -1;
int32_t n_predict = -1;
int32_t num_prompt_tokens = 0;
int32_t num_prompt_tokens_processed = 0;
@ -410,6 +413,7 @@ struct llama_server_context
slot.id = i;
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
@ -545,6 +549,16 @@ struct llama_server_context
slot->params.seed = json_value(data, "seed", default_params.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
// Might be better to reject the request with a 400 ?
LOG_WARNING("Max tokens to predict exceeds server configuration", {
{"params.n_predict", slot->params.n_predict},
{"slot.n_predict", slot->n_predict},
});
slot->params.n_predict = slot->n_predict;
}
// infill
if (data.count("input_prefix") != 0)
@ -1053,6 +1067,7 @@ struct llama_server_context
return json {
{"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict},
{"model", params.model_alias},
{"seed", slot.params.seed},
{"temperature", slot.sparams.temp},
@ -1080,6 +1095,7 @@ struct llama_server_context
{"stream", slot.params.stream},
{"logit_bias", slot.sparams.logit_bias},
{"n_probs", slot.sparams.n_probs},
{"min_keep", slot.sparams.min_keep},
{"grammar", slot.sparams.grammar},
{"samplers", samplers_sequence}
};
@ -1914,14 +1930,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
printf("\n");
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
printf(" --chat-template FORMAT_NAME");
printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
printf("\n");
}
@ -2361,6 +2379,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {});
}
else if (arg == "--slots-endpoint-disable")
{
sparams.slots_endpoint = false;
}
else if (arg == "--chat-template")
{
if (++i >= argc)
@ -2512,6 +2534,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
}
std::function<void(int)> shutdown_handler;
inline void signal_handler(int signal) { shutdown_handler(signal); }
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@ -2558,13 +2583,40 @@ int main(int argc, char **argv)
res.set_header("Access-Control-Allow-Headers", "*");
});
svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
server_state current_state = state.load();
switch(current_state) {
case SERVER_STATE_READY:
res.set_content(R"({"status": "ok"})", "application/json");
res.status = 200; // HTTP OK
case SERVER_STATE_READY: {
int available_slots = 0;
int processing_slots = 0;
for (llama_client_slot &slot: llama.slots) {
if (slot.available()) {
available_slots++;
} else {
processing_slots++;
}
}
if (available_slots > 0) {
json health = {
{"status", "ok"},
{"slots_idle", available_slots},
{"slots_processing", processing_slots}};
res.set_content(health.dump(), "application/json");
res.status = 200; // HTTP OK
} else {
json health = {
{"status", "no slot available"},
{"slots_idle", available_slots},
{"slots_processing", processing_slots}};
res.set_content(health.dump(), "application/json");
if (req.has_param("fail_on_no_slot")) {
res.status = 503; // HTTP Service Unavailable
} else {
res.status = 200; // HTTP OK
}
}
break;
}
case SERVER_STATE_LOADING_MODEL:
res.set_content(R"({"status": "loading model"})", "application/json");
res.status = 503; // HTTP Service Unavailable
@ -2576,6 +2628,32 @@ int main(int argc, char **argv)
}
});
if (sparams.slots_endpoint) {
svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
json slots;
for (llama_client_slot & slot : llama.slots) {
json slot_data = llama.get_formated_generation(slot);
slot_data["id"] = slot.id;
slot_data["task_id"] = slot.task_id;
slot_data["state"] = slot.state;
slot_data["prompt"] = slot.prompt;
slot_data["next_token"] = {
{"has_next_token", slot.has_next_token},
{"n_remain", slot.n_remaining},
{"num_tokens_predicted", slot.n_decoded},
{"stopped_eos", slot.stopped_eos},
{"stopped_word", slot.stopped_word},
{"stopped_limit", slot.stopped_limit},
{"stopping_word", slot.stopping_word},
};
slots.push_back(slot_data);
}
res.set_content(slots.dump(), "application/json");
res.status = 200; // HTTP OK
});
}
svr.set_logger(log_server_request);
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
@ -3129,8 +3207,25 @@ int main(int argc, char **argv)
std::placeholders::_2,
std::placeholders::_3
));
llama.queue_tasks.start_loop();
shutdown_handler = [&](int) {
llama.queue_tasks.terminate();
};
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
llama.queue_tasks.start_loop();
svr.stop();
t.join();
llama_backend_free();