Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	Makefile
#	README.md
#	common/common.cpp
#	requirements/requirements-convert-hf-to-gguf-update.txt
#	requirements/requirements-convert-hf-to-gguf.txt
#	requirements/requirements-convert.txt
#	tests/CMakeLists.txt
#	tests/test-json-schema-to-grammar.cpp
Commit: d084f78faa
Author: Concedo
Date:   2024-05-09 15:13:34 +08:00

48 changed files with 3786 additions and 1438 deletions
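
The bulk of this diff swaps nlohmann::json's operator[] for .at() when reading request fields in the server code. On a const json object, operator[] with a missing key is undefined behavior (guarded only by the library's JSON_ASSERT hook), whereas .at() throws json::out_of_range, which the caller can catch and turn into a clean error response. A minimal, self-contained sketch of the difference (the key names are illustrative):

#include <iostream>
#include <string>
#include "json.hpp" // nlohmann::json, vendored by the server as json.hpp

using json = nlohmann::json;

int main() {
    const json data = {{"prompt", "hello"}};

    // data["grammar"] on a const json with a missing key is undefined behavior
    // (it trips JSON_ASSERT, which this commit routes to GGML_ASSERT).
    // data.at("grammar") throws instead, so the caller can recover:
    try {
        std::string g = data.at("grammar");
        std::cout << g << "\n";
    } catch (const json::out_of_range & e) {
        std::cerr << "missing key: " << e.what() << "\n"; // clean error path
    }
}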

@@ -13,6 +13,8 @@
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 // auto generated files (update with ./deps.sh)
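
The JSON_ASSERT override above only takes effect because the #define comes before the #include: json.hpp defaults JSON_ASSERT to plain assert() only when it is not already defined. Unlike assert(), GGML_ASSERT also fires in release (NDEBUG) builds, so json.hpp's internal invariant checks abort loudly instead of being compiled out. Simplified, the pattern inside json.hpp looks like this:

// Simplified sketch of json.hpp's assert hook: defaulted only when the
// including translation unit has not already provided its own definition.
#if !defined(JSON_ASSERT)
    #include <cassert>
    #define JSON_ASSERT(x) assert(x)
#endif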
@@ -860,7 +862,7 @@ struct server_context {
     slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
 
     // process "json_schema" and "grammar"
-    if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
+    if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
         send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
         return false;
     } else if (data.contains("json_schema") && !data.contains("grammar")) {
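
The else-if branch is cut off by the hunk; it handles the case where only "json_schema" is supplied, which llama.cpp resolves by converting the schema into a GBNF grammar via the json_schema_to_grammar() helper from common/. A hedged, standalone sketch of that conversion (the schema is illustrative, and the helper's exact parameter type is assumed from common/json-schema-to-grammar.h):

#include <iostream>
#include "json-schema-to-grammar.h" // llama.cpp common/ helper
#include "json.hpp"

int main() {
    // Convert a JSON schema into a GBNF grammar string, as the truncated
    // else-if branch does when only "json_schema" is present in the request.
    const nlohmann::ordered_json schema = {
        {"type", "object"},
        {"properties", {{"name", {{"type", "string"}}}}}
    };
    std::cout << json_schema_to_grammar(schema) << "\n";
}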
@@ -1513,7 +1515,7 @@ struct server_context {
     // add subtasks
     for (int i = 0; i < prompt_count; i++) {
         json subtask_data = multiprompt_task.data;
-        subtask_data["prompt"] = subtask_data["prompt"][i];
+        subtask_data["prompt"] = subtask_data.at("prompt")[i];
 
         // subtasks inherit everything else (infill mode, embedding mode, etc.)
         request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
@@ -1533,7 +1535,7 @@ struct server_context {
     }
 
     if (task.data.contains("system_prompt")) {
-        system_prompt_set(task.data["system_prompt"]);
+        system_prompt_set(task.data.at("system_prompt"));
 
         for (server_slot & slot : slots) {
             slot.n_past = 0;
@@ -1645,7 +1647,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_SAVE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1655,8 +1657,8 @@ struct server_context {
                     const size_t token_count = slot->cache_tokens.size();
                     const int64_t t_start = ggml_time_us();
 
-                    std::string filename = task.data["filename"];
-                    std::string filepath = task.data["filepath"];
+                    std::string filename = task.data.at("filename");
+                    std::string filepath = task.data.at("filepath");
 
                     const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
@@ -1680,7 +1682,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_RESTORE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1689,8 +1691,8 @@ struct server_context {
                     const int64_t t_start = ggml_time_us();
 
-                    std::string filename = task.data["filename"];
-                    std::string filepath = task.data["filepath"];
+                    std::string filename = task.data.at("filename");
+                    std::string filepath = task.data.at("filepath");
 
                     slot->cache_tokens.resize(slot->n_ctx);
                     size_t token_count = 0;
@@ -1722,7 +1724,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_ERASE:
                 {
-                    int id_slot = task.data["id_slot"];
+                    int id_slot = task.data.at("id_slot");
                     server_slot * slot = get_slot(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -3137,8 +3139,8 @@ int main(int argc, char ** argv) {
         server_task_result result = ctx_server.queue_results.recv(task.id);
         ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-        const int n_idle_slots = result.data["idle"];
-        const int n_processing_slots = result.data["processing"];
+        const int n_idle_slots = result.data.at("idle");
+        const int n_processing_slots = result.data.at("processing");
 
         json health = {
             {"status", "ok"},
@@ -3148,7 +3150,7 @@ int main(int argc, char ** argv) {
         res.status = 200; // HTTP OK
 
         if (sparams.slots_endpoint && req.has_param("include_slots")) {
-            health["slots"] = result.data["slots"];
+            health["slots"] = result.data.at("slots");
         }
 
         if (n_idle_slots == 0) {
@@ -3192,7 +3194,7 @@ int main(int argc, char ** argv) {
         server_task_result result = ctx_server.queue_results.recv(task.id);
         ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-        res.set_content(result.data["slots"].dump(), "application/json");
+        res.set_content(result.data.at("slots").dump(), "application/json");
         res.status = 200; // HTTP OK
     };
@@ -3219,32 +3221,32 @@ int main(int argc, char ** argv) {
         json data = result.data;
 
-        const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
-        const uint64_t t_prompt_processing = data["t_prompt_processing"];
+        const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
+        const uint64_t t_prompt_processing = data.at("t_prompt_processing");
 
-        const uint64_t n_tokens_predicted = data["n_tokens_predicted"];
-        const uint64_t t_tokens_generation = data["t_tokens_generation"];
+        const uint64_t n_tokens_predicted = data.at("n_tokens_predicted");
+        const uint64_t t_tokens_generation = data.at("t_tokens_generation");
 
-        const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+        const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
 
         // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
         json all_metrics_def = json {
             {"counter", {{
                     {"name", "prompt_tokens_total"},
                     {"help", "Number of prompt tokens processed."},
-                    {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+                    {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")}
             }, {
                     {"name", "prompt_seconds_total"},
                     {"help", "Prompt process time"},
-                    {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
+                    {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
            }, {
                    {"name", "tokens_predicted_total"},
                    {"help", "Number of generation tokens processed."},
-                    {"value", (uint64_t) data["n_tokens_predicted_total"]}
+                    {"value", (uint64_t) data.at("n_tokens_predicted_total")}
            }, {
                    {"name", "tokens_predicted_seconds_total"},
                    {"help", "Predict process time"},
-                    {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
+                    {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
            }}},
            {"gauge", {{
                    {"name", "prompt_tokens_seconds"},
@@ -3261,15 +3263,15 @@ int main(int argc, char ** argv) {
            },{
                    {"name", "kv_cache_tokens"},
                    {"help", "KV-cache tokens."},
-                    {"value", (uint64_t) data["kv_cache_tokens_count"]}
+                    {"value", (uint64_t) data.at("kv_cache_tokens_count")}
            },{
                    {"name", "requests_processing"},
                    {"help", "Number of request processing."},
-                    {"value", (uint64_t) data["processing"]}
+                    {"value", (uint64_t) data.at("processing")}
            },{
                    {"name", "requests_deferred"},
                    {"help", "Number of request deferred."},
-                    {"value", (uint64_t) data["deferred"]}
+                    {"value", (uint64_t) data.at("deferred")}
            }}}
        };
@@ -3280,8 +3282,8 @@ int main(int argc, char ** argv) {
            const auto & metrics_def = el.value();
 
            for (const auto & metric_def : metrics_def) {
-                const std::string name = metric_def["name"];
-                const std::string help = metric_def["help"];
+                const std::string name = metric_def.at("name");
+                const std::string help = metric_def.at("help");
                auto value = json_value(metric_def, "value", 0.);
 
                prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
@@ -3290,7 +3292,7 @@ int main(int argc, char ** argv) {
            }
        }
 
-        const int64_t t_start = data["t_start"];
+        const int64_t t_start = data.at("t_start");
        res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
 
        res.set_content(prometheus.str(), "text/plain; version=0.0.4");
@@ -3299,7 +3301,7 @@ int main(int argc, char ** argv) {
    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-        std::string filename = request_data["filename"];
+        std::string filename = request_data.at("filename");
        if (!validate_file_name(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3329,7 +3331,7 @@ int main(int argc, char ** argv) {
    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-        std::string filename = request_data["filename"];
+        std::string filename = request_data.at("filename");
        if (!validate_file_name(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3648,7 +3650,8 @@ int main(int argc, char ** argv) {
        std::vector<llama_token> tokens;
        if (body.count("content") != 0) {
-            tokens = ctx_server.tokenize(body["content"], false);
+            const bool add_special = json_value(body, "add_special", false);
+            tokens = ctx_server.tokenize(body.at("content"), add_special);
        }
 
        const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json; charset=utf-8");
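
The new add_special flag is read with json_value(), the server's tolerant lookup helper: unlike .at(), it falls back to a default when the key is absent or null, which is the right behavior for an optional request field. An illustrative re-sketch of that helper (the real one lives in the server's utility header; this version only approximates it):

#include <string>
#include "json.hpp"

using json = nlohmann::json;

// Approximate re-sketch of json_value(): return the value stored under `key`,
// or `default_value` when the key is missing, null, or of the wrong type.
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
    if (body.contains(key) && !body.at(key).is_null()) {
        try {
            return body.at(key).get<T>();
        } catch (const json::type_error &) {
            return default_value;
        }
    }
    return default_value;
}

int main() {
    const json body = {{"content", "hello world"}};
    const bool add_special = json_value(body, "add_special", false); // -> false
    (void) add_special;
}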
@@ -3660,7 +3663,7 @@ int main(int argc, char ** argv) {
        std::string content;
        if (body.count("tokens") != 0) {
-            const std::vector<llama_token> tokens = body["tokens"];
+            const std::vector<llama_token> tokens = body.at("tokens");
            content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
        }
@@ -3683,10 +3686,10 @@ int main(int argc, char ** argv) {
        json prompt;
        if (body.count("input") != 0) {
            is_openai = true;
-            prompt = body["input"];
+            prompt = body.at("input");
        } else if (body.count("content") != 0) {
            // with "content", we only support single prompt
-            prompt = std::vector<std::string>{body["content"]};
+            prompt = std::vector<std::string>{body.at("content")};
        } else {
            res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3705,7 +3708,7 @@ int main(int argc, char ** argv) {
        if (!result.error) {
            if (result.data.count("results")) {
                // result for multi-task
-                responses = result.data["results"];
+                responses = result.data.at("results");
            } else {
                // result for single task
                responses = std::vector<json>{result.data};
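
One practical consequence of the switch to .at() throughout: a malformed request now surfaces as a thrown nlohmann exception rather than a tripped assert. cpp-httplib lets a server convert exceptions that escape a handler into HTTP error responses through set_exception_handler(); a minimal, self-contained sketch of that pattern (the route and messages are illustrative, not the server's actual wiring):

#include "httplib.h"
#include "json.hpp"

using json = nlohmann::json;

int main() {
    httplib::Server svr;

    svr.Post("/echo-field", [](const httplib::Request & req, httplib::Response & res) {
        const json body = json::parse(req.body); // may throw json::parse_error
        // .at() throws json::out_of_range when "field" is missing
        res.set_content(body.at("field").dump(), "application/json");
    });

    // Convert any exception escaping a handler into a 500 + JSON error body
    svr.set_exception_handler([](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) {
        std::string msg = "unknown error";
        try {
            std::rethrow_exception(ep);
        } catch (const std::exception & e) {
            msg = e.what();
        } catch (...) {
        }
        res.status = 500;
        res.set_content(json{{"error", msg}}.dump(), "application/json");
    });

    svr.listen("127.0.0.1", 8080);
}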