diff --git a/tools/server/README.md b/tools/server/README.md index 62f918ce4..024760da6 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1651,6 +1651,7 @@ Note: 2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow: - If a model is running but updated or removed from the source, it will be unloaded - If a model is not running, it will be added or updated according to the source +3. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance. The `status` object can be: diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 4b28033d9..3f20c94c5 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3926,22 +3926,7 @@ void server_routes::init_routes() { }}, {"object", "list"}, {"data", { - { - {"id", meta->model_name}, - {"aliases", meta->model_aliases}, - {"tags", meta->model_tags}, - {"object", "model"}, - {"created", std::time(0)}, - {"owned_by", "llamacpp"}, - {"meta", { - {"vocab_type", meta->model_vocab_type}, - {"n_vocab", meta->model_vocab_n_tokens}, - {"n_ctx_train", meta->model_n_ctx_train}, - {"n_embd", meta->model_n_embd_inp}, - {"n_params", meta->model_n_params}, - {"size", meta->model_size}, - }}, - }, + get_model_info(), }} }; @@ -4155,6 +4140,26 @@ void server_routes::init_routes() { }; } +json server_routes::get_model_info() const { + return json { + {"id", meta->model_name}, + {"aliases", meta->model_aliases}, + {"tags", meta->model_tags}, + {"object", "model"}, + {"created", std::time(0)}, + {"owned_by", "llamacpp"}, + {"meta", { + {"vocab_type", meta->model_vocab_type}, + {"n_vocab", meta->model_vocab_n_tokens}, + {"n_ctx", meta->slot_n_ctx}, + {"n_ctx_train", meta->model_n_ctx_train}, + {"n_embd", meta->model_n_embd_inp}, + {"n_params", meta->model_n_params}, + {"size", meta->model_size}, + }}, + }; +} + std::unique_ptr server_routes::handle_slots_save(const server_http_req & req, int id_slot) { auto res = create_response(); const json request_data = json::parse(req.body); diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 37f10dc77..58dda8914 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -122,6 +122,10 @@ struct server_routes { server_http_context::handler_t post_rerank; server_http_context::handler_t get_lora_adapters; server_http_context::handler_t post_lora_adapters; + + // to be used in router mode + json get_model_info() const; + private: std::unique_ptr handle_completions_impl( const server_http_req & req, diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 5a05ca203..e60e1abc3 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -44,6 +44,7 @@ extern char **environ; #define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit" #define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep #define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep" +#define CMD_CHILD_TO_ROUTER_INFO "cmd_child_to_router:info:" // followed by json string // address for child process, this is needed because router may run on 0.0.0.0 // ref: https://github.com/ggml-org/llama.cpp/issues/17862 @@ -718,10 +719,11 @@ void server_models::load(const std::string & name) { // prepare new instance info instance_t inst; - inst.meta = meta; - inst.meta.port = get_free_port(); - inst.meta.status = SERVER_MODEL_STATUS_LOADING; - inst.meta.last_used = ggml_time_ms(); + inst.meta = meta; + inst.meta.port = get_free_port(); + inst.meta.status = SERVER_MODEL_STATUS_LOADING; + inst.meta.loaded_info = json{}; + inst.meta.last_used = ggml_time_ms(); if (inst.meta.port <= 0) { throw std::runtime_error("failed to get a port number"); @@ -767,12 +769,14 @@ void server_models::load(const std::string & name) { // read stdout/stderr and forward to main server log // also handle status report from child process if (stdout_file) { - char buffer[4096]; + char buffer[128 * 1024]; // large buffer for storing info while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) { LOG("[%5d] %s", port, buffer); std::string str(buffer); if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0); + } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) { + this->update_loaded_info(name, str); } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) { this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0); } @@ -916,6 +920,29 @@ void server_models::update_status(const std::string & name, server_model_status cv.notify_all(); } +void server_models::update_loaded_info(const std::string & name, std::string & raw_info) { + if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) { + SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str()); + return; + } + + json info; + try { + info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO))); + } catch (const std::exception & e) { + SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what()); + return; + } + + std::unique_lock lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + auto & meta = it->second.meta; + meta.loaded_info = info; + } + cv.notify_all(); +} + void server_models::wait_until_loading_finished(const std::string & name) { std::unique_lock lk(mutex); cv.wait(lk, [this, &name]() { @@ -994,12 +1021,14 @@ bool server_models::is_child_server() { return router_port != nullptr; } -std::thread server_models::setup_child_server(const std::function & shutdown_handler) { +std::thread server_models::setup_child_server(const std::function & shutdown_handler, const json & model_info) { // send a notification to the router server that a model instance is ready common_log_pause(common_log_main()); fflush(stdout); fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY); fflush(stdout); + fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str()); + fflush(stdout); common_log_resume(common_log_main()); // setup thread for monitoring stdin @@ -1176,7 +1205,8 @@ void server_models_routes::init_routes() { status["exit_code"] = meta.exit_code; status["failed"] = true; } - models_json.push_back(json { + + json model_info = json { {"id", meta.name}, {"aliases", meta.aliases}, {"tags", meta.tags}, @@ -1185,7 +1215,17 @@ void server_models_routes::init_routes() { {"created", t}, // for OAI-compat {"status", status}, // TODO: add other fields, may require reading GGUF metadata - }); + }; + + // merge with loaded_info from the child process if available + if (meta.is_running()) { + for (auto it = meta.loaded_info.begin(); it != meta.loaded_info.end(); ++it) { + if (!model_info.contains(it.key())) { + model_info[it.key()] = it.value(); + } + } + } + models_json.push_back(model_info); } res_ok(res, { {"data", models_json}, diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 64a15f5ba..2267de588 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -63,6 +63,7 @@ struct server_model_meta { server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading std::vector args; // args passed to the model instance, will be populated by render_args() + json loaded_info; // info to be reflected via /v1/models endpoint int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown @@ -145,6 +146,7 @@ public: // update the status of a model instance (thread-safe) void update_status(const std::string & name, server_model_status status, int exit_code); + void update_loaded_info(const std::string & name, std::string & raw_info); // wait until the model instance is fully loaded (thread-safe) // return when the model no longer in "loading" state @@ -163,7 +165,7 @@ public: // notify the router server that a model instance is ready // return the monitoring thread (to be joined by the caller) - static std::thread setup_child_server(const std::function & shutdown_handler); + static std::thread setup_child_server(const std::function & shutdown_handler, const json & model_info); // notify the router server that the sleeping state has changed static void notify_router_sleeping_state(bool sleeping); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0508f79d7..77fb7b23b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -334,7 +334,8 @@ int main(int argc, char ** argv) { // optionally, notify router server that this instance is ready std::thread monitor_thread; if (server_models::is_child_server()) { - monitor_thread = server_models::setup_child_server(shutdown_handler); + json model_info = routes.get_model_info(); + monitor_thread = server_models::setup_child_server(shutdown_handler, model_info); } // this call blocks the main thread until queue_tasks.terminate() is called