server: (router) expose child model info from router's /v1/models (#22683)

* server: (router) expose child model info from router's /v1/models

* update docs
Xuan-Son Nguyen 2026-05-08 14:42:15 +02:00 committed by GitHub
parent 58e68df0f9
commit 9dcf835528
6 changed files with 79 additions and 26 deletions


@@ -1651,6 +1651,7 @@ Note:
2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
- If a model is running but updated or removed from the source, it will be unloaded
- If a model is not running, it will be added or updated according to the source
3. When a model is loaded, the info from its `/v1/models` endpoint is forwarded to the router's `/v1/models`. This includes metadata about the model and the runtime instance.
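   For illustration only (all values below are hypothetical, and the `status` object is left empty here since it is described below), a merged entry for a running model could look roughly like this:

   ```json
   {
     "id": "my-model",
     "aliases": [],
     "tags": [],
     "object": "model",
     "owned_by": "llamacpp",
     "created": 1746700000,
     "status": {},
     "meta": {
       "vocab_type": 2,
       "n_vocab": 151936,
       "n_ctx": 4096,
       "n_ctx_train": 32768,
       "n_embd": 3584,
       "n_params": 7615616512,
       "size": 4920000000
     }
   }
   ```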
The `status` object can be:


@@ -3926,22 +3926,7 @@ void server_routes::init_routes() {
}},
{"object", "list"},
{"data", {
{
{"id", meta->model_name},
{"aliases", meta->model_aliases},
{"tags", meta->model_tags},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
{"meta", {
{"vocab_type", meta->model_vocab_type},
{"n_vocab", meta->model_vocab_n_tokens},
{"n_ctx_train", meta->model_n_ctx_train},
{"n_embd", meta->model_n_embd_inp},
{"n_params", meta->model_n_params},
{"size", meta->model_size},
}},
},
get_model_info(),
}}
};
@@ -4155,6 +4140,26 @@ void server_routes::init_routes() {
};
}
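// build the per-model info JSON: used by the /v1/models handler above, and in
// router mode passed to server_models::setup_child_server() so the router can expose it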
json server_routes::get_model_info() const {
return json {
{"id", meta->model_name},
{"aliases", meta->model_aliases},
{"tags", meta->model_tags},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
{"meta", {
{"vocab_type", meta->model_vocab_type},
{"n_vocab", meta->model_vocab_n_tokens},
{"n_ctx", meta->slot_n_ctx},
{"n_ctx_train", meta->model_n_ctx_train},
{"n_embd", meta->model_n_embd_inp},
{"n_params", meta->model_n_params},
{"size", meta->model_size},
}},
};
}
std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const server_http_req & req, int id_slot) {
auto res = create_response();
const json request_data = json::parse(req.body);


@@ -122,6 +122,10 @@ struct server_routes {
server_http_context::handler_t post_rerank;
server_http_context::handler_t get_lora_adapters;
server_http_context::handler_t post_lora_adapters;
// to be used in router mode
json get_model_info() const;
private:
std::unique_ptr<server_res_generator> handle_completions_impl(
const server_http_req & req,


@@ -44,6 +44,7 @@ extern char **environ;
#define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit"
#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
#define CMD_CHILD_TO_ROUTER_INFO "cmd_child_to_router:info:" // followed by json string
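// example line emitted by a child process (payload purely illustrative):
//   cmd_child_to_router:info:{"id":"my-model","meta":{"n_ctx":4096}}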
// address for the child process; this is needed because the router may run on 0.0.0.0
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
@@ -718,10 +719,11 @@ void server_models::load(const std::string & name) {
// prepare new instance info
instance_t inst;
inst.meta = meta;
inst.meta.port = get_free_port();
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
inst.meta.last_used = ggml_time_ms();
inst.meta = meta;
inst.meta.port = get_free_port();
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
inst.meta.loaded_info = json{};
inst.meta.last_used = ggml_time_ms();
if (inst.meta.port <= 0) {
throw std::runtime_error("failed to get a port number");
@@ -767,12 +769,14 @@ void server_models::load(const std::string & name) {
// read stdout/stderr and forward to main server log
// also handle status report from child process
if (stdout_file) {
char buffer[4096];
char buffer[128 * 1024]; // large buffer for storing info
while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
LOG("[%5d] %s", port, buffer);
std::string str(buffer);
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
this->update_loaded_info(name, str);
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
}
@@ -916,6 +920,29 @@ void server_models::update_status(const std::string & name, server_model_status
cv.notify_all();
}
void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
return;
}
json info;
try {
info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
} catch (const std::exception & e) {
SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
return;
}
std::unique_lock<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
auto & meta = it->second.meta;
meta.loaded_info = info;
}
cv.notify_all();
}
void server_models::wait_until_loading_finished(const std::string & name) {
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [this, &name]() {
@@ -994,12 +1021,14 @@ bool server_models::is_child_server() {
return router_port != nullptr;
}
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler) {
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
// send a notification to the router server that a model instance is ready
common_log_pause(common_log_main());
fflush(stdout);
fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
fflush(stdout);
fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
fflush(stdout);
common_log_resume(common_log_main());
// setup thread for monitoring stdin
@@ -1176,7 +1205,8 @@ void server_models_routes::init_routes() {
status["exit_code"] = meta.exit_code;
status["failed"] = true;
}
models_json.push_back(json {
json model_info = json {
{"id", meta.name},
{"aliases", meta.aliases},
{"tags", meta.tags},
@@ -1185,7 +1215,17 @@ void server_models_routes::init_routes() {
{"created", t}, // for OAI-compat
{"status", status},
// TODO: add other fields, may require reading GGUF metadata
});
};
// merge with loaded_info from the child process if available
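// keys the router already sets (id, status, created, ...) take precedence;
// only keys missing from model_info are copied over from loaded_info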
if (meta.is_running()) {
for (auto it = meta.loaded_info.begin(); it != meta.loaded_info.end(); ++it) {
if (!model_info.contains(it.key())) {
model_info[it.key()] = it.value();
}
}
}
models_json.push_back(model_info);
}
res_ok(res, {
{"data", models_json},


@@ -63,6 +63,7 @@ struct server_model_meta {
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0; // for LRU unloading
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
json loaded_info; // info to be reflected via /v1/models endpoint
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
@@ -145,6 +146,7 @@ public:
// update the status of a model instance (thread-safe)
void update_status(const std::string & name, server_model_status status, int exit_code);
void update_loaded_info(const std::string & name, std::string & raw_info);
// wait until the model instance is fully loaded (thread-safe)
// return when the model is no longer in "loading" state
@@ -163,7 +165,7 @@ public:
// notify the router server that a model instance is ready
// return the monitoring thread (to be joined by the caller)
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);
// notify the router server that the sleeping state has changed
static void notify_router_sleeping_state(bool sleeping);


@@ -334,7 +334,8 @@ int main(int argc, char ** argv) {
// optionally, notify router server that this instance is ready
std::thread monitor_thread;
if (server_models::is_child_server()) {
monitor_thread = server_models::setup_child_server(shutdown_handler);
json model_info = routes.get_model_info();
monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
}
// this call blocks the main thread until queue_tasks.terminate() is called