server: (router) expose child model info from router's /v1/models (#22683)

* server: (router) expose child model info from router's /v1/models

* update docs
Xuan-Son Nguyen 2026-05-08 14:42:15 +02:00 committed by GitHub
parent 58e68df0f9
commit 9dcf835528
6 changed files with 79 additions and 26 deletions


@@ -1651,6 +1651,7 @@ Note:
2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
- If a model is running but updated or removed from the source, it will be unloaded
- If a model is not running, it will be added or updated according to the source
3. When a model is loaded, the info from its `/v1/models` endpoint is forwarded to the router's `/v1/models`. This includes metadata about the model and the runtime instance.
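   For illustration only (all values below are hypothetical, and the `status` object is left empty here since it is described below), a merged entry for a running model could look roughly like this:

   ```json
   {
     "id": "my-model",
     "aliases": [],
     "tags": [],
     "object": "model",
     "owned_by": "llamacpp",
     "created": 1746700000,
     "status": {},
     "meta": {
       "vocab_type": 2,
       "n_vocab": 151936,
       "n_ctx": 4096,
       "n_ctx_train": 32768,
       "n_embd": 3584,
       "n_params": 7615616512,
       "size": 4920000000
     }
   }
   ```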
The `status` object can be:


@@ -3926,22 +3926,7 @@ void server_routes::init_routes() {
}},
{"object", "list"},
{"data", {
{
{"id", meta->model_name},
{"aliases", meta->model_aliases},
{"tags", meta->model_tags},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
{"meta", {
{"vocab_type", meta->model_vocab_type},
{"n_vocab", meta->model_vocab_n_tokens},
{"n_ctx_train", meta->model_n_ctx_train},
{"n_embd", meta->model_n_embd_inp},
{"n_params", meta->model_n_params},
{"size", meta->model_size},
}},
},
get_model_info(),
}}
};
@@ -4155,6 +4140,26 @@ void server_routes::init_routes() {
};
}
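// build the per-model info JSON: used by the /v1/models handler above, and in
// router mode passed to server_models::setup_child_server() so the router can expose it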
json server_routes::get_model_info() const {
return json {
{"id", meta->model_name},
{"aliases", meta->model_aliases},
{"tags", meta->model_tags},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
{"meta", {
{"vocab_type", meta->model_vocab_type},
{"n_vocab", meta->model_vocab_n_tokens},
{"n_ctx", meta->slot_n_ctx},
{"n_ctx_train", meta->model_n_ctx_train},
{"n_embd", meta->model_n_embd_inp},
{"n_params", meta->model_n_params},
{"size", meta->model_size},
}},
};
}
std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const server_http_req & req, int id_slot) {
auto res = create_response();
const json request_data = json::parse(req.body);


@@ -122,6 +122,10 @@ struct server_routes {
server_http_context::handler_t post_rerank;
server_http_context::handler_t get_lora_adapters;
server_http_context::handler_t post_lora_adapters;
// to be used in router mode
json get_model_info() const;
private:
std::unique_ptr<server_res_generator> handle_completions_impl(
const server_http_req & req,


@@ -44,6 +44,7 @@ extern char **environ;
#define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit"
#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
#define CMD_CHILD_TO_ROUTER_INFO "cmd_child_to_router:info:" // followed by json string
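// example line emitted by a child process (payload purely illustrative):
//   cmd_child_to_router:info:{"id":"my-model","meta":{"n_ctx":4096}}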
// address for the child process; this is needed because the router may run on 0.0.0.0
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
@@ -718,10 +719,11 @@ void server_models::load(const std::string & name) {
// prepare new instance info
instance_t inst;
inst.meta = meta;
inst.meta.port = get_free_port();
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
inst.meta.last_used = ggml_time_ms();
inst.meta = meta;
inst.meta.port = get_free_port();
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
inst.meta.loaded_info = json{};
inst.meta.last_used = ggml_time_ms();
if (inst.meta.port <= 0) {
throw std::runtime_error("failed to get a port number");
@@ -767,12 +769,14 @@ void server_models::load(const std::string & name) {
// read stdout/stderr and forward to main server log
// also handle status report from child process
if (stdout_file) {
char buffer[4096];
char buffer[128 * 1024]; // large buffer for storing info
while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
LOG("[%5d] %s", port, buffer);
std::string str(buffer);
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
this->update_loaded_info(name, str);
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
}
@@ -916,6 +920,29 @@ void server_models::update_status(const std::string & name, server_model_status
cv.notify_all();
}
void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
return;
}
json info;
try {
info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
} catch (const std::exception & e) {
SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
return;
}
std::unique_lock<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
auto & meta = it->second.meta;
meta.loaded_info = info;
}
cv.notify_all();
}
void server_models::wait_until_loading_finished(const std::string & name) {
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [this, &name]() {
@@ -994,12 +1021,14 @@ bool server_models::is_child_server() {
return router_port != nullptr;
}
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler) {
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
// send a notification to the router server that a model instance is ready
common_log_pause(common_log_main());
fflush(stdout);
fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
fflush(stdout);
fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
fflush(stdout);
common_log_resume(common_log_main());
// setup thread for monitoring stdin
@@ -1176,7 +1205,8 @@ void server_models_routes::init_routes() {
status["exit_code"] = meta.exit_code;
status["failed"] = true;
}
models_json.push_back(json {
json model_info = json {
{"id", meta.name},
{"aliases", meta.aliases},
{"tags", meta.tags},
@@ -1185,7 +1215,17 @@ void server_models_routes::init_routes() {
{"created", t}, // for OAI-compat
{"status", status},
// TODO: add other fields, may require reading GGUF metadata
});
};
// merge with loaded_info from the child process if available
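// keys the router already sets (id, status, created, ...) take precedence;
// only keys missing from model_info are copied over from loaded_info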
if (meta.is_running()) {
for (auto it = meta.loaded_info.begin(); it != meta.loaded_info.end(); ++it) {
if (!model_info.contains(it.key())) {
model_info[it.key()] = it.value();
}
}
}
models_json.push_back(model_info);
}
res_ok(res, {
{"data", models_json},


@@ -63,6 +63,7 @@ struct server_model_meta {
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0; // for LRU unloading
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
json loaded_info; // info to be reflected via /v1/models endpoint
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
@@ -145,6 +146,7 @@ public:
// update the status of a model instance (thread-safe)
void update_status(const std::string & name, server_model_status status, int exit_code);
void update_loaded_info(const std::string & name, std::string & raw_info);
// wait until the model instance is fully loaded (thread-safe)
// return when the model is no longer in "loading" state
@@ -163,7 +165,7 @@ public:
// notify the router server that a model instance is ready
// return the monitoring thread (to be joined by the caller)
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);
// notify the router server that the sleeping state has changed
static void notify_router_sleeping_state(bool sleeping);


@@ -334,7 +334,8 @@ int main(int argc, char ** argv) {
// optionally, notify router server that this instance is ready
std::thread monitor_thread;
if (server_models::is_child_server()) {
monitor_thread = server_models::setup_child_server(shutdown_handler);
json model_info = routes.get_model_info();
monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
}
// this call blocks the main thread until queue_tasks.terminate() is called