mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-17 04:09:19 +00:00
server: (router) expose child model info from router's /v1/models (#22683)
* server: (router) expose child model info from router's /v1/models * update docs
This commit is contained in:
parent
58e68df0f9
commit
9dcf835528
6 changed files with 79 additions and 26 deletions
|
|
@ -1651,6 +1651,7 @@ Note:
|
|||
2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
|
||||
- If a model is running but updated or removed from the source, it will be unloaded
|
||||
- If a model is not running, it will be added or updated according to the source
|
||||
3. When a model is loaded, the info from the model instance's own `/v1/models` is forwarded to the router's `/v1/models`. This includes metadata about the model and the runtime instance.
|
||||
|
||||
The `status` object can be:
|
||||
|
||||
|
|
|
|||
|
|
@ -3926,22 +3926,7 @@ void server_routes::init_routes() {
|
|||
}},
|
||||
{"object", "list"},
|
||||
{"data", {
|
||||
{
|
||||
{"id", meta->model_name},
|
||||
{"aliases", meta->model_aliases},
|
||||
{"tags", meta->model_tags},
|
||||
{"object", "model"},
|
||||
{"created", std::time(0)},
|
||||
{"owned_by", "llamacpp"},
|
||||
{"meta", {
|
||||
{"vocab_type", meta->model_vocab_type},
|
||||
{"n_vocab", meta->model_vocab_n_tokens},
|
||||
{"n_ctx_train", meta->model_n_ctx_train},
|
||||
{"n_embd", meta->model_n_embd_inp},
|
||||
{"n_params", meta->model_n_params},
|
||||
{"size", meta->model_size},
|
||||
}},
|
||||
},
|
||||
get_model_info(),
|
||||
}}
|
||||
};
|
||||
|
||||
|
|
@ -4155,6 +4140,26 @@ void server_routes::init_routes() {
|
|||
};
|
||||
}
|
||||
|
||||
// Assemble the JSON description of the model held by this server instance.
//
// The returned object carries the identity fields (id, aliases, tags, object,
// created, owned_by) followed by a nested "meta" object with the vocabulary,
// context and size figures read from `meta`. "created" is the time of the call.
json server_routes::get_model_info() const {
    // numeric / structural details of the model, nested under "meta"
    json model_details = json::object();
    model_details["vocab_type"]  = meta->model_vocab_type;
    model_details["n_vocab"]     = meta->model_vocab_n_tokens;
    model_details["n_ctx"]       = meta->slot_n_ctx;
    model_details["n_ctx_train"] = meta->model_n_ctx_train;
    model_details["n_embd"]      = meta->model_n_embd_inp;
    model_details["n_params"]    = meta->model_n_params;
    model_details["size"]        = meta->model_size;

    // top-level entry; keys are added in the same order as they are emitted
    json model_entry = json::object();
    model_entry["id"]       = meta->model_name;
    model_entry["aliases"]  = meta->model_aliases;
    model_entry["tags"]     = meta->model_tags;
    model_entry["object"]   = "model";
    model_entry["created"]  = std::time(0);
    model_entry["owned_by"] = "llamacpp";
    model_entry["meta"]     = model_details;

    return model_entry;
}
|
||||
|
||||
std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const server_http_req & req, int id_slot) {
|
||||
auto res = create_response();
|
||||
const json request_data = json::parse(req.body);
|
||||
|
|
|
|||
|
|
@ -122,6 +122,10 @@ struct server_routes {
|
|||
server_http_context::handler_t post_rerank;
|
||||
server_http_context::handler_t get_lora_adapters;
|
||||
server_http_context::handler_t post_lora_adapters;
|
||||
|
||||
// to be used in router mode
|
||||
json get_model_info() const;
|
||||
|
||||
private:
|
||||
std::unique_ptr<server_res_generator> handle_completions_impl(
|
||||
const server_http_req & req,
|
||||
|
|
|
|||
|
|
@ -44,6 +44,7 @@ extern char **environ;
|
|||
#define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit"
|
||||
#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
|
||||
#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
|
||||
#define CMD_CHILD_TO_ROUTER_INFO "cmd_child_to_router:info:" // followed by json string
|
||||
|
||||
// address for child process, this is needed because router may run on 0.0.0.0
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
|
||||
|
|
@ -718,10 +719,11 @@ void server_models::load(const std::string & name) {
|
|||
|
||||
// prepare new instance info
|
||||
instance_t inst;
|
||||
inst.meta = meta;
|
||||
inst.meta.port = get_free_port();
|
||||
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
|
||||
inst.meta.last_used = ggml_time_ms();
|
||||
inst.meta = meta;
|
||||
inst.meta.port = get_free_port();
|
||||
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
|
||||
inst.meta.loaded_info = json{};
|
||||
inst.meta.last_used = ggml_time_ms();
|
||||
|
||||
if (inst.meta.port <= 0) {
|
||||
throw std::runtime_error("failed to get a port number");
|
||||
|
|
@ -767,12 +769,14 @@ void server_models::load(const std::string & name) {
|
|||
// read stdout/stderr and forward to main server log
|
||||
// also handle status report from child process
|
||||
if (stdout_file) {
|
||||
char buffer[4096];
|
||||
char buffer[128 * 1024]; // large buffer for storing info
|
||||
while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
|
||||
LOG("[%5d] %s", port, buffer);
|
||||
std::string str(buffer);
|
||||
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
|
||||
this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
|
||||
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
|
||||
this->update_loaded_info(name, str);
|
||||
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
|
||||
this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
|
||||
}
|
||||
|
|
@ -916,6 +920,29 @@ void server_models::update_status(const std::string & name, server_model_status
|
|||
cv.notify_all();
|
||||
}
|
||||
|
||||
void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
|
||||
if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
|
||||
SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
json info;
|
||||
try {
|
||||
info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
|
||||
} catch (const std::exception & e) {
|
||||
SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
|
||||
return;
|
||||
}
|
||||
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
auto it = mapping.find(name);
|
||||
if (it != mapping.end()) {
|
||||
auto & meta = it->second.meta;
|
||||
meta.loaded_info = info;
|
||||
}
|
||||
cv.notify_all();
|
||||
}
|
||||
|
||||
void server_models::wait_until_loading_finished(const std::string & name) {
|
||||
std::unique_lock<std::mutex> lk(mutex);
|
||||
cv.wait(lk, [this, &name]() {
|
||||
|
|
@ -994,12 +1021,14 @@ bool server_models::is_child_server() {
|
|||
return router_port != nullptr;
|
||||
}
|
||||
|
||||
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler) {
|
||||
std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
|
||||
// send a notification to the router server that a model instance is ready
|
||||
common_log_pause(common_log_main());
|
||||
fflush(stdout);
|
||||
fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
|
||||
fflush(stdout);
|
||||
fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
|
||||
fflush(stdout);
|
||||
common_log_resume(common_log_main());
|
||||
|
||||
// setup thread for monitoring stdin
|
||||
|
|
@ -1176,7 +1205,8 @@ void server_models_routes::init_routes() {
|
|||
status["exit_code"] = meta.exit_code;
|
||||
status["failed"] = true;
|
||||
}
|
||||
models_json.push_back(json {
|
||||
|
||||
json model_info = json {
|
||||
{"id", meta.name},
|
||||
{"aliases", meta.aliases},
|
||||
{"tags", meta.tags},
|
||||
|
|
@ -1185,7 +1215,17 @@ void server_models_routes::init_routes() {
|
|||
{"created", t}, // for OAI-compat
|
||||
{"status", status},
|
||||
// TODO: add other fields, may require reading GGUF metadata
|
||||
});
|
||||
};
|
||||
|
||||
// merge with loaded_info from the child process if available
|
||||
if (meta.is_running()) {
|
||||
for (auto it = meta.loaded_info.begin(); it != meta.loaded_info.end(); ++it) {
|
||||
if (!model_info.contains(it.key())) {
|
||||
model_info[it.key()] = it.value();
|
||||
}
|
||||
}
|
||||
}
|
||||
models_json.push_back(model_info);
|
||||
}
|
||||
res_ok(res, {
|
||||
{"data", models_json},
|
||||
|
|
|
|||
|
|
@ -63,6 +63,7 @@ struct server_model_meta {
|
|||
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
||||
int64_t last_used = 0; // for LRU unloading
|
||||
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
|
||||
json loaded_info; // info to be reflected via /v1/models endpoint
|
||||
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
|
||||
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
|
||||
|
||||
|
|
@ -145,6 +146,7 @@ public:
|
|||
|
||||
// update the status of a model instance (thread-safe)
|
||||
void update_status(const std::string & name, server_model_status status, int exit_code);
|
||||
void update_loaded_info(const std::string & name, std::string & raw_info);
|
||||
|
||||
// wait until the model instance is fully loaded (thread-safe)
|
||||
// returns when the model is no longer in the "loading" state
|
||||
|
|
@ -163,7 +165,7 @@ public:
|
|||
|
||||
// notify the router server that a model instance is ready
|
||||
// return the monitoring thread (to be joined by the caller)
|
||||
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
|
||||
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);
|
||||
|
||||
// notify the router server that the sleeping state has changed
|
||||
static void notify_router_sleeping_state(bool sleeping);
|
||||
|
|
|
|||
|
|
@ -334,7 +334,8 @@ int main(int argc, char ** argv) {
|
|||
// optionally, notify router server that this instance is ready
|
||||
std::thread monitor_thread;
|
||||
if (server_models::is_child_server()) {
|
||||
monitor_thread = server_models::setup_child_server(shutdown_handler);
|
||||
json model_info = routes.get_model_info();
|
||||
monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
|
||||
}
|
||||
|
||||
// this call blocks the main thread until queue_tasks.terminate() is called
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue