mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-22 19:47:49 +00:00
server: skip device enumeration in router mode to avoid creating CUDA primary context (#23137)
This commit is contained in:
parent
6049906133
commit
64b38b561b
3 changed files with 16 additions and 10 deletions
|
|
@ -373,7 +373,7 @@ void common_init() {
|
|||
llama_log_set(common_log_default_callback, NULL);
|
||||
}
|
||||
|
||||
void common_params_print_info(const common_params & params) {
|
||||
void common_params_print_info(const common_params & params, bool print_devices) {
|
||||
#ifdef NDEBUG
|
||||
const char * build_type = "";
|
||||
#else
|
||||
|
|
@ -382,12 +382,16 @@ void common_params_print_info(const common_params & params) {
|
|||
LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
|
||||
|
||||
LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
|
||||
LOG_INF("device_info:\n");
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
|
||||
|
||||
// device enumeration creates a primary context on CUDA backends, skip it when the caller does not own any device
|
||||
if (print_devices) {
|
||||
LOG_INF("device_info:\n");
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
|
||||
}
|
||||
}
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -708,7 +708,7 @@ struct common_params {
|
|||
// initializes the logging system and prints info about the build
|
||||
void common_init();
|
||||
|
||||
void common_params_print_info(const common_params & params);
|
||||
void common_params_print_info(const common_params & params, bool print_devices = true);
|
||||
std::string common_params_get_system_info(const common_params & params);
|
||||
|
||||
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
|
|
|
|||
|
|
@ -86,7 +86,10 @@ int main(int argc, char ** argv) {
|
|||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
common_params_print_info(params);
|
||||
// router server never loads a model and must not touch the GPU
|
||||
// skip device enumeration so the CUDA primary context stays uncreated
|
||||
const bool is_router_server = params.model.path.empty();
|
||||
common_params_print_info(params, !is_router_server);
|
||||
|
||||
// validate batch size for embeddings
|
||||
// embeddings require all tokens to be processed in a single ubatch
|
||||
|
|
@ -126,7 +129,6 @@ int main(int argc, char ** argv) {
|
|||
server_routes routes(params, ctx_server);
|
||||
server_tools tools;
|
||||
|
||||
bool is_router_server = params.model.path.empty();
|
||||
std::optional<server_models_routes> models_routes{};
|
||||
if (is_router_server) {
|
||||
// setup server instances manager
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue