diff --git a/app/llama.cpp b/app/llama.cpp index e149975d2..b0b86fd47 100644 --- a/app/llama.cpp +++ b/app/llama.cpp @@ -1,6 +1,7 @@ #include "build-info.h" #include +#include #include #include @@ -77,6 +78,14 @@ int main(int argc, char ** argv) { for (const auto & cmd : cmds) { if (matches(arg, cmd)) { + + // The router spawns its children through this same binary, so export the matched + // subcommand in LLAMA_APP_CMD; the child can then be relaunched as 'llama serve' and not with bare options. +#ifdef _WIN32 + _putenv_s("LLAMA_APP_CMD", cmd.name); +#else + setenv("LLAMA_APP_CMD", cmd.name, 1); +#endif return cmd.func(argc - 1, argv + 1); } } diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index ccf42320f..47b6c2a4e 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -159,6 +160,13 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str // TODO: maybe validate preset before rendering ? // render args args = preset.to_args(bin_path); + + // The unified binary dispatches by subcommand: re-insert the LLAMA_APP_CMD value right after the + // binary path so the child starts as 'llama serve ...' not 'llama ...'. NOTE(review): assumes args[0] is bin_path (args non-empty) - confirm. + const char * app_cmd = std::getenv("LLAMA_APP_CMD"); + if (app_cmd != nullptr && app_cmd[0] != '\0' && !bin_path.empty()) { + args.insert(args.begin() + 1, app_cmd); + } } void server_model_meta::update_caps() {