Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2026-05-17 04:09:19 +00:00
mtmd, server, common: expose modalities to /v1/models (#22952)
* mtmd, server, common: expose modalities to /v1/models
* fix build
* rename to mtmd_caps
This commit is contained in:
parent 927dada6c9
commit 7bfe120c21

10 changed files with 121 additions and 27 deletions
@@ -435,6 +435,25 @@ static bool parse_bool_value(const std::string & value) {
 // CLI argument parsing functions
 //
 
+void common_params_handle_models(common_params & params, llama_example curr_ex) {
+    auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
+    if (params.no_mmproj) {
+        params.mmproj = {};
+    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+        // optionally, handle mmproj model when -hf is specified
+        params.mmproj = res.mmproj;
+    }
+    // only download mmproj if the current example is using it
+    for (const auto & ex : mmproj_examples) {
+        if (curr_ex == ex) {
+            common_params_handle_model(params.mmproj, params.hf_token, params.offline);
+            break;
+        }
+    }
+    common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
+    common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
+}
+
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
     common_params & params = ctx_arg.params;
 

@@ -588,22 +607,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     if (!skip_model_download) {
-        auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
-        if (params.no_mmproj) {
-            params.mmproj = {};
-        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
-            // optionally, handle mmproj model when -hf is specified
-            params.mmproj = res.mmproj;
-        }
-        // only download mmproj if the current example is using it
-        for (const auto & ex : mmproj_examples) {
-            if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj, params.hf_token, params.offline);
-                break;
-            }
-        }
-        common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
-        common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
+        common_params_handle_models(params, ctx_arg.ex);
    }
 
     // model is required (except for server)
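The two hunks above move the -hf model resolution out of common_params_parse_ex() into the standalone common_params_handle_models() helper, so call sites outside the CLI parser (such as the server capability probe later in this commit) can resolve the model and mmproj paths on their own. A minimal sketch of such a caller; the repo name is illustrative and the hf_repo field is assumed from the existing common_params_model struct:

    common_params params;
    params.model.hf_repo = "ggml-org/some-vision-model-GGUF"; // illustrative repo name
    common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
    // params.model.path (and params.mmproj.path, if the repo ships an mmproj)
    // now point at the resolved GGUF files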
@@ -129,5 +129,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);
 
+// Populate model paths (main model, mmproj, etc) from -hf if necessary
+void common_params_handle_models(common_params & params, llama_example curr_ex);
+
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
@@ -163,8 +163,13 @@ void common_preset::merge(const common_preset & other) {
     }
 }
 
-void common_preset::apply_to_params(common_params & params) const {
+void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const {
     for (const auto & [opt, val] : options) {
+        if (!handled_keys.empty()) {
+            if (!opt.env || handled_keys.find(opt.env) == handled_keys.end()) {
+                continue;
+            }
+        }
         // apply each option to params
         if (opt.handler_string) {
             opt.handler_string(params, val);

@@ -43,7 +43,8 @@ struct common_preset {
     void merge(const common_preset & other);
 
     // apply preset options to common_params
-    void apply_to_params(common_params & params) const;
+    // optionally specify handled_keys to only apply a subset of options (identified by their env), if empty, apply all options
+    void apply_to_params(common_params & params, const std::set<std::string> & handled_keys = std::set<std::string>()) const;
 };
 
 // interface for multiple presets in one file
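The new handled_keys argument lets a caller apply only a whitelisted subset of a preset's options, identified by their environment-variable names, while an empty set keeps the old apply-everything behaviour. A sketch, assuming an existing common_preset named preset:

    common_params params;
    // apply only the model-related options, leave the rest at their defaults
    preset.apply_to_params(params, { "LLAMA_ARG_MODEL", "LLAMA_ARG_MMPROJ" });
    // the default (empty set) applies every option, as before
    preset.apply_to_params(params);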
@@ -994,7 +994,7 @@ struct clip_model_loader {
     bool has_audio = false;
 
     // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
-    clip_model_loader(const char * fname) : fname(fname) {
+    clip_model_loader(const char * fname, bool skip_tensors = false) : fname(fname) {
         struct ggml_context * meta = nullptr;
 
         struct gguf_init_params params = {

@@ -1040,7 +1040,7 @@ struct clip_model_loader {
         }
 
         // tensors
-        {
+        if (!skip_tensors) {
             for (int i = 0; i < n_tensors; ++i) {
                 const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
                 const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);

@@ -2927,6 +2927,14 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
     return {ctx_vision, ctx_audio};
 }
 
+struct clip_cap clip_get_cap(const char * fname) {
+    clip_cap res;
+    clip_model_loader loader(fname, /* skip_tensors= */ true);
+    res.has_vision = loader.has_vision;
+    res.has_audio = loader.has_audio;
+    return res;
+}
+
 struct clip_image_size * clip_image_size_init() {
     struct clip_image_size * load_image_size = new struct clip_image_size();
     load_image_size->width = 448;
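Because the loader is constructed with skip_tensors = true, clip_get_cap() reads only the GGUF metadata and skips the tensor section entirely, so probing even a large mmproj file stays cheap. A usage sketch with an illustrative file name:

    clip_cap cap = clip_get_cap("mmproj-model-f16.gguf"); // illustrative path
    if (cap.has_vision) {
        // the projector accepts image input
    }
    if (cap.has_audio) {
        // the projector accepts audio input
    }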
@@ -116,3 +116,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
 bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
+
+struct clip_cap {
+    bool has_vision;
+    bool has_audio;
+};
+struct clip_cap clip_get_cap(const char * fname);
@@ -1423,6 +1423,19 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback_user_data = user_data;
 }
 
+struct mtmd_caps mtmd_get_cap_from_file(const char * fname) {
+    try {
+        auto tmp = clip_get_cap(fname);
+        mtmd_caps cap;
+        cap.inp_audio = tmp.has_audio;
+        cap.inp_vision = tmp.has_vision;
+        return cap;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: failed to get capabilities from file '%s': %s\n", __func__, fname, e.what());
+        return mtmd_caps{ false, false };
+    }
+}
+
 //
 // Debugging API (NOT intended for public use)
 //
@@ -244,6 +244,14 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 // If this is not called, or NULL is supplied, everything is output on stderr.
 MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
 
+// EXPERIMENTAL API to get mmproj's capabilities without initializing the full context
+// This is only intended to be used by llama-server, breaking changes is expected
+struct mtmd_caps {
+    bool inp_vision;
+    bool inp_audio;
+};
+MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
+
 /////////////////////////////////////////
 
 // test function, to be used in test-mtmd-c-api.c
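mtmd_get_cap_from_file() is the public wrapper around clip_get_cap(): it catches loader exceptions and reports { false, false } when the file cannot be parsed. A minimal sketch of a caller, with an illustrative path:

    mtmd_caps caps = mtmd_get_cap_from_file("mmproj-model-f16.gguf"); // illustrative path
    printf("vision: %s, audio: %s\n",                                 // printf from <cstdio>
           caps.inp_vision ? "yes" : "no",
           caps.inp_audio ? "yes" : "no");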
@@ -161,6 +161,30 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
     args = preset.to_args(bin_path);
 }
 
+void server_model_meta::update_caps() {
+    try {
+        common_params params;
+        preset.apply_to_params(params, {
+            "LLAMA_ARG_MODEL",
+            "LLAMA_ARG_MODEL_URL",
+            "LLAMA_ARG_MMPROJ",
+            "LLAMA_ARG_MMPROJ_URL",
+            "LLAMA_ARG_HF_REPO",
+            "LLAMA_ARG_HF_REPO_FILE",
+        });
+        params.offline = true; // avoid any unwanted network call during capability detection
+        common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
+        if (params.mmproj.path.empty()) {
+            multimodal = { false, false };
+        } else {
+            multimodal = mtmd_get_cap_from_file(params.mmproj.path.c_str());
+        }
+    } catch (const std::exception & e) {
+        LOG_WRN("failed to initialize common_params for multimodal capability detection: %s\n", e.what());
+        multimodal = { false, false };
+    }
+}
+
 //
 // server_models
 //
@@ -236,6 +260,7 @@ void server_models::add_model(server_model_meta && meta) {
     }
 
     meta.update_args(ctx_preset, bin_path); // render args
+    meta.update_caps();
     std::string name = meta.name;
     mapping[name] = instance_t{
         /* subproc */ std::make_shared<subprocess_s>(),
@@ -346,8 +371,10 @@ void server_models::load_models() {
             /* status */ SERVER_MODEL_STATUS_UNLOADED,
             /* last_used */ 0,
             /* args */ std::vector<std::string>(),
+            /* loaded_info */ {},
             /* exit_code */ 0,
             /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+            /* multimodal */ mtmd_caps{false, false},
         };
         add_model(std::move(meta));
     }
@@ -481,6 +508,7 @@ void server_models::load_models() {
 
             inst.meta.exit_code = 0; // clear failed state so the model can be reloaded
             inst.meta.update_args(ctx_preset, bin_path);
+            inst.meta.update_caps();
         }
 
         // add models that are new in this reload
@@ -496,8 +524,10 @@ void server_models::load_models() {
             /* status */ SERVER_MODEL_STATUS_UNLOADED,
             /* last_used */ 0,
             /* args */ std::vector<std::string>(),
+            /* loaded_info */ {},
             /* exit_code */ 0,
             /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+            /* multimodal */ mtmd_caps{false, false},
         };
         add_model(std::move(meta));
         newly_added.push_back(name);
@@ -1206,14 +1236,28 @@ void server_models_routes::init_routes() {
             status["failed"] = true;
         }
 
+        // pi coding agent multimodal compatibility
+        json input_modalities = json::array({"text"});
+        if (meta.multimodal.inp_vision) {
+            input_modalities.push_back("image");
+        }
+        if (meta.multimodal.inp_audio) {
+            input_modalities.push_back("audio");
+        }
+        json architecture {
+            {"input_modalities", input_modalities},
+            {"output_modalities", json::array({"text"})},
+        };
+
         json model_info = json {
-            {"id", meta.name},
-            {"aliases", meta.aliases},
-            {"tags", meta.tags},
-            {"object", "model"}, // for OAI-compat
-            {"owned_by", "llamacpp"}, // for OAI-compat
-            {"created", t}, // for OAI-compat
-            {"status", status},
+            {"id", meta.name},
+            {"aliases", meta.aliases},
+            {"tags", meta.tags},
+            {"object", "model"}, // for OAI-compat
+            {"owned_by", "llamacpp"}, // for OAI-compat
+            {"created", t}, // for OAI-compat
+            {"status", status},
+            {"architecture", architecture},
             // TODO: add other fields, may require reading GGUF metadata
         };
 
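With the architecture object in place, each /v1/models entry now advertises its input and output modalities. A sketch of the resulting shape, written in the same nlohmann::json initializer style the server uses (values are illustrative, for a vision-capable model):

    json example_entry = {
        {"id",       "some-vision-model"},
        {"object",   "model"},
        {"owned_by", "llamacpp"},
        {"architecture", {
            {"input_modalities",  {"text", "image"}},
            {"output_modalities", {"text"}},
        }},
    };

Clients that understand this format (the comment in the hunk above mentions the pi coding agent) can check input_modalities and decide whether to attach image or audio parts before issuing a request.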
@@ -66,6 +66,7 @@ struct server_model_meta {
     json loaded_info; // info to be reflected via /v1/models endpoint
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
+    mtmd_caps multimodal; // multimodal capabilities
 
     bool is_ready() const {
         return status == SERVER_MODEL_STATUS_LOADED;

@@ -80,6 +81,7 @@ struct server_model_meta {
     }
 
     void update_args(common_preset_context & ctx_presets, std::string bin_path);
+    void update_caps();
 };
 
 struct subprocess_s;