mtmd, server, common: expose modalities to /v1/models (#22952)

* mtmd, server, common: expose modalities to /v1/models

* fix build

* rename to mtmd_caps
Author: Xuan-Son Nguyen
Date:   2026-05-12 19:08:07 +02:00 (committed by GitHub)
Parent: 927dada6c9
Commit: 7bfe120c21
10 changed files with 121 additions and 27 deletions


@@ -435,6 +435,25 @@ static bool parse_bool_value(const std::string & value) {
// CLI argument parsing functions
//
void common_params_handle_models(common_params & params, llama_example curr_ex) {
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
// optionally, handle mmproj model when -hf is specified
params.mmproj = res.mmproj;
}
// only download mmproj if the current example is using it
for (const auto & ex : mmproj_examples) {
if (curr_ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, params.offline);
break;
}
}
common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
}
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
common_params & params = ctx_arg.params;
@@ -588,22 +607,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// handle model and download
if (!skip_model_download) {
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
// optionally, handle mmproj model when -hf is specified
params.mmproj = res.mmproj;
}
// only download mmproj if the current example is using it
for (const auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, params.offline);
break;
}
}
common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
common_params_handle_models(params, ctx_arg.ex);
}
// model is required (except for server)


@@ -129,5 +129,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
// see: https://github.com/ggml-org/llama.cpp/issues/18163
void common_params_add_preset_options(std::vector<common_arg> & args);
// Populate model paths (main model, mmproj, etc) from -hf if necessary
void common_params_handle_models(common_params & params, llama_example curr_ex);
// initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);


@@ -163,8 +163,13 @@ void common_preset::merge(const common_preset & other) {
}
}
void common_preset::apply_to_params(common_params & params) const {
void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const {
for (const auto & [opt, val] : options) {
if (!handled_keys.empty()) {
if (!opt.env || handled_keys.find(opt.env) == handled_keys.end()) {
continue;
}
}
// apply each option to params
if (opt.handler_string) {
opt.handler_string(params, val);


@@ -43,7 +43,8 @@ struct common_preset {
void merge(const common_preset & other);
// apply preset options to common_params
void apply_to_params(common_params & params) const;
// optionally specify handled_keys to apply only a subset of options (identified by their env var); if empty, all options are applied
void apply_to_params(common_params & params, const std::set<std::string> & handled_keys = std::set<std::string>()) const;
};
// interface for multiple presets in one file
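A quick usage note for the new optional argument: handled_keys filters which preset options are applied by matching their env-variable names, while an empty set keeps the old apply-everything behaviour. A minimal sketch, assuming a common_preset named preset has already been loaded:

    common_params params;
    // apply only the model-path options; everything else stays at defaults
    preset.apply_to_params(params, { "LLAMA_ARG_MODEL", "LLAMA_ARG_MMPROJ" });
    // no filter: every option stored in the preset is applied (previous behaviour)
    preset.apply_to_params(params);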


@@ -994,7 +994,7 @@ struct clip_model_loader {
bool has_audio = false;
// TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
clip_model_loader(const char * fname) : fname(fname) {
clip_model_loader(const char * fname, bool skip_tensors = false) : fname(fname) {
struct ggml_context * meta = nullptr;
struct gguf_init_params params = {
@@ -1040,7 +1040,7 @@ struct clip_model_loader {
}
// tensors
{
if (!skip_tensors) {
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
@@ -2927,6 +2927,14 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
return {ctx_vision, ctx_audio};
}
struct clip_cap clip_get_cap(const char * fname) {
clip_cap res;
clip_model_loader loader(fname, /* skip_tensors= */ true);
res.has_vision = loader.has_vision;
res.has_audio = loader.has_audio;
return res;
}
struct clip_image_size * clip_image_size_init() {
struct clip_image_size * load_image_size = new struct clip_image_size();
load_image_size->width = 448;


@@ -116,3 +116,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
struct clip_cap {
bool has_vision;
bool has_audio;
};
struct clip_cap clip_get_cap(const char * fname);
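A minimal sketch of the intended call pattern (file path illustrative): the loader constructor can throw on an unreadable or non-mmproj file, which the mtmd wrapper below accounts for:

    try {
        clip_cap cap = clip_get_cap("models/mmproj-model-f16.gguf");
        LOG_INF("mmproj capabilities: vision=%d audio=%d\n", (int) cap.has_vision, (int) cap.has_audio);
    } catch (const std::exception & e) {
        LOG_ERR("not a usable mmproj file: %s\n", e.what());
    }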


@@ -1423,6 +1423,19 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
g_logger_state.log_callback_user_data = user_data;
}
struct mtmd_caps mtmd_get_cap_from_file(const char * fname) {
try {
auto tmp = clip_get_cap(fname);
mtmd_caps cap;
cap.inp_audio = tmp.has_audio;
cap.inp_vision = tmp.has_vision;
return cap;
} catch (const std::exception & e) {
LOG_ERR("%s: failed to get capabilities from file '%s': %s\n", __func__, fname, e.what());
return mtmd_caps{ false, false };
}
}
//
// Debugging API (NOT intended for public use)
//


@@ -244,6 +244,14 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
// If this is not called, or NULL is supplied, everything is output on stderr.
MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
// EXPERIMENTAL API to get mmproj's capabilities without initializing the full context
// This is only intended to be used by llama-server; breaking changes are expected
struct mtmd_caps {
bool inp_vision;
bool inp_audio;
};
MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
/////////////////////////////////////////
// test function, to be used in test-mtmd-c-api.c
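A short usage sketch for the new experimental call (path illustrative); note that on a missing or corrupt file it returns both flags as false rather than throwing, per the implementation above:

    mtmd_caps caps = mtmd_get_cap_from_file("models/mmproj-model-f16.gguf");
    if (caps.inp_vision) { /* advertise "image" as an input modality */ }
    if (caps.inp_audio)  { /* advertise "audio" as an input modality */ }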


@@ -161,6 +161,30 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::string
args = preset.to_args(bin_path);
}
void server_model_meta::update_caps() {
try {
common_params params;
preset.apply_to_params(params, {
"LLAMA_ARG_MODEL",
"LLAMA_ARG_MODEL_URL",
"LLAMA_ARG_MMPROJ",
"LLAMA_ARG_MMPROJ_URL",
"LLAMA_ARG_HF_REPO",
"LLAMA_ARG_HF_REPO_FILE",
});
params.offline = true; // avoid any unwanted network call during capability detection
common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
if (params.mmproj.path.empty()) {
multimodal = { false, false };
} else {
multimodal = mtmd_get_cap_from_file(params.mmproj.path.c_str());
}
} catch (const std::exception & e) {
LOG_WRN("failed to initialize common_params for multimodal capability detection: %s\n", e.what());
multimodal = { false, false };
}
}
//
// server_models
//
@@ -236,6 +260,7 @@ void server_models::add_model(server_model_meta && meta) {
}
meta.update_args(ctx_preset, bin_path); // render args
meta.update_caps();
std::string name = meta.name;
mapping[name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
@@ -346,8 +371,10 @@ void server_models::load_models() {
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* loaded_info */ {},
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
/* multimodal */ mtmd_caps{false, false},
};
add_model(std::move(meta));
}
@@ -481,6 +508,7 @@ void server_models::load_models() {
inst.meta.exit_code = 0; // clear failed state so the model can be reloaded
inst.meta.update_args(ctx_preset, bin_path);
inst.meta.update_caps();
}
// add models that are new in this reload
@@ -496,8 +524,10 @@ void server_models::load_models() {
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* loaded_info */ {},
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
/* multimodal */ mtmd_caps{false, false},
};
add_model(std::move(meta));
newly_added.push_back(name);
@@ -1206,14 +1236,28 @@ void server_models_routes::init_routes() {
status["failed"] = true;
}
// pi coding agent multimodal compatibility
json input_modalities = json::array({"text"});
if (meta.multimodal.inp_vision) {
input_modalities.push_back("image");
}
if (meta.multimodal.inp_audio) {
input_modalities.push_back("audio");
}
json architecture {
{"input_modalities", input_modalities},
{"output_modalities", json::array({"text"})},
};
json model_info = json {
{"id", meta.name},
{"aliases", meta.aliases},
{"tags", meta.tags},
{"object", "model"}, // for OAI-compat
{"owned_by", "llamacpp"}, // for OAI-compat
{"created", t}, // for OAI-compat
{"status", status},
{"id", meta.name},
{"aliases", meta.aliases},
{"tags", meta.tags},
{"object", "model"}, // for OAI-compat
{"owned_by", "llamacpp"}, // for OAI-compat
{"created", t}, // for OAI-compat
{"status", status},
{"architecture", architecture},
// TODO: add other fields, may require reading GGUF metadata
};
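For reference, an entry returned by /v1/models for a vision-capable model now carries the architecture block roughly as follows (model id, aliases, tags and timestamp are illustrative; the status object is omitted here):

    {
      "id": "my-vision-model",
      "aliases": [],
      "tags": [],
      "object": "model",
      "owned_by": "llamacpp",
      "created": 1747065600,
      "architecture": {
        "input_modalities": ["text", "image"],
        "output_modalities": ["text"]
      }
    }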


@@ -66,6 +66,7 @@ struct server_model_meta {
json loaded_info; // info to be reflected via /v1/models endpoint
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
mtmd_caps multimodal; // multimodal capabilities
bool is_ready() const {
return status == SERVER_MODEL_STATUS_LOADED;
@@ -80,6 +81,7 @@ struct server_model_meta {
}
void update_args(common_preset_context & ctx_presets, std::string bin_path);
void update_caps();
};
struct subprocess_s;