diff --git a/common/arg.cpp b/common/arg.cpp index 9fefe411e..07ba71935 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -435,6 +435,25 @@ static bool parse_bool_value(const std::string & value) { // CLI argument parsing functions // +void common_params_handle_models(common_params & params, llama_example curr_ex) { + auto res = common_params_handle_model(params.model, params.hf_token, params.offline); + if (params.no_mmproj) { + params.mmproj = {}; + } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { + // optionally, handle mmproj model when -hf is specified + params.mmproj = res.mmproj; + } + // only download mmproj if the current example is using it + for (const auto & ex : mmproj_examples) { + if (curr_ex == ex) { + common_params_handle_model(params.mmproj, params.hf_token, params.offline); + break; + } + } + common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline); + common_params_handle_model(params.vocoder.model, params.hf_token, params.offline); +} + static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { common_params & params = ctx_arg.params; @@ -588,22 +607,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // handle model and download if (!skip_model_download) { - auto res = common_params_handle_model(params.model, params.hf_token, params.offline); - if (params.no_mmproj) { - params.mmproj = {}; - } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { - // optionally, handle mmproj model when -hf is specified - params.mmproj = res.mmproj; - } - // only download mmproj if the current example is using it - for (const auto & ex : mmproj_examples) { - if (ctx_arg.ex == ex) { - common_params_handle_model(params.mmproj, params.hf_token, params.offline); - break; - } - } - common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline); - 
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline); + common_params_handle_models(params, ctx_arg.ex); } // model is required (except for server) diff --git a/common/arg.h b/common/arg.h index 2c2a4e38a..2a85f09f3 100644 --- a/common/arg.h +++ b/common/arg.h @@ -129,5 +129,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<std::string, std::string> & args); +// Populate model paths (main model, mmproj, etc) from -hf if necessary +void common_params_handle_models(common_params & params, llama_example curr_ex); + // initialize argument parser context - used by test-arg-parser and preset common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/preset.cpp b/common/preset.cpp index 9187a67f0..51ea984d8 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -163,8 +163,13 @@ void common_preset::merge(const common_preset & other) { } } -void common_preset::apply_to_params(common_params & params) const { +void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const { for (const auto & [opt, val] : options) { + if (!handled_keys.empty()) { + if (!opt.env || handled_keys.find(opt.env) == handled_keys.end()) { + continue; + } + } // apply each option to params if (opt.handler_string) { opt.handler_string(params, val); diff --git a/common/preset.h b/common/preset.h index 11ba6ef81..06f829c3e 100644 --- a/common/preset.h +++ b/common/preset.h @@ -43,7 +43,8 @@ struct common_preset { void merge(const common_preset & other); // apply preset options to common_params - void apply_to_params(common_params & params) const; + // optionally specify handled_keys to only apply a subset of options (identified by their env), if empty, apply all options + void apply_to_params(common_params & params, const std::set<std::string> & handled_keys = std::set<std::string>()) const; }; // interface for multiple presets in one file diff --git
a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f0c63d375..2e0cfa61f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -994,7 +994,7 @@ struct clip_model_loader { bool has_audio = false; // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model - clip_model_loader(const char * fname) : fname(fname) { + clip_model_loader(const char * fname, bool skip_tensors = false) : fname(fname) { struct ggml_context * meta = nullptr; struct gguf_init_params params = { @@ -1040,7 +1040,7 @@ struct clip_model_loader { } // tensors - { + if (!skip_tensors) { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i); @@ -2927,6 +2927,14 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params return {ctx_vision, ctx_audio}; } +struct clip_cap clip_get_cap(const char * fname) { + clip_cap res; + clip_model_loader loader(fname, /* skip_tensors= */ true); + res.has_vision = loader.has_vision; + res.has_audio = loader.has_audio; + return res; +} + struct clip_image_size * clip_image_size_init() { struct clip_image_size * load_image_size = new struct clip_image_size(); load_image_size->width = 448; diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index a859b3865..5f9dc93c6 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -116,3 +116,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); bool clip_has_whisper_encoder(const struct clip_ctx * ctx); + +struct clip_cap { + bool has_vision; + bool has_audio; +}; +struct clip_cap clip_get_cap(const char * fname); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 22092f6a6..1ab8a4c04 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1423,6 +1423,19 @@ void mtmd_log_set(ggml_log_callback log_callback, void 
* user_data) { g_logger_state.log_callback_user_data = user_data; } +struct mtmd_caps mtmd_get_cap_from_file(const char * fname) { + try { + auto tmp = clip_get_cap(fname); + mtmd_caps cap; + cap.inp_audio = tmp.has_audio; + cap.inp_vision = tmp.has_vision; + return cap; + } catch (const std::exception & e) { + LOG_ERR("%s: failed to get capabilities from file '%s': %s\n", __func__, fname, e.what()); + return mtmd_caps{ false, false }; + } +} + // // Debugging API (NOT intended for public use) // diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index e364174b8..54b9515a3 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -244,6 +244,14 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); // If this is not called, or NULL is supplied, everything is output on stderr. MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); +// EXPERIMENTAL API to get mmproj's capabilities without initializing the full context +// This is only intended to be used by llama-server, breaking changes is expected +struct mtmd_caps { + bool inp_vision; + bool inp_audio; +}; +MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname); + ///////////////////////////////////////// // test function, to be used in test-mtmd-c-api.c diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index e60e1abc3..698489a11 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -161,6 +161,30 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str args = preset.to_args(bin_path); } +void server_model_meta::update_caps() { + try { + common_params params; + preset.apply_to_params(params, { + "LLAMA_ARG_MODEL", + "LLAMA_ARG_MODEL_URL", + "LLAMA_ARG_MMPROJ", + "LLAMA_ARG_MMPROJ_URL", + "LLAMA_ARG_HF_REPO", + "LLAMA_ARG_HF_REPO_FILE", + }); + params.offline = true; // avoid any unwanted network call during capability detection + common_params_handle_models(params, 
LLAMA_EXAMPLE_SERVER); + if (params.mmproj.path.empty()) { + multimodal = { false, false }; + } else { + multimodal = mtmd_get_cap_from_file(params.mmproj.path.c_str()); + } + } catch (const std::exception & e) { + LOG_WRN("failed to initialize common_params for multimodal capability detection: %s\n", e.what()); + multimodal = { false, false }; + } +} + // // server_models // @@ -236,6 +260,7 @@ void server_models::add_model(server_model_meta && meta) { } meta.update_args(ctx_preset, bin_path); // render args + meta.update_caps(); std::string name = meta.name; mapping[name] = instance_t{ /* subproc */ std::make_shared<subprocess_s>(), @@ -346,8 +371,10 @@ void server_models::load_models() { /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, /* args */ std::vector<std::string>(), + /* loaded_info */ {}, /* exit_code */ 0, /* stop_timeout */ DEFAULT_STOP_TIMEOUT, + /* multimodal */ mtmd_caps{false, false}, }; add_model(std::move(meta)); } @@ -481,6 +508,7 @@ void server_models::load_models() { inst.meta.exit_code = 0; // clear failed state so the model can be reloaded inst.meta.update_args(ctx_preset, bin_path); + inst.meta.update_caps(); } // add models that are new in this reload @@ -496,8 +524,10 @@ void server_models::load_models() { /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, /* args */ std::vector<std::string>(), + /* loaded_info */ {}, /* exit_code */ 0, /* stop_timeout */ DEFAULT_STOP_TIMEOUT, + /* multimodal */ mtmd_caps{false, false}, }; add_model(std::move(meta)); newly_added.push_back(name); } @@ -1206,14 +1236,28 @@ void server_models_routes::init_routes() { status["failed"] = true; } + // pi coding agent multimodal compatibility + json input_modalities = json::array({"text"}); + if (meta.multimodal.inp_vision) { + input_modalities.push_back("image"); + } + if (meta.multimodal.inp_audio) { + input_modalities.push_back("audio"); + } + json architecture { + {"input_modalities", input_modalities}, + {"output_modalities", json::array({"text"})}, + }; + json model_info =
json { - {"id", meta.name}, - {"aliases", meta.aliases}, - {"tags", meta.tags}, - {"object", "model"}, // for OAI-compat - {"owned_by", "llamacpp"}, // for OAI-compat - {"created", t}, // for OAI-compat - {"status", status}, + {"id", meta.name}, + {"aliases", meta.aliases}, + {"tags", meta.tags}, + {"object", "model"}, // for OAI-compat + {"owned_by", "llamacpp"}, // for OAI-compat + {"created", t}, // for OAI-compat + {"status", status}, + {"architecture", architecture}, // TODO: add other fields, may require reading GGUF metadata }; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 2267de588..f1206c714 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -66,6 +66,7 @@ struct server_model_meta { json loaded_info; // info to be reflected via /v1/models endpoint int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown + mtmd_caps multimodal; // multimodal capabilities bool is_ready() const { return status == SERVER_MODEL_STATUS_LOADED; @@ -80,6 +81,7 @@ struct server_model_meta { } void update_args(common_preset_context & ctx_presets, std::string bin_path); + void update_caps(); }; struct subprocess_s;