Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2026-05-17 04:09:19 +00:00
mtmd, server, common: expose modalities to /v1/models (#22952)
* mtmd, server, common: expose modalities to /v1/models
* fix build
* rename to mtmd_caps
This commit is contained in:
parent 927dada6c9
commit 7bfe120c21

10 changed files with 121 additions and 27 deletions
@@ -435,6 +435,25 @@ static bool parse_bool_value(const std::string & value) {
 // CLI argument parsing functions
 //
 
+void common_params_handle_models(common_params & params, llama_example curr_ex) {
+    auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
+    if (params.no_mmproj) {
+        params.mmproj = {};
+    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+        // optionally, handle mmproj model when -hf is specified
+        params.mmproj = res.mmproj;
+    }
+    // only download mmproj if the current example is using it
+    for (const auto & ex : mmproj_examples) {
+        if (curr_ex == ex) {
+            common_params_handle_model(params.mmproj, params.hf_token, params.offline);
+            break;
+        }
+    }
+    common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
+    common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
+}
+
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
     common_params & params = ctx_arg.params;
 

@@ -588,22 +607,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     if (!skip_model_download) {
-        auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
-        if (params.no_mmproj) {
-            params.mmproj = {};
-        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
-            // optionally, handle mmproj model when -hf is specified
-            params.mmproj = res.mmproj;
-        }
-        // only download mmproj if the current example is using it
-        for (const auto & ex : mmproj_examples) {
-            if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj, params.hf_token, params.offline);
-                break;
-            }
-        }
-        common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
-        common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
+        common_params_handle_models(params, ctx_arg.ex);
    }
 
     // model is required (except for server)
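The two hunks above move the -hf model resolution out of common_params_parse_ex() into the standalone common_params_handle_models() helper, so call sites outside the CLI parser (such as the server capability probe later in this commit) can resolve the model and mmproj paths on their own. A minimal sketch of such a caller; the repo name is illustrative and the hf_repo field is assumed from the existing common_params_model struct:

    common_params params;
    params.model.hf_repo = "ggml-org/some-vision-model-GGUF"; // illustrative repo name
    common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
    // params.model.path (and params.mmproj.path, if the repo ships an mmproj)
    // now point at the resolved GGUF files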
@@ -129,5 +129,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);
 
+// Populate model paths (main model, mmproj, etc) from -hf if necessary
+void common_params_handle_models(common_params & params, llama_example curr_ex);
+
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
@@ -163,8 +163,13 @@ void common_preset::merge(const common_preset & other) {
     }
 }
 
-void common_preset::apply_to_params(common_params & params) const {
+void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const {
     for (const auto & [opt, val] : options) {
+        if (!handled_keys.empty()) {
+            if (!opt.env || handled_keys.find(opt.env) == handled_keys.end()) {
+                continue;
+            }
+        }
         // apply each option to params
         if (opt.handler_string) {
             opt.handler_string(params, val);

@@ -43,7 +43,8 @@ struct common_preset {
     void merge(const common_preset & other);
 
     // apply preset options to common_params
-    void apply_to_params(common_params & params) const;
+    // optionally specify handled_keys to only apply a subset of options (identified by their env), if empty, apply all options
+    void apply_to_params(common_params & params, const std::set<std::string> & handled_keys = std::set<std::string>()) const;
 };
 
 // interface for multiple presets in one file
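The new handled_keys argument lets a caller apply only a whitelisted subset of a preset's options, identified by their environment-variable names, while an empty set keeps the old apply-everything behaviour. A sketch, assuming an existing common_preset named preset:

    common_params params;
    // apply only the model-related options, leave the rest at their defaults
    preset.apply_to_params(params, { "LLAMA_ARG_MODEL", "LLAMA_ARG_MMPROJ" });
    // the default (empty set) applies every option, as before
    preset.apply_to_params(params);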
@@ -994,7 +994,7 @@ struct clip_model_loader {
     bool has_audio = false;
 
     // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
-    clip_model_loader(const char * fname) : fname(fname) {
+    clip_model_loader(const char * fname, bool skip_tensors = false) : fname(fname) {
         struct ggml_context * meta = nullptr;
 
         struct gguf_init_params params = {

@@ -1040,7 +1040,7 @@ struct clip_model_loader {
         }
 
         // tensors
-        {
+        if (!skip_tensors) {
             for (int i = 0; i < n_tensors; ++i) {
                 const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
                 const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);

@@ -2927,6 +2927,14 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
     return {ctx_vision, ctx_audio};
 }
 
+struct clip_cap clip_get_cap(const char * fname) {
+    clip_cap res;
+    clip_model_loader loader(fname, /* skip_tensors= */ true);
+    res.has_vision = loader.has_vision;
+    res.has_audio = loader.has_audio;
+    return res;
+}
+
 struct clip_image_size * clip_image_size_init() {
     struct clip_image_size * load_image_size = new struct clip_image_size();
     load_image_size->width = 448;
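Because the loader is constructed with skip_tensors = true, clip_get_cap() reads only the GGUF metadata and skips the tensor section entirely, so probing even a large mmproj file stays cheap. A usage sketch with an illustrative file name:

    clip_cap cap = clip_get_cap("mmproj-model-f16.gguf"); // illustrative path
    if (cap.has_vision) {
        // the projector accepts image input
    }
    if (cap.has_audio) {
        // the projector accepts audio input
    }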
@@ -116,3 +116,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
 bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
+
+struct clip_cap {
+    bool has_vision;
+    bool has_audio;
+};
+struct clip_cap clip_get_cap(const char * fname);
@@ -1423,6 +1423,19 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback_user_data = user_data;
 }
 
+struct mtmd_caps mtmd_get_cap_from_file(const char * fname) {
+    try {
+        auto tmp = clip_get_cap(fname);
+        mtmd_caps cap;
+        cap.inp_audio = tmp.has_audio;
+        cap.inp_vision = tmp.has_vision;
+        return cap;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: failed to get capabilities from file '%s': %s\n", __func__, fname, e.what());
+        return mtmd_caps{ false, false };
+    }
+}
+
 //
 // Debugging API (NOT intended for public use)
 //
@@ -244,6 +244,14 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 // If this is not called, or NULL is supplied, everything is output on stderr.
 MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
 
+// EXPERIMENTAL API to get mmproj's capabilities without initializing the full context
+// This is only intended to be used by llama-server, breaking changes is expected
+struct mtmd_caps {
+    bool inp_vision;
+    bool inp_audio;
+};
+MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
+
 /////////////////////////////////////////
 
 // test function, to be used in test-mtmd-c-api.c
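mtmd_get_cap_from_file() is the public wrapper around clip_get_cap(): it catches loader exceptions and reports { false, false } when the file cannot be parsed. A minimal sketch of a caller, with an illustrative path:

    mtmd_caps caps = mtmd_get_cap_from_file("mmproj-model-f16.gguf"); // illustrative path
    printf("vision: %s, audio: %s\n",                                 // printf from <cstdio>
           caps.inp_vision ? "yes" : "no",
           caps.inp_audio ? "yes" : "no");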
@@ -161,6 +161,30 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
     args = preset.to_args(bin_path);
 }
 
+void server_model_meta::update_caps() {
+    try {
+        common_params params;
+        preset.apply_to_params(params, {
+            "LLAMA_ARG_MODEL",
+            "LLAMA_ARG_MODEL_URL",
+            "LLAMA_ARG_MMPROJ",
+            "LLAMA_ARG_MMPROJ_URL",
+            "LLAMA_ARG_HF_REPO",
+            "LLAMA_ARG_HF_REPO_FILE",
+        });
+        params.offline = true; // avoid any unwanted network call during capability detection
+        common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
+        if (params.mmproj.path.empty()) {
+            multimodal = { false, false };
+        } else {
+            multimodal = mtmd_get_cap_from_file(params.mmproj.path.c_str());
+        }
+    } catch (const std::exception & e) {
+        LOG_WRN("failed to initialize common_params for multimodal capability detection: %s\n", e.what());
+        multimodal = { false, false };
+    }
+}
+
 //
 // server_models
 //
@@ -236,6 +260,7 @@ void server_models::add_model(server_model_meta && meta) {
     }
 
     meta.update_args(ctx_preset, bin_path); // render args
+    meta.update_caps();
     std::string name = meta.name;
     mapping[name] = instance_t{
         /* subproc */ std::make_shared<subprocess_s>(),
@@ -346,8 +371,10 @@ void server_models::load_models() {
             /* status */ SERVER_MODEL_STATUS_UNLOADED,
             /* last_used */ 0,
             /* args */ std::vector<std::string>(),
+            /* loaded_info */ {},
             /* exit_code */ 0,
             /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+            /* multimodal */ mtmd_caps{false, false},
         };
         add_model(std::move(meta));
     }
@@ -481,6 +508,7 @@ void server_models::load_models() {
 
             inst.meta.exit_code = 0; // clear failed state so the model can be reloaded
             inst.meta.update_args(ctx_preset, bin_path);
+            inst.meta.update_caps();
         }
 
         // add models that are new in this reload
@@ -496,8 +524,10 @@ void server_models::load_models() {
             /* status */ SERVER_MODEL_STATUS_UNLOADED,
             /* last_used */ 0,
             /* args */ std::vector<std::string>(),
+            /* loaded_info */ {},
             /* exit_code */ 0,
             /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+            /* multimodal */ mtmd_caps{false, false},
         };
         add_model(std::move(meta));
         newly_added.push_back(name);
@@ -1206,14 +1236,28 @@ void server_models_routes::init_routes() {
             status["failed"] = true;
         }
 
+        // pi coding agent multimodal compatibility
+        json input_modalities = json::array({"text"});
+        if (meta.multimodal.inp_vision) {
+            input_modalities.push_back("image");
+        }
+        if (meta.multimodal.inp_audio) {
+            input_modalities.push_back("audio");
+        }
+        json architecture {
+            {"input_modalities", input_modalities},
+            {"output_modalities", json::array({"text"})},
+        };
+
         json model_info = json {
-            {"id", meta.name},
-            {"aliases", meta.aliases},
-            {"tags", meta.tags},
-            {"object", "model"}, // for OAI-compat
-            {"owned_by", "llamacpp"}, // for OAI-compat
-            {"created", t}, // for OAI-compat
-            {"status", status},
+            {"id", meta.name},
+            {"aliases", meta.aliases},
+            {"tags", meta.tags},
+            {"object", "model"}, // for OAI-compat
+            {"owned_by", "llamacpp"}, // for OAI-compat
+            {"created", t}, // for OAI-compat
+            {"status", status},
+            {"architecture", architecture},
             // TODO: add other fields, may require reading GGUF metadata
         };
 
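With the architecture object in place, each /v1/models entry now advertises its input and output modalities. A sketch of the resulting shape, written in the same nlohmann::json initializer style the server uses (values are illustrative, for a vision-capable model):

    json example_entry = {
        {"id",       "some-vision-model"},
        {"object",   "model"},
        {"owned_by", "llamacpp"},
        {"architecture", {
            {"input_modalities",  {"text", "image"}},
            {"output_modalities", {"text"}},
        }},
    };

Clients that understand this format (the comment in the hunk above mentions the pi coding agent) can check input_modalities and decide whether to attach image or audio parts before issuing a request.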
@@ -66,6 +66,7 @@ struct server_model_meta {
     json loaded_info; // info to be reflected via /v1/models endpoint
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
+    mtmd_caps multimodal; // multimodal capabilities
 
     bool is_ready() const {
         return status == SERVER_MODEL_STATUS_LOADED;

@@ -80,6 +81,7 @@ struct server_model_meta {
     }
 
     void update_args(common_preset_context & ctx_presets, std::string bin_path);
+    void update_caps();
 };
 
 struct subprocess_s;