mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-06-01 06:00:36 +00:00
download: add option to skip_download (#23059)
* download: add option to skip_download * fix * fix 2 * if file doesn't exist, respect skip_download flag
This commit is contained in:
parent
da3f990a47
commit
06d26dfdff
8 changed files with 126 additions and 83 deletions
|
|
@ -1661,23 +1661,30 @@ Listing all models in cache. The model metadata will also include a field to ind
|
|||
{
|
||||
"data": [{
|
||||
"id": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
|
||||
"in_cache": true,
|
||||
"path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf",
|
||||
"status": {
|
||||
"value": "loaded",
|
||||
"args": ["llama-server", "-ctx", "4096"]
|
||||
},
|
||||
"architecture": {
|
||||
"input_modalities": [
|
||||
"text",
|
||||
"image"
|
||||
],
|
||||
"output_modalities": [
|
||||
"text"
|
||||
]
|
||||
},
|
||||
...
|
||||
}]
|
||||
}
|
||||
```
|
||||
|
||||
Note:
|
||||
1. For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
|
||||
2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
|
||||
1. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
|
||||
- If a model is running but updated or removed from the source, it will be unloaded
|
||||
- If a model is not running, it will be added or updated according to the source
|
||||
3. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance.
|
||||
2. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance.
|
||||
|
||||
The `status` object can be:
|
||||
|
||||
|
|
|
|||
|
|
@ -180,7 +180,8 @@ void server_model_meta::update_caps() {
|
|||
"LLAMA_ARG_HF_REPO",
|
||||
"LLAMA_ARG_HF_REPO_FILE",
|
||||
});
|
||||
params.offline = true; // avoid any unwanted network call during capability detection
|
||||
params.offline = true;
|
||||
// params.skip_download = true; // TODO: ideally, we should validate the model here, but it takes too much time
|
||||
common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
|
||||
if (params.mmproj.path.empty()) {
|
||||
multimodal = { false, false };
|
||||
|
|
@ -371,18 +372,19 @@ void server_models::load_models() {
|
|||
// FIRST LOAD: add all models, then unlock for autoloading
|
||||
for (const auto & [name, preset] : final_presets) {
|
||||
server_model_meta meta{
|
||||
/* preset */ preset,
|
||||
/* name */ name,
|
||||
/* aliases */ {},
|
||||
/* tags */ {},
|
||||
/* port */ 0,
|
||||
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||
/* last_used */ 0,
|
||||
/* args */ std::vector<std::string>(),
|
||||
/* loaded_info */ {},
|
||||
/* exit_code */ 0,
|
||||
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
|
||||
/* multimodal */ mtmd_caps{false, false},
|
||||
/* preset */ preset,
|
||||
/* name */ name,
|
||||
/* aliases */ {},
|
||||
/* tags */ {},
|
||||
/* port */ 0,
|
||||
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||
/* last_used */ 0,
|
||||
/* args */ std::vector<std::string>(),
|
||||
/* loaded_info */ {},
|
||||
/* exit_code */ 0,
|
||||
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
|
||||
/* multimodal */ mtmd_caps{false, false},
|
||||
/* need_download */ false,
|
||||
};
|
||||
add_model(std::move(meta));
|
||||
}
|
||||
|
|
@ -524,18 +526,19 @@ void server_models::load_models() {
|
|||
for (const auto & [name, preset] : final_presets) {
|
||||
if (mapping.find(name) == mapping.end()) {
|
||||
server_model_meta meta{
|
||||
/* preset */ preset,
|
||||
/* name */ name,
|
||||
/* aliases */ {},
|
||||
/* tags */ {},
|
||||
/* port */ 0,
|
||||
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||
/* last_used */ 0,
|
||||
/* args */ std::vector<std::string>(),
|
||||
/* loaded_info */ {},
|
||||
/* exit_code */ 0,
|
||||
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
|
||||
/* multimodal */ mtmd_caps{false, false},
|
||||
/* preset */ preset,
|
||||
/* name */ name,
|
||||
/* aliases */ {},
|
||||
/* tags */ {},
|
||||
/* port */ 0,
|
||||
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||
/* last_used */ 0,
|
||||
/* args */ std::vector<std::string>(),
|
||||
/* loaded_info */ {},
|
||||
/* exit_code */ 0,
|
||||
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
|
||||
/* multimodal */ mtmd_caps{false, false},
|
||||
/* need_download */ false,
|
||||
};
|
||||
add_model(std::move(meta));
|
||||
newly_added.push_back(name);
|
||||
|
|
@ -1263,14 +1266,15 @@ void server_models_routes::init_routes() {
|
|||
};
|
||||
|
||||
json model_info = json {
|
||||
{"id", meta.name},
|
||||
{"aliases", meta.aliases},
|
||||
{"tags", meta.tags},
|
||||
{"object", "model"}, // for OAI-compat
|
||||
{"owned_by", "llamacpp"}, // for OAI-compat
|
||||
{"created", t}, // for OAI-compat
|
||||
{"status", status},
|
||||
{"architecture", architecture},
|
||||
{"id", meta.name},
|
||||
{"aliases", meta.aliases},
|
||||
{"tags", meta.tags},
|
||||
{"object", "model"}, // for OAI-compat
|
||||
{"owned_by", "llamacpp"}, // for OAI-compat
|
||||
{"created", t}, // for OAI-compat
|
||||
{"status", status},
|
||||
{"architecture", architecture},
|
||||
{"need_download", meta.need_download},
|
||||
// TODO: add other fields, may require reading GGUF metadata
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ struct server_model_meta {
|
|||
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
|
||||
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
|
||||
mtmd_caps multimodal; // multimodal capabilities
|
||||
bool need_download = false; // whether the model needs to be downloaded before loading
|
||||
|
||||
bool is_ready() const {
|
||||
return status == SERVER_MODEL_STATUS_LOADED;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue