download: add option to skip_download (#23059)
Some checks are pending
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
Python Type-Check / python type-check (push) Waiting to run

* download: add option to skip_download

* fix

* fix 2

* if file doesn't exist, respect skip_download flag
This commit is contained in:
Xuan-Son Nguyen 2026-05-29 16:30:55 +02:00 committed by GitHub
parent da3f990a47
commit 06d26dfdff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 126 additions and 83 deletions

View file

@ -1661,23 +1661,30 @@ Listing all models in cache. The model metadata will also include a field to ind
{
"data": [{
"id": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
"in_cache": true,
"path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf",
"status": {
"value": "loaded",
"args": ["llama-server", "-ctx", "4096"]
},
"architecture": {
"input_modalities": [
"text",
"image"
],
"output_modalities": [
"text"
]
},
...
}]
}
```
Note:
1. For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow:
1. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow:
- If a model is running but updated or removed from the source, it will be unloaded
- If a model is not running, it will be added or updated according to the source
3. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance.
2. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance.
The `status` object can be:

View file

@ -180,7 +180,8 @@ void server_model_meta::update_caps() {
"LLAMA_ARG_HF_REPO",
"LLAMA_ARG_HF_REPO_FILE",
});
params.offline = true; // avoid any unwanted network call during capability detection
params.offline = true;
// params.skip_download = true; // TODO: ideally, we should validate the model here, but it takes too much time
common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
if (params.mmproj.path.empty()) {
multimodal = { false, false };
@ -371,18 +372,19 @@ void server_models::load_models() {
// FIRST LOAD: add all models, then unlock for autoloading
for (const auto & [name, preset] : final_presets) {
server_model_meta meta{
/* preset */ preset,
/* name */ name,
/* aliases */ {},
/* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* loaded_info */ {},
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
/* multimodal */ mtmd_caps{false, false},
/* preset */ preset,
/* name */ name,
/* aliases */ {},
/* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* loaded_info */ {},
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
/* multimodal */ mtmd_caps{false, false},
/* need_download */ false,
};
add_model(std::move(meta));
}
@ -524,18 +526,19 @@ void server_models::load_models() {
for (const auto & [name, preset] : final_presets) {
if (mapping.find(name) == mapping.end()) {
server_model_meta meta{
/* preset */ preset,
/* name */ name,
/* aliases */ {},
/* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* loaded_info */ {},
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
/* multimodal */ mtmd_caps{false, false},
/* preset */ preset,
/* name */ name,
/* aliases */ {},
/* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* loaded_info */ {},
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
/* multimodal */ mtmd_caps{false, false},
/* need_download */ false,
};
add_model(std::move(meta));
newly_added.push_back(name);
@ -1263,14 +1266,15 @@ void server_models_routes::init_routes() {
};
json model_info = json {
{"id", meta.name},
{"aliases", meta.aliases},
{"tags", meta.tags},
{"object", "model"}, // for OAI-compat
{"owned_by", "llamacpp"}, // for OAI-compat
{"created", t}, // for OAI-compat
{"status", status},
{"architecture", architecture},
{"id", meta.name},
{"aliases", meta.aliases},
{"tags", meta.tags},
{"object", "model"}, // for OAI-compat
{"owned_by", "llamacpp"}, // for OAI-compat
{"created", t}, // for OAI-compat
{"status", status},
{"architecture", architecture},
{"need_download", meta.need_download},
// TODO: add other fields, may require reading GGUF metadata
};

View file

@ -67,6 +67,7 @@ struct server_model_meta {
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
mtmd_caps multimodal; // multimodal capabilities
bool need_download = false; // whether the model needs to be downloaded before loading
bool is_ready() const {
return status == SERVER_MODEL_STATUS_LOADED;