mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-22 03:10:03 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # common/common.cpp # examples/batched-bench/batched-bench.cpp # examples/batched/batched.cpp # examples/export-lora/export-lora.cpp # examples/gritlm/gritlm.cpp # examples/parallel/parallel.cpp # examples/passkey/passkey.cpp # examples/speculative-simple/speculative-simple.cpp # examples/speculative/speculative.cpp # ggml/src/ggml-cann/CMakeLists.txt # ggml/src/ggml-cann/acl_tensor.cpp # ggml/src/ggml-cann/acl_tensor.h # ggml/src/ggml-cann/aclnn_ops.cpp # ggml/src/ggml-cann/aclnn_ops.h # ggml/src/ggml-vulkan/CMakeLists.txt # tests/test-arg-parser.cpp # tests/test-backend-ops.cpp
This commit is contained in:
commit
103d60ed2c
43 changed files with 1509 additions and 1129 deletions
|
|
@ -4,6 +4,26 @@
|
|||
>
|
||||
> This is very experimental, only used for demo purpose.
|
||||
|
||||
## Quick started
|
||||
|
||||
You can use pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account
|
||||
|
||||
```bash
|
||||
# build
|
||||
cmake -B build
|
||||
cmake --build build --target llama-gemma3-cli
|
||||
|
||||
# alternatively, install from brew (MacOS)
|
||||
brew install llama.cpp
|
||||
|
||||
# run it
|
||||
llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
|
||||
llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF
|
||||
|
||||
# note: 1B model does not support vision
|
||||
```
|
||||
|
||||
## How to get mmproj.gguf?
|
||||
|
||||
```bash
|
||||
|
|
|
|||
|
|
@ -78,7 +78,7 @@ struct gemma3_context {
|
|||
}
|
||||
|
||||
void init_clip_model(common_params & params) {
|
||||
const char * clip_path = params.mmproj.c_str();
|
||||
const char * clip_path = params.mmproj.path.c_str();
|
||||
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
|
||||
}
|
||||
|
||||
|
|
@ -232,13 +232,13 @@ int main(int argc, char ** argv) {
|
|||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.empty()) {
|
||||
if (params.mmproj.path.empty()) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
gemma3_context ctx(params);
|
||||
printf("%s: %s\n", __func__, params.model.c_str());
|
||||
printf("%s: %s\n", __func__, params.model.path.c_str());
|
||||
|
||||
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
|
||||
|
||||
|
|
|
|||
|
|
@ -225,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) {
|
|||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
|
|
@ -234,7 +234,7 @@ static struct llama_model * llava_init(common_params * params) {
|
|||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
const char * clip_path = params->mmproj.c_str();
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
|
|
@ -283,7 +283,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {
|
|||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
|
|
@ -80,7 +80,7 @@ static void llava_free(struct llava_context * ctx_llava) {
|
|||
}
|
||||
|
||||
static struct clip_ctx * clip_init_context(common_params * params) {
|
||||
const char * clip_path = params->mmproj.c_str();
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
|
|
@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty())) {
|
||||
if (params.mmproj.path.empty() || (params.image.empty())) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -314,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) {
|
|||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
|
|
@ -323,7 +323,7 @@ static struct llama_model * llava_init(common_params * params) {
|
|||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
const char * clip_path = params->mmproj.c_str();
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
|
|
@ -524,7 +524,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -133,7 +133,8 @@ struct slot_params {
|
|||
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : sampling.grammar_triggers) {
|
||||
grammar_triggers.push_back(trigger.to_json<json>());
|
||||
server_grammar_trigger ct(std::move(trigger));
|
||||
grammar_triggers.push_back(ct.to_json());
|
||||
}
|
||||
|
||||
return json {
|
||||
|
|
@ -372,9 +373,9 @@ struct server_task {
|
|||
const auto grammar_triggers = data.find("grammar_triggers");
|
||||
if (grammar_triggers != data.end()) {
|
||||
for (const auto & t : *grammar_triggers) {
|
||||
auto ct = common_grammar_trigger::from_json(t);
|
||||
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
||||
const auto & word = ct.value;
|
||||
server_grammar_trigger ct(t);
|
||||
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
||||
const auto & word = ct.value.value;
|
||||
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
|
||||
if (ids.size() == 1) {
|
||||
auto token = ids[0];
|
||||
|
|
@ -392,7 +393,7 @@ struct server_task {
|
|||
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
|
||||
}
|
||||
} else {
|
||||
params.sampling.grammar_triggers.push_back(ct);
|
||||
params.sampling.grammar_triggers.push_back(std::move(ct.value));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1876,7 +1877,7 @@ struct server_context {
|
|||
}
|
||||
|
||||
bool load_model(const common_params & params) {
|
||||
SRV_INF("loading model '%s'\n", params.model.c_str());
|
||||
SRV_INF("loading model '%s'\n", params.model.path.c_str());
|
||||
|
||||
params_base = params;
|
||||
|
||||
|
|
@ -1886,7 +1887,7 @@ struct server_context {
|
|||
ctx = llama_init.context.get();
|
||||
|
||||
if (model == nullptr) {
|
||||
SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
|
||||
SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -1897,16 +1898,13 @@ struct server_context {
|
|||
add_bos_token = llama_vocab_get_add_bos(vocab);
|
||||
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
|
||||
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
|
||||
if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
|
||||
|
||||
auto params_dft = params_base;
|
||||
|
||||
params_dft.devices = params_base.speculative.devices;
|
||||
params_dft.hf_file = params_base.speculative.hf_file;
|
||||
params_dft.hf_repo = params_base.speculative.hf_repo;
|
||||
params_dft.model = params_base.speculative.model;
|
||||
params_dft.model_url = params_base.speculative.model_url;
|
||||
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
|
||||
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
|
||||
params_dft.n_parallel = 1;
|
||||
|
|
@ -1920,12 +1918,12 @@ struct server_context {
|
|||
model_dft = llama_init_dft.model.get();
|
||||
|
||||
if (model_dft == nullptr) {
|
||||
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
|
||||
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
|
||||
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
|
||||
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
|
||||
|
||||
return false;
|
||||
}
|
||||
|
|
@ -3865,7 +3863,7 @@ int main(int argc, char ** argv) {
|
|||
json data = {
|
||||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||
{ "model_path", ctx_server.params_base.model },
|
||||
{ "model_path", ctx_server.params_base.model.path },
|
||||
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
|
||||
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
|
||||
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
|
||||
|
|
@ -4131,7 +4129,7 @@ int main(int argc, char ** argv) {
|
|||
{"object", "list"},
|
||||
{"data", {
|
||||
{
|
||||
{"id", params.model_alias.empty() ? params.model : params.model_alias},
|
||||
{"id", params.model_alias.empty() ? params.model.path : params.model_alias},
|
||||
{"object", "model"},
|
||||
{"created", std::time(0)},
|
||||
{"owned_by", "llamacpp"},
|
||||
|
|
|
|||
|
|
@ -58,6 +58,32 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
|||
|
||||
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||
|
||||
// thin wrapper around common_grammar_trigger with (de)serialization functions
|
||||
struct server_grammar_trigger {
|
||||
common_grammar_trigger value;
|
||||
|
||||
server_grammar_trigger() = default;
|
||||
server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
|
||||
server_grammar_trigger(const json & in) {
|
||||
value.type = (common_grammar_trigger_type) in.at("type").get<int>();
|
||||
value.value = in.at("value").get<std::string>();
|
||||
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
value.token = (llama_token) in.at("token").get<int>();
|
||||
}
|
||||
}
|
||||
|
||||
json to_json() const {
|
||||
json out {
|
||||
{"type", (int) value.type},
|
||||
{"value", value.value},
|
||||
};
|
||||
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
out["token"] = (int) value.token;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// tokenizer and input processing utils
|
||||
//
|
||||
|
|
@ -627,7 +653,8 @@ static json oaicompat_completion_params_parse(
|
|||
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : chat_params.grammar_triggers) {
|
||||
grammar_triggers.push_back(trigger.to_json<json>());
|
||||
server_grammar_trigger ct(trigger);
|
||||
grammar_triggers.push_back(ct.to_json());
|
||||
}
|
||||
llama_params["grammar_triggers"] = grammar_triggers;
|
||||
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
|
||||
|
|
|
|||
|
|
@ -577,12 +577,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
|
||||
|
||||
// TODO: refactor in a common struct
|
||||
params.model = params.vocoder.model;
|
||||
params.model_url = params.vocoder.model_url;
|
||||
params.hf_repo = params.vocoder.hf_repo;
|
||||
params.hf_file = params.vocoder.hf_file;
|
||||
|
||||
params.model = params.vocoder.model;
|
||||
params.embedding = true;
|
||||
|
||||
common_init_result llama_init_cts = common_init_from_params(params);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue