Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/ISSUE_TEMPLATE/010-bug-compilation.yml
#	.github/ISSUE_TEMPLATE/011-bug-results.yml
#	.github/ISSUE_TEMPLATE/019-bug-misc.yml
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	CMakeLists.txt
#	Makefile
#	Package.swift
#	examples/CMakeLists.txt
#	examples/eval-callback/CMakeLists.txt
#	examples/llama-bench/llama-bench.cpp
#	examples/server/README.md
#	examples/server/server.cpp
#	examples/simple-chat/simple-chat.cpp
#	examples/simple/simple.cpp
#	examples/speculative-simple/speculative-simple.cpp
#	examples/speculative/speculative.cpp
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-amx/CMakeLists.txt
#	ggml/src/ggml-blas/CMakeLists.txt
#	ggml/src/ggml-cann/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cuda/CMakeLists.txt
#	ggml/src/ggml-hip/CMakeLists.txt
#	ggml/src/ggml-kompute/CMakeLists.txt
#	ggml/src/ggml-kompute/ggml-kompute.cpp
#	ggml/src/ggml-metal/CMakeLists.txt
#	ggml/src/ggml-musa/CMakeLists.txt
#	ggml/src/ggml-rpc/CMakeLists.txt
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/ggml-vulkan/CMakeLists.txt
#	pocs/CMakeLists.txt
#	tests/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-quantize-fns.cpp
Author: Concedo
Date:   2024-11-26 17:01:20 +08:00
Commit: ec581b19d8

31 changed files with 1383 additions and 676 deletions


@@ -299,6 +299,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -325,6 +346,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
 }
 
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
     common_params_context ctx_arg(params);
     ctx_arg.print_usage = print_usage;
     ctx_arg.ex = ex;
@@ -1313,6 +1337,30 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            printf("Available devices:\n");
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1337,10 +1385,6 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         } else if (arg_next == "layer") {
             params.split_mode = LLAMA_SPLIT_MODE_LAYER;
         } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-            fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-            exit(1);
-#endif // GGML_USE_SYCL
             params.split_mode = LLAMA_SPLIT_MODE_ROW;
         } else {
             throw std::invalid_argument("invalid value");
@@ -2043,6 +2087,14 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             params.speculative.n_ctx = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.speculative.devices = parse_device_list(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",


@@ -981,9 +981,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
     }
 }
 
-struct llama_model_params common_model_params_to_llama(const common_params & params) {
+struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();
 
+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
+
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
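For callers outside of common, the upshot is that llama_model_params now carries an explicit device list, NULL-terminated as parse_device_list() produces it. A minimal sketch under that assumption (the device name is again illustrative):

    #include "llama.h"
    #include "ggml-backend.h"

    // a sketch: pin a model to a single named GPU before loading it
    static llama_model_params make_pinned_params() {
        // static so the array outlives this function; the list is
        // NULL-terminated, matching what parse_device_list() builds
        static ggml_backend_dev_t devices[] = {
            ggml_backend_dev_by_name("CUDA0"), // illustrative name
            nullptr,
        };
        llama_model_params mparams = llama_model_default_params();
        mparams.devices = devices;
        return mparams;
    }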


@@ -152,6 +152,7 @@ struct common_params_sampling {
 };
 
 struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
     int32_t n_ctx = 0; // draft context size
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -174,9 +175,6 @@ struct common_params {
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -189,6 +187,13 @@ struct common_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
+    // offload params
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -197,7 +202,6 @@ struct common_params {
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
@@ -458,7 +462,7 @@ struct common_init_result {
 
 struct common_init_result common_init_from_params(common_params & params);
 
-struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);


@@ -90,9 +90,10 @@ bool common_speculative_are_compatible(
     if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
         llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
         llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
-        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
-    ) {
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
         LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
         return false;
     }


@@ -3040,9 +3040,9 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("Olmo1124ForCausalLM")
-class Olmo1124Model(Model):
-    model_arch = gguf.MODEL_ARCH.OLMO_1124
+@Model.register("Olmo2ForCausalLM")
+class Olmo2Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO2
 
 
 @Model.register("OlmoeForCausalLM")


@@ -166,6 +166,10 @@ int main(int argc, char ** argv) {
 
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
     struct ggml_threadpool_params tpp_batch =
             ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
     struct ggml_threadpool_params tpp =
@@ -175,7 +179,7 @@ int main(int argc, char ** argv) {
 
     struct ggml_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
-        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
         if (!threadpool_batch) {
             LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
             return 1;
@@ -185,7 +189,7 @@ int main(int argc, char ** argv) {
         tpp.paused = true;
     }
 
-    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
     if (!threadpool) {
         LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
         return 1;
@@ -891,8 +895,8 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-    ggml_threadpool_free(threadpool);
-    ggml_threadpool_free(threadpool_batch);
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);
 
     return 0;
 }
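Because the CPU backend may now be loaded as a dynamic module, the threadpool entry points are resolved through the backend registry instead of being called directly. A condensed sketch of that lookup pattern (assumes the CPU backend is registered; error handling trimmed):

    #include "ggml-backend.h"
    #include "ggml-cpu.h"

    // look up ggml_threadpool_new at runtime rather than linking it directly
    static struct ggml_threadpool * make_threadpool(struct ggml_threadpool_params * tpp) {
        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        auto * reg = ggml_backend_dev_backend_reg(dev);
        auto * new_fn = (decltype(ggml_threadpool_new) *)
            ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
        return new_fn ? new_fn(tpp) : nullptr;
    }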


@@ -81,7 +81,13 @@
                 <path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
               </svg>
             </button>
+            <button v-if="messages.length > 0" class="btn mr-1" @click="downloadConv(viewingConvId)" :disabled="isGenerating">
+              <!-- download conversation button -->
+              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-download" viewBox="0 0 16 16">
+                <path d="M.5 9.9a.5.5 0 0 1 .5.5v2.5a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1v-2.5a.5.5 0 0 1 1 0v2.5a2 2 0 0 1-2 2H2a2 2 0 0 1-2-2v-2.5a.5.5 0 0 1 .5-.5"/>
+                <path d="M7.646 11.854a.5.5 0 0 0 .708 0l3-3a.5.5 0 0 0-.708-.708L8.5 10.293V1.5a.5.5 0 0 0-1 0v8.793L5.354 8.146a.5.5 0 1 0-.708.708z"/>
+              </svg>
+            </button>
             <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
               <!-- edit config button -->
               <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
@@ -526,6 +532,23 @@
           this.fetchMessages();
         }
       },
+      downloadConv(convId) {
+        const conversation = StorageUtils.getOneConversation(convId);
+        if (!conversation) {
+          alert('Conversation not found.');
+          return;
+        }
+        const conversationJson = JSON.stringify(conversation, null, 2);
+        const blob = new Blob([conversationJson], { type: 'application/json' });
+        const url = URL.createObjectURL(blob);
+        const a = document.createElement('a');
+        a.href = url;
+        a.download = `conversation_${convId}.json`;
+        document.body.appendChild(a);
+        a.click();
+        document.body.removeChild(a);
+        URL.revokeObjectURL(url);
+      },
       async sendMessage() {
         if (!this.inputMsg) return;
         const currConvId = this.viewingConvId;


@@ -2,11 +2,11 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
-#include "sampling.h"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
-#include "build-info.h"
+#include "log.h"
+#include "sampling.h"
+#include "speculative.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -111,7 +111,7 @@ struct server_static_file {
 struct slot_params {
     bool stream = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+    bool cache_prompt = true;  // remember the prompt to avoid reprocessing all prompt
 
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -122,12 +122,21 @@ struct slot_params {
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
     std::vector<std::string> antiprompt;
+
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 };
 
 struct server_slot {
     int id;
     int id_task = -1;
 
+    llama_batch batch_spec;
+
+    llama_context * ctx_dft = nullptr;
+
+    common_speculative * spec = nullptr;
+
     // the index relative to completion multi-task request
     size_t index = 0;
@@ -176,7 +185,6 @@ struct server_slot {
     // sampling
     json json_schema;
-    struct common_params_sampling sparams;
 
     struct common_sampler * smpl = nullptr;
 
     llama_token sampled;
@@ -213,7 +221,7 @@ struct server_slot {
         generated_token_probs.clear();
     }
 
-    bool has_budget(common_params &global_params) {
+    bool has_budget(const common_params & global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
@@ -233,6 +241,10 @@ struct server_slot {
         return state != SLOT_STATE_IDLE;
     }
 
+    bool can_speculate() const {
+        return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt;
+    }
+
     void add_token(const completion_token_output & token) {
         if (!is_processing()) {
             SLT_WRN(*this, "%s", "slot is not processing\n");
@@ -592,11 +604,14 @@ struct server_response {
 };
 
 struct server_context {
+    common_params params_base;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
 
     std::vector<common_lora_adapter_container> loras;
 
-    common_params params;
+    llama_model * model_dft = nullptr;
+
+    llama_context_params cparams_dft;
 
     llama_batch batch = {};
@@ -629,27 +644,41 @@ struct server_context {
             model = nullptr;
         }
 
+        if (model_dft) {
+            llama_free_model(model_dft);
+            model_dft = nullptr;
+        }
+
         // Clear any sampling context
         for (server_slot & slot : slots) {
-            if (slot.smpl != nullptr) {
-                common_sampler_free(slot.smpl);
-            }
+            common_sampler_free(slot.smpl);
+            slot.smpl = nullptr;
+
+            llama_free(slot.ctx_dft);
+            slot.ctx_dft = nullptr;
+
+            common_speculative_free(slot.spec);
+            slot.spec = nullptr;
+
+            llama_batch_free(slot.batch_spec);
         }
 
         llama_batch_free(batch);
     }
 
-    bool load_model(const common_params & params_) {
-        params = params_;
+    bool load_model(const common_params & params) {
+        SRV_INF("loading model '%s'\n", params.model.c_str());
+
+        params_base = params;
 
-        common_init_result llama_init = common_init_from_params(params);
+        common_init_result llama_init = common_init_from_params(params_base);
 
         model = llama_init.model;
         ctx = llama_init.context;
         loras = llama_init.lora_adapters;
 
         if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
             return false;
         }
@@ -658,6 +687,41 @@ struct server_context {
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = !llama_add_eos_token(model);
 
+        if (!params_base.speculative.model.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+
+            auto params_dft = params_base;
+
+            params_dft.devices = params_base.speculative.devices;
+            params_dft.model = params_base.speculative.model;
+            params_dft.n_ctx = params_base.speculative.n_ctx;
+            params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
+
+            common_init_result llama_init_dft = common_init_from_params(params_dft);
+
+            model_dft = llama_init_dft.model;
+
+            if (model_dft == nullptr) {
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+                return false;
+            }
+
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+
+                llama_free (llama_init_dft.context);
+                llama_free_model(llama_init_dft.model);
+
+                return false;
+            }
+
+            cparams_dft = common_context_params_to_llama(params_base);
+            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+
+            // the context is not needed - we will create one for each slot
+            llama_free(llama_init_dft.context);
+        }
+
         return true;
     }
@@ -675,20 +739,36 @@ struct server_context {
     }
 
     void init() {
-        const int32_t n_ctx_slot = n_ctx / params.n_parallel;
+        const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
 
-        SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel);
+        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
 
-        for (int i = 0; i < params.n_parallel; i++) {
+        for (int i = 0; i < params_base.n_parallel; i++) {
             server_slot slot;
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
-            slot.n_predict = params.n_predict;
+            slot.n_predict = params_base.n_predict;
+
+            if (model_dft) {
+                slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
+
+                slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
+                if (slot.ctx_dft == nullptr) {
+                    SRV_ERR("%s", "failed to create draft context\n");
+                    return;
+                }
+
+                slot.spec = common_speculative_init(slot.ctx_dft);
+                if (slot.spec == nullptr) {
+                    SRV_ERR("%s", "failed to create speculator\n");
+                    return;
+                }
+            }
 
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 
-            slot.sparams = params.sampling;
+            slot.params.sampling = params_base.sampling;
 
             slot.callback_on_release = [this](int) {
                 queue_tasks.pop_deferred_task();
@@ -708,7 +788,7 @@ struct server_context {
         const int32_t n_batch = llama_n_batch(ctx);
 
         // only a single seq_id per token is needed
-        batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
+        batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
     }
 
     metrics.init();
@@ -787,9 +867,11 @@ struct server_context {
     }
 
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
-        slot_params default_params;
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sampling;
+        slot_params defaults;
+        defaults.sampling = params_base.sampling;
+        defaults.speculative = params_base.speculative;
+
         const auto & data = task.data;
 
         if (data.count("__oaicompat") != 0) {
@@ -801,41 +883,47 @@ struct server_context {
         }
 
         slot.params.stream = json_value(data, "stream", false);
-        slot.params.cache_prompt = json_value(data, "cache_prompt", false);
-        slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
-        slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent);
-        slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
-        slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
-        slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
-        slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability);
-        slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold);
-        slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
-        slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
-        slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
-        slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
-        slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
-        slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
-        slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
-        slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
-        slot.sparams.dry_multiplier = json_value(data, "dry_multiplier", default_sparams.dry_multiplier);
-        slot.sparams.dry_base = json_value(data, "dry_base", default_sparams.dry_base);
-        slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length);
-        slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n);
-        slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
-        slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
-        slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
-        slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
-        slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep);
-        slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
-        slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
-        slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
-        slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
-        //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement
-        slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms);
+        slot.params.cache_prompt = json_value(data, "cache_prompt", true);
+        slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
+        slot.params.n_indent = json_value(data, "n_indent", defaults.n_indent);
+        slot.params.n_keep = json_value(data, "n_keep", defaults.n_keep);
+        slot.params.n_discard = json_value(data, "n_discard", defaults.n_discard);
+      //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
+        slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
+
+        slot.params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
+        slot.params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
+        slot.params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
+        slot.params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
+        slot.params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
+        slot.params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
+        slot.params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp);
+        slot.params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range);
+        slot.params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent);
+        slot.params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n);
+        slot.params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat);
+        slot.params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq);
+        slot.params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present);
+        slot.params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier);
+        slot.params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base);
+        slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length);
+        slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n);
+        slot.params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
+        slot.params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
+        slot.params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
+        slot.params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl);
+        slot.params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
+        slot.params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
+        slot.params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
+
+        slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
+        slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
+        slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
+
+        slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);
 
-        if (slot.sparams.dry_base < 1.0f)
-        {
-            slot.sparams.dry_base = default_sparams.dry_base;
+        if (slot.params.sampling.dry_base < 1.0f) {
+            slot.params.sampling.dry_base = defaults.sampling.dry_base;
         }
 
         // sequence breakers for DRY
@@ -844,8 +932,8 @@ struct server_context {
         // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
         if (data.contains("dry_sequence_breakers")) {
-            slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
-            if (slot.sparams.dry_sequence_breakers.empty()) {
+            slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
+            if (slot.params.sampling.dry_sequence_breakers.empty()) {
                 send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
@@ -860,13 +948,13 @@ struct server_context {
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
                 auto schema = json_value(data, "json_schema", json::object());
-                slot.sparams.grammar = json_schema_to_grammar(schema);
+                slot.params.sampling.grammar = json_schema_to_grammar(schema);
             } catch (const std::exception & e) {
                 send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
         } else {
-            slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
+            slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
         }
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -876,10 +964,10 @@ struct server_context {
         }
 
         {
-            slot.sparams.logit_bias.clear();
+            slot.params.sampling.logit_bias.clear();
 
             if (json_value(data, "ignore_eos", false) && has_eos_token) {
-                slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+                slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
             }
 
             const auto & logit_bias = data.find("logit_bias");
@@ -900,12 +988,12 @@ struct server_context {
                         if (el[0].is_number_integer()) {
                             llama_token tok = el[0].get<llama_token>();
                             if (tok >= 0 && tok < n_vocab) {
-                                slot.sparams.logit_bias.push_back({tok, bias});
+                                slot.params.sampling.logit_bias.push_back({tok, bias});
                             }
                         } else if (el[0].is_string()) {
                             auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                             for (auto tok : toks) {
-                                slot.sparams.logit_bias.push_back({tok, bias});
+                                slot.params.sampling.logit_bias.push_back({tok, bias});
                             }
                         }
                     }
@@ -936,16 +1024,16 @@ struct server_context {
                         sampler_names.emplace_back(name);
                     }
                 }
-                slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
+                slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false);
             } else if (samplers->is_string()){
                 std::string sampler_string;
                 for (const auto & name : *samplers) {
                     sampler_string += name;
                 }
-                slot.sparams.samplers = common_sampler_types_from_chars(sampler_string);
+                slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string);
             }
         } else {
-            slot.sparams.samplers = default_sparams.samplers;
+            slot.params.sampling.samplers = defaults.sampling.samplers;
         }
     }
@@ -954,7 +1042,7 @@ struct server_context {
             common_sampler_free(slot.smpl);
         }
 
-        slot.smpl = common_sampler_init(model, slot.sparams);
+        slot.smpl = common_sampler_init(model, slot.params.sampling);
         if (slot.smpl == nullptr) {
             // for now, the only error that may happen here is invalid grammar
             send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
@@ -962,6 +1050,12 @@ struct server_context {
             }
         }
 
+        if (slot.ctx_dft) {
+            llama_batch_free(slot.batch_spec);
+
+            slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1);
+        }
+
         slot.state = SLOT_STATE_STARTED;
 
         SLT_INF(slot, "%s", "processing task\n");
@@ -979,7 +1073,7 @@ struct server_context {
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
+        const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
         slot.sampled = result.tok;
 
         // search stop word and delete it
@@ -1044,7 +1138,7 @@ struct server_context {
         }
 
         // check the limits
-        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) {
+        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
             slot.stopped_limit = true;
             slot.has_next_token = false;
@@ -1137,50 +1231,54 @@ struct server_context {
     json get_formated_generation(const server_slot & slot) const {
         std::vector<std::string> samplers;
-        samplers.reserve(slot.sparams.samplers.size());
-        for (const auto & sampler : slot.sparams.samplers) {
+        samplers.reserve(slot.params.sampling.samplers.size());
+        for (const auto & sampler : slot.params.sampling.samplers) {
             samplers.emplace_back(common_sampler_type_to_str(sampler));
         }
 
         return json {
             {"n_ctx", slot.n_ctx},
             {"n_predict", slot.n_predict}, // Server configured n_predict
-            {"model", params.model_alias},
-            {"seed", slot.sparams.seed},
+            {"model", params_base.model_alias},
+            {"seed", slot.params.sampling.seed},
             {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
-            {"temperature", slot.sparams.temp},
-            {"dynatemp_range", slot.sparams.dynatemp_range},
-            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
-            {"top_k", slot.sparams.top_k},
-            {"top_p", slot.sparams.top_p},
-            {"min_p", slot.sparams.min_p},
-            {"xtc_probability", slot.sparams.xtc_probability},
-            {"xtc_threshold", slot.sparams.xtc_threshold},
-            {"typical_p", slot.sparams.typ_p},
-            {"repeat_last_n", slot.sparams.penalty_last_n},
-            {"repeat_penalty", slot.sparams.penalty_repeat},
-            {"presence_penalty", slot.sparams.penalty_present},
-            {"frequency_penalty", slot.sparams.penalty_freq},
-            {"dry_multiplier", slot.sparams.dry_multiplier},
-            {"dry_base", slot.sparams.dry_base},
-            {"dry_allowed_length", slot.sparams.dry_allowed_length},
-            {"dry_penalty_last_n", slot.sparams.dry_penalty_last_n},
-            {"dry_sequence_breakers", slot.sparams.dry_sequence_breakers},
-            {"mirostat", slot.sparams.mirostat},
-            {"mirostat_tau", slot.sparams.mirostat_tau},
-            {"mirostat_eta", slot.sparams.mirostat_eta},
-            {"penalize_nl", slot.sparams.penalize_nl},
+            {"temperature", slot.params.sampling.temp},
+            {"dynatemp_range", slot.params.sampling.dynatemp_range},
+            {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent},
+            {"top_k", slot.params.sampling.top_k},
+            {"top_p", slot.params.sampling.top_p},
+            {"min_p", slot.params.sampling.min_p},
+            {"xtc_probability", slot.params.sampling.xtc_probability},
+            {"xtc_threshold", slot.params.sampling.xtc_threshold},
+            {"typical_p", slot.params.sampling.typ_p},
+            {"repeat_last_n", slot.params.sampling.penalty_last_n},
+            {"repeat_penalty", slot.params.sampling.penalty_repeat},
+            {"presence_penalty", slot.params.sampling.penalty_present},
+            {"frequency_penalty", slot.params.sampling.penalty_freq},
+            {"dry_multiplier", slot.params.sampling.dry_multiplier},
+            {"dry_base", slot.params.sampling.dry_base},
+            {"dry_allowed_length", slot.params.sampling.dry_allowed_length},
+            {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n},
+            {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers},
+            {"mirostat", slot.params.sampling.mirostat},
+            {"mirostat_tau", slot.params.sampling.mirostat_tau},
+            {"mirostat_eta", slot.params.sampling.mirostat_eta},
+            {"penalize_nl", slot.params.sampling.penalize_nl},
             {"stop", slot.params.antiprompt},
             {"max_tokens", slot.params.n_predict}, // User configured n_predict
             {"n_keep", slot.params.n_keep},
             {"n_discard", slot.params.n_discard},
-            {"ignore_eos", slot.sparams.ignore_eos},
+            {"ignore_eos", slot.params.sampling.ignore_eos},
             {"stream", slot.params.stream},
-            //{"logit_bias", slot.sparams.logit_bias},
-            {"n_probs", slot.sparams.n_probs},
-            {"min_keep", slot.sparams.min_keep},
-            {"grammar", slot.sparams.grammar},
+            //{"logit_bias", slot.params.sampling.logit_bias},
+            {"n_probs", slot.params.sampling.n_probs},
+            {"min_keep", slot.params.sampling.min_keep},
+            {"grammar", slot.params.sampling.grammar},
             {"samplers", samplers},
+            {"speculative", slot.can_speculate()},
+            {"speculative.n_max", slot.params.speculative.n_max},
+            {"speculative.n_min", slot.params.speculative.n_min},
+            {"speculative.p_min", slot.params.speculative.p_min},
         };
     }
@@ -1217,7 +1315,7 @@ struct server_context {
             {"index", slot.index},
         };
 
-        if (slot.sparams.n_probs > 0) {
+        if (slot.params.sampling.n_probs > 0) {
             const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
             const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
@@ -1250,7 +1348,7 @@ struct server_context {
             {"content", !slot.params.stream ? slot.generated_text : ""},
             {"id_slot", slot.id},
             {"stop", true},
-            {"model", params.model_alias},
+            {"model", params_base.model_alias},
             {"tokens_predicted", slot.n_decoded},
             {"tokens_evaluated", slot.n_prompt_tokens},
             {"generation_settings", get_formated_generation(slot)},
@@ -1266,7 +1364,7 @@ struct server_context {
             {"index", slot.index},
         };
 
-        if (slot.sparams.n_probs > 0) {
+        if (slot.params.sampling.n_probs > 0) {
            std::vector<completion_token_output> probs;
            if (!slot.params.stream && slot.stopped_word) {
                const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
@@ -1423,10 +1521,10 @@ struct server_context {
                         data.at("input_prefix"),
                         data.at("input_suffix"),
                         data.at("input_extra"),
-                        params.n_batch,
-                        params.n_predict,
+                        params_base.n_batch,
+                        params_base.n_predict,
                         slots[0].n_ctx, // TODO: there should be a better way
-                        params.spm_infill,
+                        params_base.spm_infill,
                         tokenized_prompts[i]
                     );
                     create_task(data, tokens);
@@ -1799,7 +1897,7 @@ struct server_context {
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
             if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
-                if (!params.ctx_shift) {
+                if (!params_base.ctx_shift) {
                     // this check is redundant (for good)
                     // we should never get here, because generation should already stopped in process_token()
                     slot.release();
@@ -1865,7 +1963,7 @@ struct server_context {
         int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
 
         // next, batch any pending prompts without exceeding n_batch
-        if (params.cont_batching || batch.n_tokens == 0) {
+        if (params_base.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
                 // this slot still has a prompt to be processed
                 if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
@@ -1918,7 +2016,7 @@ struct server_context {
                         continue;
                     }
                 } else {
-                    if (!params.ctx_shift) {
+                    if (!params_base.ctx_shift) {
                         // if context shift is disabled, we make sure prompt size is smaller than KV size
                         // TODO: there should be a separate parameter that control prompt truncation
                         // context shift should be applied only during the generation phase
@@ -1964,11 +2062,11 @@ struct server_context {
                             slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
 
                             // reuse chunks from the cached prompt by shifting their KV cache in the new position
-                            if (params.n_cache_reuse > 0) {
+                            if (params_base.n_cache_reuse > 0) {
                                 size_t head_c = slot.n_past; // cache
                                 size_t head_p = slot.n_past; // current prompt
 
-                                SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
+                                SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);
 
                                 while (head_c < slot.cache_tokens.size() &&
                                        head_p < prompt_tokens.size()) {
@@ -1981,7 +2079,7 @@ struct server_context {
                                         n_match++;
                                     }
 
-                                    if (n_match >= (size_t) params.n_cache_reuse) {
+                                    if (n_match >= (size_t) params_base.n_cache_reuse) {
                                         SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
 
                                         //for (size_t i = head_p; i < head_p + n_match; i++) {
                                         //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@@ -2169,8 +2267,14 @@ struct server_context {
                     continue; // continue loop of slots
                 }
 
+                llama_token id;
+
+                {
                     completion_token_output result;
-                    const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+
+                    id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+
+                    slot.i_batch = -1;
 
                     common_sampler_accept(slot.smpl, id, true);
@@ -2185,7 +2289,7 @@ struct server_context {
                     const auto * cur_p = common_sampler_get_candidates(slot.smpl);
 
-                    for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
+                    for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
                         result.probs.push_back({
                             cur_p->data[i].id,
                             i >= cur_p->size ? 0.0f : cur_p->data[i].p,
@@ -2198,9 +2302,64 @@ struct server_context {
                         slot.print_timings();
                         send_final_response(slot);
                         metrics.on_prediction(slot);
+                        continue;
                     }
+                }
 
-                slot.i_batch = -1;
+                // check if the slot supports speculative decoding
+                if (!slot.can_speculate()) {
+                    continue;
+                }
+
+                struct common_speculative_params params_spec;
+                params_spec.n_draft = slot.params.speculative.n_max;
+                params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
+                params_spec.p_min = slot.params.speculative.p_min;
+
+                llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
+
+                // ignore small drafts
+                if (slot.params.speculative.n_min > (int) draft.size()) {
+                    continue;
+                }
+
+                // construct the speculation batch
+                common_batch_clear(slot.batch_spec);
+                common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true);
+
+                for (size_t i = 0; i < draft.size(); ++i) {
+                    common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
+                }
+
+                llama_decode(ctx, slot.batch_spec);
+
+                // the accepted tokens from the speculation
+                const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
+
+                slot.n_past += ids.size();
+                slot.n_decoded += ids.size();
+
+                slot.cache_tokens.push_back(id);
+                slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
+
+                llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+
+                for (size_t i = 0; i < ids.size(); ++i) {
+                    completion_token_output result;
+
+                    result.tok = ids[i];
+
+                    if (!process_token(result, slot)) {
+                        // release slot because of stop condition
+                        slot.release();
+                        slot.print_timings();
+                        send_final_response(slot);
+                        metrics.on_prediction(slot);
+                        break;
+                    }
+                }
+
+                SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size());
             }
         }
@@ -2698,7 +2857,7 @@ int main(int argc, char ** argv) {
     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
         json data = {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
-            { "total_slots", ctx_server.params.n_parallel },
+            { "total_slots", ctx_server.params_base.n_parallel },
             { "chat_template", llama_get_chat_template(ctx_server.model) },
         };
@@ -2706,7 +2865,7 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.endpoint_props) {
+        if (!ctx_server.params_base.endpoint_props) {
            res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
            return;
        }
@@ -2719,7 +2878,7 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
+        if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -2825,7 +2984,7 @@ int main(int argc, char ** argv) {
     // TODO: maybe merge this function with "handle_completions_generic"
     const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
+        if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -3002,7 +3161,7 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.reranking || ctx_server.params.embedding) {
+        if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }


@ -190,6 +190,14 @@ extern "C" {
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array) // Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
struct ggml_backend_feature {
const char * name;
const char * value;
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
// //
// Backend registry // Backend registry
@ -214,6 +222,13 @@ extern "C" {
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void); GGML_API ggml_backend_t ggml_backend_init_best(void);
// Load a backend from a dynamic library and register it
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
// Unload a backend if loaded dynamically and unregister it
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
// //
// Backend scheduler // Backend scheduler
// //
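(Editor's sketch of the intended usage, assembled only from functions declared in this header: call ggml_backend_load_all() once at startup, then enumerate whatever got registered.)
#include <cstdio>
#include "ggml-backend.h"
int main() {
    ggml_backend_load_all(); // probes ggml-<name> shared libraries, see the registry changes below
    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        printf("backend: %s (%zu devices)\n",
               ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
    }
    return 0;
}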
@ -7,29 +7,6 @@
extern "C" { extern "C" {
#endif #endif
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute() // the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287 // since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan { struct ggml_cplan {
@ -75,12 +52,9 @@ extern "C" {
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
@ -104,10 +78,10 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_sse3 (void); GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void); GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void); GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void); GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void); GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void); GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void); GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void); GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void); GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
@ -2221,6 +2221,37 @@ extern "C" {
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
// ggml threadpool
// TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
// the goal should be to create an API that other backends can use, then move everything to the ggml base
// scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
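(Editor's sketch: the moved declarations keep their semantics; populating them looks like this, with illustrative values. Threadpool creation itself stays in the CPU backend for now, per the TODO above.)
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8); // 8 threads
tpp.prio       = GGML_SCHED_PRIO_HIGH; // raise scheduling priority
tpp.poll       = 0;                    // no polling, sleep when idle
tpp.strict_cpu = true;                 // enforce the cpumask placement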
@ -409,6 +409,7 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
ggml_backend_reg_t ggml_backend_amx_reg(void) { ggml_backend_reg_t ggml_backend_amx_reg(void) {
static struct ggml_backend_reg ggml_backend_amx_reg = { static struct ggml_backend_reg ggml_backend_amx_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_amx_reg_i, /* .iface = */ ggml_backend_amx_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -444,3 +445,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
} }
#endif #endif
GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
@ -8,6 +8,8 @@
extern "C" { extern "C" {
#endif #endif
#define GGML_BACKEND_API_VERSION 1
// //
// Backend buffer type // Backend buffer type
// //
@ -63,20 +65,20 @@ extern "C" {
enum ggml_backend_buffer_usage usage; enum ggml_backend_buffer_usage usage;
}; };
ggml_backend_buffer_t ggml_backend_buffer_init( GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft, ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface, struct ggml_backend_buffer_i iface,
void * context, void * context,
size_t size); size_t size);
// do not use directly, use ggml_backend_tensor_copy instead // do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// multi-buffer // multi-buffer
// buffer that contains a collection of buffers // buffer that contains a collection of buffers
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
// //
// Backend (stream) // Backend (stream)
@ -199,17 +201,37 @@ extern "C" {
}; };
struct ggml_backend_reg { struct ggml_backend_reg {
// int api_version; // TODO: for dynamic loading int api_version; // initialize to GGML_BACKEND_API_VERSION
struct ggml_backend_reg_i iface; struct ggml_backend_reg_i iface;
void * context; void * context;
}; };
// Internal backend registry API // Internal backend registry API
void ggml_backend_register(ggml_backend_reg_t reg); GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
void ggml_backend_device_register(ggml_backend_dev_t device); GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
// typedef ggml_backend_register_t * (*ggml_backend_init)(void); // Add backend dynamic loading support to the backend
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
#ifdef GGML_BACKEND_DL
#ifdef __cplusplus
# define GGML_BACKEND_DL_IMPL(reg_fn) \
extern "C" { \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
} \
ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \
}
#else
# define GGML_BACKEND_DL_IMPL(reg_fn) \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \
}
#endif
#else
# define GGML_BACKEND_DL_IMPL(reg_fn)
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }
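(Editor's sketch of a hypothetical out-of-tree backend built against ggml-backend-impl.h: the registration function fills in the new api_version field and GGML_BACKEND_DL_IMPL exports it as the well-known ggml_backend_init symbol. All "my_" names are invented for illustration; a real backend would register devices.)
static const char * my_reg_get_name(ggml_backend_reg_t reg) { GGML_UNUSED(reg); return "MYBACKEND"; }
static size_t my_reg_dev_count(ggml_backend_reg_t reg) { GGML_UNUSED(reg); return 0; }
static ggml_backend_dev_t my_reg_dev_get(ggml_backend_reg_t reg, size_t index) { GGML_UNUSED(reg); GGML_UNUSED(index); return NULL; }
static const struct ggml_backend_reg_i my_reg_i = {
    /* .get_name         = */ my_reg_get_name,
    /* .device_count     = */ my_reg_dev_count,
    /* .device_get       = */ my_reg_dev_get,
    /* .get_proc_address = */ NULL,
};
ggml_backend_reg_t ggml_backend_my_reg(void) {
    static struct ggml_backend_reg reg = {
        /* .api_version = */ GGML_BACKEND_API_VERSION,
        /* .iface       = */ my_reg_i,
        /* .context     = */ NULL,
    };
    return &reg;
}
GGML_BACKEND_DL_IMPL(ggml_backend_my_reg)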
@ -1,11 +1,29 @@
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include <algorithm>
#include <cstring> #include <cstring>
#include <string>
#include <vector> #include <vector>
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#elif defined(__APPLE__)
# include <mach-o/dyld.h>
# include <dlfcn.h>
#else
# include <dlfcn.h>
# include <unistd.h>
#endif
// Backend registry // Backend registry
#ifdef GGML_USE_CPU
#include "ggml-cpu.h"
#endif
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
#include "ggml-cuda.h" #include "ggml-cuda.h"
@ -43,8 +61,13 @@
#include "ggml-kompute.h" #include "ggml-kompute.h"
#endif #endif
struct ggml_backend_reg_entry {
ggml_backend_reg_t reg;
void * handle;
};
struct ggml_backend_registry { struct ggml_backend_registry {
std::vector<ggml_backend_reg_t> backends; std::vector<ggml_backend_reg_entry> backends;
std::vector<ggml_backend_dev_t> devices; std::vector<ggml_backend_dev_t> devices;
ggml_backend_registry() { ggml_backend_registry() {
@ -75,11 +98,19 @@ struct ggml_backend_registry {
#ifdef GGML_USE_KOMPUTE #ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg()); register_backend(ggml_backend_kompute_reg());
#endif #endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg()); register_backend(ggml_backend_cpu_reg());
#endif
} }
void register_backend(ggml_backend_reg_t reg) { ~ggml_backend_registry() {
while (!backends.empty()) {
// use silent since the log system may have been destroyed at this point
unload_backend(backends.back().reg, true);
}
}
void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
if (!reg) { if (!reg) {
return; return;
} }
@ -88,7 +119,7 @@ struct ggml_backend_registry {
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif #endif
backends.push_back(reg); backends.push_back({ reg, handle });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i)); register_device(ggml_backend_reg_dev_get(reg, i));
} }
@ -100,6 +131,111 @@ struct ggml_backend_registry {
#endif #endif
devices.push_back(device); devices.push_back(device);
} }
ggml_backend_reg_t load_backend(const char * path, bool silent) {
#ifdef _WIN32
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryA(path);
if (!handle) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
}
SetErrorMode(old_mode);
return nullptr;
}
ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
SetErrorMode(old_mode);
if (!backend_init) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
}
FreeLibrary(handle);
return nullptr;
}
#else
void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
if (!handle) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
}
return nullptr;
}
auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
if (!backend_init) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
}
dlclose(handle);
return nullptr;
}
#endif
ggml_backend_reg_t reg = backend_init();
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) {
if (!reg) {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
} else {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
}
}
#ifdef _WIN32
FreeLibrary(handle);
#else
dlclose(handle);
#endif
return nullptr;
}
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
register_backend(reg, handle);
return reg;
}
void unload_backend(ggml_backend_reg_t reg, bool silent) {
auto it = std::find_if(backends.begin(), backends.end(),
[reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
if (it == backends.end()) {
if (!silent) {
GGML_LOG_ERROR("%s: backend not found\n", __func__);
}
return;
}
if (!silent) {
GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
}
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
devices.end());
// unload library
if (it->handle) {
#ifdef _WIN32
FreeLibrary((HMODULE) it->handle);
#else
dlclose(it->handle);
#endif
}
// remove backend
backends.erase(it);
}
}; };
static ggml_backend_registry & get_reg() { static ggml_backend_registry & get_reg() {
@ -117,23 +253,32 @@ void ggml_backend_device_register(ggml_backend_dev_t device) {
} }
// Backend (reg) enumeration // Backend (reg) enumeration
static bool striequals(const char * a, const char * b) {
for (; *a && *b; a++, b++) {
if (std::tolower(*a) != std::tolower(*b)) {
return false;
}
}
return *a == *b;
}
size_t ggml_backend_reg_count() { size_t ggml_backend_reg_count() {
return get_reg().backends.size(); return get_reg().backends.size();
} }
ggml_backend_reg_t ggml_backend_reg_get(size_t index) { ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
GGML_ASSERT(index < ggml_backend_reg_count()); GGML_ASSERT(index < ggml_backend_reg_count());
return get_reg().backends[index]; return get_reg().backends[index].reg;
} }
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_reg_count(); i++) { for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
ggml_backend_reg_t reg = ggml_backend_reg_get(i); ggml_backend_reg_t reg = ggml_backend_reg_get(i);
if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) { if (striequals(ggml_backend_reg_name(reg), name)) {
return reg; return reg;
} }
} }
return NULL; return nullptr;
} }
// Device enumeration // Device enumeration
@ -149,11 +294,11 @@ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) { for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i); ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (strcmp(ggml_backend_dev_name(dev), name) == 0) { if (striequals(ggml_backend_dev_name(dev), name)) {
return dev; return dev;
} }
} }
return NULL; return nullptr;
} }
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
@ -163,14 +308,14 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
return dev; return dev;
} }
} }
return NULL; return nullptr;
} }
// Convenience functions // Convenience functions
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
if (!dev) { if (!dev) {
return NULL; return nullptr;
} }
return ggml_backend_dev_init(dev, params); return ggml_backend_dev_init(dev, params);
} }
@ -178,7 +323,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params)
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
if (!dev) { if (!dev) {
return NULL; return nullptr;
} }
return ggml_backend_dev_init(dev, params); return ggml_backend_dev_init(dev, params);
} }
@ -189,7 +334,97 @@ ggml_backend_t ggml_backend_init_best(void) {
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
} }
if (!dev) { if (!dev) {
return NULL; return nullptr;
} }
return ggml_backend_dev_init(dev, NULL); return ggml_backend_dev_init(dev, nullptr);
}
// Dynamic loading
ggml_backend_reg_t ggml_backend_load(const char * path) {
return get_reg().load_backend(path, false);
}
void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true);
}
void ggml_backend_load_all() {
std::vector<std::string> search_prefix;
// add the executable directory to the search path
// FIXME: this is convenient for development, but it should probably be disabled in production
#if defined(__APPLE__)
// get executable path
std::vector<char> path;
uint32_t size;
while (true) {
size = path.size();
if (_NSGetExecutablePath(path.data(), &size) == 0) {
break;
}
path.resize(size);
}
std::string base_path(path.data(), size);
// remove executable name
auto last_slash = base_path.find_last_of('/');
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
search_prefix.push_back(base_path + "/");
#elif defined(__linux__)
std::string base_path = ".";
std::vector<char> path(1024);
while (true) {
// get executable path
ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
if (len == -1) {
break;
}
if (len < (ssize_t) path.size()) {
base_path = std::string(path.data(), len);
// remove executable name
auto last_slash = base_path.find_last_of('/');
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
break;
}
path.resize(path.size() * 2);
}
search_prefix.push_back(base_path + "/");
#endif
auto & reg = get_reg();
auto try_load = [&](const std::string & name) {
std::string os_name;
#ifdef _WIN32
os_name = "ggml-" + name + ".dll";
#else
os_name = "libggml-" + name + ".so";
#endif
if (reg.load_backend(os_name.c_str(), true)) {
return;
}
for (const auto & prefix : search_prefix) {
if (reg.load_backend((prefix + os_name).c_str(), true)) {
return;
}
}
};
try_load("amx");
try_load("blas");
try_load("cann");
try_load("cuda");
try_load("hip");
try_load("kompute");
try_load("metal");
try_load("rpc");
try_load("sycl");
try_load("vulkan");
try_load("musa");
try_load("cpu");
} }
@ -506,9 +506,12 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
ggml_backend_reg_t ggml_backend_blas_reg(void) { ggml_backend_reg_t ggml_backend_blas_reg(void) {
static struct ggml_backend_reg ggml_backend_blas_reg = { static struct ggml_backend_reg ggml_backend_blas_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_blas_reg_i, /* .iface = */ ggml_backend_blas_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
return &ggml_backend_blas_reg; return &ggml_backend_blas_reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
@ -2064,7 +2064,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
dev_ctx->name = GGML_CANN_NAME + std::to_string(i); dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
ggml_cann_set_device(i); ggml_cann_set_device(i);
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_cann_device_interface, /* .iface = */ ggml_backend_cann_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
/* .context = */ dev_ctx /* .context = */ dev_ctx
}; };
@ -2072,7 +2072,8 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
} }
reg = ggml_backend_reg { reg = ggml_backend_reg {
/* .interface = */ ggml_backend_cann_reg_interface, /* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cann_reg_interface,
/* .context = */ ctx /* .context = */ ctx
}; };
} }
@ -2126,3 +2127,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
ggml_cann_set_device(device); ggml_cann_set_device(device);
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
} }
GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)
@ -13619,29 +13619,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
#endif // GGML_USE_OPENMP #endif // GGML_USE_OPENMP
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
p->n_threads = n_threads;
p->prio = 0; // default priority (usually means normal or inherited)
p->poll = 50; // hybrid-polling enabled
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
p->paused = false; // threads are ready to go
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
}
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
struct ggml_threadpool_params p;
ggml_threadpool_params_init(&p, n_threads);
return p;
}
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}
static struct ggml_threadpool * ggml_threadpool_new_impl( static struct ggml_threadpool * ggml_threadpool_new_impl(
struct ggml_threadpool_params * tpp, struct ggml_threadpool_params * tpp,
struct ggml_cgraph * cgraph, struct ggml_cgraph * cgraph,
@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg
return &ggml_backend_cpu_device; return &ggml_backend_cpu_device;
} }
struct ggml_backend_feature {
const char * name;
const char * value;
};
// Not used yet
// This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically, // This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically,
// and additionally to allow other backends to expose their own list of features that applications can query using the same API. // and additionally to allow other backends to expose their own list of features that applications can query using the same API
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) { static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
static std::vector<ggml_backend_feature> features = []() { static std::vector<ggml_backend_feature> features = []() {
ggml_cpu_init();
std::vector<ggml_backend_feature> features; std::vector<ggml_backend_feature> features;
if (ggml_cpu_has_sse3()) { if (ggml_cpu_has_sse3()) {
features.push_back({ "SSE3", "1" }); features.push_back({ "SSE3", "1" });
@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_avx()) { if (ggml_cpu_has_avx()) {
features.push_back({ "AVX", "1" }); features.push_back({ "AVX", "1" });
} }
if (ggml_cpu_has_avx_vnni()) {
features.push_back({ "AVX_VNNI", "1" });
}
if (ggml_cpu_has_avx2()) { if (ggml_cpu_has_avx2()) {
features.push_back({ "AVX2", "1" }); features.push_back({ "AVX2", "1" });
} }
@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_fma()) { if (ggml_cpu_has_fma()) {
features.push_back({ "FMA", "1" }); features.push_back({ "FMA", "1" });
} }
if (ggml_cpu_has_avx_vnni()) {
features.push_back({ "AVX_VNNI", "1" });
}
if (ggml_cpu_has_avx512()) { if (ggml_cpu_has_avx512()) {
features.push_back({ "AVX512", "1" }); features.push_back({ "AVX512", "1" });
} }
@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_llamafile()) { if (ggml_cpu_has_llamafile()) {
features.push_back({ "LLAMAFILE", "1" }); features.push_back({ "LLAMAFILE", "1" });
} }
// TODO: rename this
#ifdef GGML_USE_CPU_AARCH64
features.push_back({ "AARCH64_REPACK", "1" });
#endif
features.push_back({ nullptr, nullptr }); features.push_back({ nullptr, nullptr });
@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts; return (void *)ggml_backend_cpu_get_extra_bufts;
} }
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cpu_get_features;
}
if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
return (void *)ggml_backend_cpu_set_abort_callback;
}
if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
return (void *)ggml_numa_init;
}
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
return (void *)ggml_is_numa;
}
// threadpool - TODO: move to ggml-base
if (strcmp(name, "ggml_threadpool_new") == 0) {
return (void *)ggml_threadpool_new;
}
if (strcmp(name, "ggml_threadpool_free") == 0) {
return (void *)ggml_threadpool_free;
}
if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
return (void *)ggml_backend_cpu_set_threadpool;
}
return NULL; return NULL;
@ -655,9 +678,12 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) {
ggml_cpu_init(); ggml_cpu_init();
static struct ggml_backend_reg ggml_backend_cpu_reg = { static struct ggml_backend_reg ggml_backend_cpu_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cpu_reg_i, /* .iface = */ ggml_backend_cpu_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
return &ggml_backend_cpu_reg; return &ggml_backend_cpu_reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
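(Editor's sketch: callers that previously used the direct CPU-backend symbols now resolve them through the registry, mirroring the llama.cpp changes further down in this diff.)
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
auto * is_numa_fn = (decltype(ggml_is_numa) *)
    ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_cpu_is_numa");
bool numa = is_numa_fn ? is_numa_fn() : false; // tolerate backends without this entry point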
@ -3131,6 +3131,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re
return ctx->devices[index]; return ctx->devices[index];
} }
static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
static std::vector<ggml_backend_feature> features = []() {
std::vector<ggml_backend_feature> features;
#define _STRINGIFY(...) #__VA_ARGS__
#define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
#ifdef __CUDA_ARCH_LIST__
features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
#endif
#ifdef GGML_CUDA_FORCE_MMQ
features.push_back({ "FORCE_MMQ", "1" });
#endif
#ifdef GGML_CUDA_FORCE_CUBLAS
features.push_back({ "FORCE_CUBLAS", "1" });
#endif
#ifdef GGML_CUDA_NO_VMM
features.push_back({ "NO_VMM", "1" });
#endif
#ifdef GGML_CUDA_NO_PEER_COPY
features.push_back({ "NO_PEER_COPY", "1" });
#endif
#ifdef GGML_CUDA_F16
features.push_back({ "F16", "1" });
#endif
#ifdef GGML_CUDA_USE_GRAPHS
features.push_back({ "USE_GRAPHS", "1" });
#endif
#ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
#endif
#ifdef GGML_CUDA_FA_ALL_QUANTS
features.push_back({ "FA_ALL_QUANTS", "1" });
#endif
#undef _STRINGIFY
#undef STRINGIFY
features.push_back({ nullptr, nullptr });
return features;
}();
return features.data();
GGML_UNUSED(reg);
}
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg); GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
@ -3142,6 +3197,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
return (void *)ggml_backend_cuda_unregister_host_buffer; return (void *)ggml_backend_cuda_unregister_host_buffer;
} }
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cuda_get_features;
}
return nullptr; return nullptr;
} }
@ -3174,7 +3232,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->description = prop.name; dev_ctx->description = prop.name;
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_cuda_device_interface, /* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
/* .context = */ dev_ctx /* .context = */ dev_ctx
}; };
@ -3182,7 +3240,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
} }
reg = ggml_backend_reg { reg = ggml_backend_reg {
/* .interface = */ ggml_backend_cuda_reg_interface, /* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cuda_reg_interface,
/* .context = */ ctx /* .context = */ ctx
}; };
} }
@ -3214,3 +3273,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
return cuda_backend; return cuda_backend;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
@ -1927,7 +1927,7 @@ static void ggml_metal_encode_node(
// find the break-even point where the matrix-matrix kernel becomes more efficient compared // find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel // to the matrix-vector kernel
int ne11_mm_min = 1; int ne11_mm_min = 4;
#if 0 #if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models // the numbers below are measured on M2 Ultra for 7B and 13B models
@ -4372,17 +4372,43 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
GGML_UNUSED(index); GGML_UNUSED(index);
} }
static struct ggml_backend_feature g_ggml_backend_metal_features[] = {
#if defined(GGML_METAL_EMBED_LIBRARY)
{ "EMBED_LIBRARY", "1" },
#endif
#if defined(GGML_METAL_USE_BF16)
{ "BF16", "1" },
#endif
{ nil, nil },
};
static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) {
return g_ggml_backend_metal_features;
GGML_UNUSED(reg);
}
static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_metal_get_features;
}
return NULL;
GGML_UNUSED(reg);
}
static struct ggml_backend_reg_i ggml_backend_metal_reg_i = { static struct ggml_backend_reg_i ggml_backend_metal_reg_i = {
/* .get_name = */ ggml_backend_metal_reg_get_name, /* .get_name = */ ggml_backend_metal_reg_get_name,
/* .device_count = */ ggml_backend_metal_reg_device_count, /* .device_count = */ ggml_backend_metal_reg_device_count,
/* .device_get = */ ggml_backend_metal_reg_device_get, /* .device_get = */ ggml_backend_metal_reg_device_get,
/* .get_proc_address = */ NULL, /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
}; };
ggml_backend_reg_t ggml_backend_metal_reg(void) { ggml_backend_reg_t ggml_backend_metal_reg(void) {
// TODO: make this thread-safe somehow? // TODO: make this thread-safe somehow?
{ {
g_ggml_backend_metal_reg = (struct ggml_backend_reg) { g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_metal_reg_i, /* .iface = */ ggml_backend_metal_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -4396,3 +4422,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
return &g_ggml_backend_metal_reg; return &g_ggml_backend_metal_reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
@ -5447,12 +5447,12 @@ kernel void kernel_mul_mm(
const int im = tgpig.z; const int im = tgpig.z;
// if this block is of 64x32 shape or smaller // if this block is of 64x32 shape or smaller
short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
// a thread shouldn't load data outside of the matrix // a thread shouldn't load data outside of the matrix
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
simdgroup_T8x8 ma[4]; simdgroup_T8x8 ma[4];
simdgroup_float8x8 mb[2]; simdgroup_float8x8 mb[2];
@ -5467,20 +5467,23 @@ kernel void kernel_mul_mm(
const int i12 = im%args.ne12; const int i12 = im%args.ne12;
const int i13 = im/args.ne12; const int i13 = im/args.ne12;
uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
short offset1 = il/nl; const short offset1 = il/nl;
device const block_q * x = (device const block_q *)(src0
+ args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*args.nb01 + offset0) + offset1;
device const float * y = (device const float *)(src1 device const float * y = (device const float *)(src1
+ args.nb13*i13 + args.nb13*i13
+ args.nb12*i12 + args.nb12*i12
+ args.nb11*(r1 * BLOCK_SIZE_N + thread_col) + args.nb11*(r1*BLOCK_SIZE_N + thread_col)
+ args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
// load data and store to threadgroup memory // load data and store to threadgroup memory
T4x4 temp_a; T4x4 temp_a;
dequantize_func(x, il, temp_a); dequantize_func(x, il, temp_a);
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16) #pragma unroll(16)
@ -5490,44 +5493,46 @@ kernel void kernel_mul_mm(
+ (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
} }
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL)*8*32 + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y); *(threadgroup float2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);
il = (il + 2 < nl) ? il + 2 : il % 2; il = (il + 2 < nl) ? il + 2 : il % 2;
x = (il < 2) ? x + (2+nl-1)/nl : x; x = (il < 2) ? x + (2 + nl - 1)/nl : x;
y += BLOCK_SIZE_K; y += BLOCK_SIZE_K;
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
// load matrices from threadgroup memory and conduct outer products // load matrices from threadgroup memory and conduct outer products
threadgroup T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
threadgroup float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); threadgroup const float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
#pragma unroll(4) #pragma unroll(4)
for (short ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
#pragma unroll(4) #pragma unroll(4)
for (short i = 0; i < 4; i++) { for (short i = 0; i < 4; i++) {
simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
} }
simdgroup_barrier(mem_flags::mem_none); simdgroup_barrier(mem_flags::mem_none);
#pragma unroll(2) #pragma unroll(2)
for (short i = 0; i < 2; i++) { for (short i = 0; i < 2; i++) {
simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
} }
lsma += BLOCK_SIZE_M/SG_MAT_ROW * SG_MAT_SIZE;
lsmb += BLOCK_SIZE_N/SG_MAT_ROW * SG_MAT_SIZE;
#pragma unroll(8) #pragma unroll(8)
for (short i = 0; i < 8; i++){ for (short i = 0; i < 8; i++){
simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
} }
lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
} }
} }
if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) { if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) {
device float * C = (device float *) dst + device float * C = (device float *) dst +
(BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) + \ (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \
(BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
for (short i = 0; i < 8; i++) { for (short i = 0; i < 8; i++) {
simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0); simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
@ -5536,7 +5541,7 @@ kernel void kernel_mul_mm(
// block is smaller than 64x32, we should avoid writing data outside of the matrix // block is smaller than 64x32, we should avoid writing data outside of the matrix
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float * temp_str = ((threadgroup float *) shmem) \ threadgroup float * temp_str = ((threadgroup float *) shmem) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M; + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
for (short i = 0; i < 8; i++) { for (short i = 0; i < 8; i++) {
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
} }
@ -1369,6 +1369,7 @@ static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
ggml_backend_reg_t ggml_backend_rpc_reg(void) { ggml_backend_reg_t ggml_backend_rpc_reg(void) {
static struct ggml_backend_reg ggml_backend_rpc_reg = { static struct ggml_backend_reg ggml_backend_rpc_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_rpc_reg_i, /* .iface = */ ggml_backend_rpc_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -1401,3 +1402,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
return dev; return dev;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
@ -4637,7 +4637,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
dev_ctx->description = prop.get_name(); dev_ctx->description = prop.get_name();
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_sycl_device_interface, /* .iface = */ ggml_backend_sycl_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
/* .context = */ dev_ctx /* .context = */ dev_ctx
}; };
@ -4645,7 +4645,8 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
} }
reg = ggml_backend_reg { reg = ggml_backend_reg {
/* .interface = */ ggml_backend_sycl_reg_interface, /* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_sycl_reg_interface,
/* .context = */ ctx /* .context = */ ctx
}; };
} }
@ -4678,3 +4679,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
return sycl_backend; return sycl_backend;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
@ -6738,6 +6738,7 @@ static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
ggml_backend_reg_t ggml_backend_vk_reg() { ggml_backend_reg_t ggml_backend_vk_reg() {
static ggml_backend_reg reg = { static ggml_backend_reg reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_vk_reg_i, /* .iface = */ ggml_backend_vk_reg_i,
/* .context = */ nullptr, /* .context = */ nullptr,
}; };
@ -7365,3 +7366,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
} }
#endif #endif
GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)
@ -475,9 +475,15 @@ void write_output_files() {
int main(int argc, char** argv) { int main(int argc, char** argv) {
std::map<std::string, std::string> args; std::map<std::string, std::string> args;
for (int i = 1; i < argc; i += 2) { for (int i = 1; i < argc; ++i) {
if (i + 1 < argc) { std::string arg = argv[i];
args[argv[i]] = argv[i + 1]; if (arg.rfind("--", 0) == 0) {
if (i + 1 < argc && argv[i + 1][0] != '-') {
args[arg] = argv[i + 1];
++i;
} else {
args[arg] = "";
}
} }
} }
@ -7616,3 +7616,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default; g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
g_logger_state.log_callback_user_data = user_data; g_logger_state.log_callback_user_data = user_data;
} }
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
p->n_threads = n_threads;
p->prio = 0; // default priority (usually means normal or inherited)
p->poll = 50; // hybrid-polling enabled
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
p->paused = false; // threads are ready to go
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
}
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
struct ggml_threadpool_params p;
ggml_threadpool_params_init(&p, n_threads);
return p;
}
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}
@ -243,7 +243,7 @@ class MODEL_ARCH(IntEnum):
COMMAND_R = auto() COMMAND_R = auto()
DBRX = auto() DBRX = auto()
OLMO = auto() OLMO = auto()
OLMO_1124 = auto() OLMO2 = auto()
OLMOE = auto() OLMOE = auto()
OPENELM = auto() OPENELM = auto()
ARCTIC = auto() ARCTIC = auto()
@ -405,7 +405,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo", MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMO_1124: "olmo_1124", MODEL_ARCH.OLMO2: "olmo2",
MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.ARCTIC: "arctic",
@ -1071,7 +1071,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.OLMO_1124: [ MODEL_ARCH.OLMO2: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT,
@ -13,7 +13,7 @@ class TensorNameMap:
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", # falcon "transformer.word_embeddings", # falcon
"word_embeddings", # bloom "word_embeddings", # bloom
"model.embed_tokens", # llama-hf nemotron olmoe olmo_1124 "model.embed_tokens", # llama-hf nemotron olmoe olmo2
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert "embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
@ -54,7 +54,7 @@ class TensorNameMap:
# Output # Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124 "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
"output", # llama-pth bloom internlm2 "output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
@ -66,7 +66,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT_NORM: ( MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox "gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone "transformer.ln_f", # gpt2 gpt-j falcon jais exaone
"model.norm", # llama-hf baichuan internlm2 olmoe olmo_1124 "model.norm", # llama-hf baichuan internlm2 olmoe olmo2
"norm", # llama-pth "norm", # llama-pth
"transformer.norm_f", # mpt dbrx "transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2 "ln_f", # refact bloom qwen gpt2
@ -145,7 +145,7 @@ class TensorNameMap:
# Attention query # Attention query
MODEL_TENSOR.ATTN_Q: ( MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wq", # llama-pth "layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert "encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j "transformer.h.{bid}.attn.q_proj", # gpt-j
@ -157,7 +157,7 @@ class TensorNameMap:
# Attention key # Attention key
MODEL_TENSOR.ATTN_K: ( MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wk", # llama-pth "layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert "encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j "transformer.h.{bid}.attn.k_proj", # gpt-j
@ -170,7 +170,7 @@ class TensorNameMap:
# Attention value # Attention value
MODEL_TENSOR.ATTN_V: ( MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wv", # llama-pth "layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert "encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j "transformer.h.{bid}.attn.v_proj", # gpt-j
@ -188,7 +188,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon "transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom "h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wo", # llama-pth "layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert "encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j "transformer.h.{bid}.attn.out_proj", # gpt-j
@ -215,7 +215,7 @@ class TensorNameMap:
), ),
MODEL_TENSOR.ATTN_POST_NORM: ( MODEL_TENSOR.ATTN_POST_NORM: (
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo_1124 "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
), ),
# Rotary embeddings # Rotary embeddings
@ -250,7 +250,7 @@ class TensorNameMap:
# Post feed-forward norm # Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: ( MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo_1124 "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
), ),
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
@ -273,7 +273,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom "h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo_1124 "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
"layers.{bid}.feed_forward.w3", # llama-pth "layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert "encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", # gpt-j "transformer.h.{bid}.mlp.fc_in", # gpt-j
@ -314,7 +314,7 @@ class TensorNameMap:
# Feed-forward gate # Feed-forward gate
MODEL_TENSOR.FFN_GATE: ( MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo_1124 "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
"layers.{bid}.feed_forward.w1", # llama-pth "layers.{bid}.feed_forward.w1", # llama-pth
"transformer.h.{bid}.mlp.w2", # qwen "transformer.h.{bid}.mlp.w2", # qwen
"transformer.h.{bid}.mlp.c_fc2", # jais "transformer.h.{bid}.mlp.c_fc2", # jais
@ -346,7 +346,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom "h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo_1124 "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
"layers.{bid}.feed_forward.w2", # llama-pth "layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert "encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j "transformer.h.{bid}.mlp.fc_out", # gpt-j
@ -383,7 +383,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon "model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo_1124 "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.q_ln", # sea-lion "transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm "transformer.layers.{bid}.attn.q_norm", # openelm
@ -392,7 +392,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_K_NORM: ( MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm", "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon "model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo_1124 "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.k_ln", # sea-lion "transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm "transformer.layers.{bid}.attn.k_norm", # openelm
include/llama-cpp.h (new file)
@ -0,0 +1,25 @@
#pragma once
#ifndef __cplusplus
#error "This header is for C++ only"
#endif
#include <memory>
#include "llama.h"
struct llama_model_deleter {
void operator()(llama_model * model) { llama_free_model(model); }
};
struct llama_context_deleter {
void operator()(llama_context * context) { llama_free(context); }
};
struct llama_sampler_deleter {
void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
};
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
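(Editor's sketch of the intended usage of these RAII wrappers; the model path is illustrative. Destruction runs in reverse declaration order, so the context is released before the model.)
#include "llama-cpp.h"
int main() {
    llama_model_ptr model(llama_load_model_from_file("model.gguf", llama_model_default_params()));
    if (!model) {
        return 1;
    }
    llama_context_ptr ctx(llama_new_context_with_model(model.get(), llama_context_default_params()));
    // both are freed automatically when main returns
    return ctx ? 0 : 1;
}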
@ -272,6 +272,9 @@ extern "C" {
}; };
struct llama_model_params { struct llama_model_params {
// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
ggml_backend_dev_t * devices;
int32_t n_gpu_layers; // number of layers to store in VRAM int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs enum llama_split_mode split_mode; // how to split the model across multiple GPUs
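(Editor's sketch of the new field: restrict offloading to one named device. "CUDA0" is illustrative and must match a device registered with the backend registry; ggml_backend_dev_by_name returns NULL if it is not.)
llama_model_params mparams = llama_model_default_params();
ggml_backend_dev_t devs[] = { ggml_backend_dev_by_name("CUDA0"), NULL }; // NULL-terminated
mparams.devices      = devs;
mparams.n_gpu_layers = 99; // offload everything that fits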
@ -193,7 +193,7 @@ enum llm_arch {
LLM_ARCH_COMMAND_R, LLM_ARCH_COMMAND_R,
LLM_ARCH_DBRX, LLM_ARCH_DBRX,
LLM_ARCH_OLMO, LLM_ARCH_OLMO,
LLM_ARCH_OLMO_1124, LLM_ARCH_OLMO2,
LLM_ARCH_OLMOE, LLM_ARCH_OLMOE,
LLM_ARCH_OPENELM, LLM_ARCH_OPENELM,
LLM_ARCH_ARCTIC, LLM_ARCH_ARCTIC,
@ -247,7 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" }, { LLM_ARCH_OLMO, "olmo" },
{ LLM_ARCH_OLMO_1124, "olmo_1124" }, { LLM_ARCH_OLMO2, "olmo2" },
{ LLM_ARCH_OLMOE, "olmoe" }, { LLM_ARCH_OLMOE, "olmoe" },
{ LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_OPENELM, "openelm" },
{ LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_ARCTIC, "arctic" },
@ -1224,7 +1224,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
}, },
}, },
{ {
LLM_ARCH_OLMO_1124, LLM_ARCH_OLMO2,
{ {
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
@ -4888,7 +4888,9 @@ struct llama_model_loader {
mappings.reserve(files.size()); mappings.reserve(files.size());
mmaps_used.reserve(files.size()); mmaps_used.reserve(files.size());
for (const auto & file : files) { for (const auto & file : files) {
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa())); auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
mmaps_used.emplace_back(mapping->size, 0); mmaps_used.emplace_back(mapping->size, 0);
if (mlock_mmaps) { if (mlock_mmaps) {
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock()); std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@ -5930,7 +5932,7 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_OLMO_1124: case LLM_ARCH_OLMO2:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -8722,7 +8724,7 @@ static bool llm_load_tensors(
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 } break;
-            case LLM_ARCH_OLMO_1124:
+            case LLM_ARCH_OLMO2:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -9322,7 +9324,7 @@ static bool llm_load_tensors(
         ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
         if (!dev) {
             // FIXME: workaround for CPU backend buft having a NULL device
-            dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
+            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
         }
         ggml_backend_dev_props props;
         ggml_backend_dev_get_props(dev, &props);
@@ -14645,7 +14647,7 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_olmo_1124() {
+    struct ggml_cgraph * build_olmo2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16961,9 +16963,9 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
             } break;
-        case LLM_ARCH_OLMO_1124:
+        case LLM_ARCH_OLMO2:
             {
-                result = llm.build_olmo_1124();
+                result = llm.build_olmo2();
             } break;
         case LLM_ARCH_OLMOE:
             {
@@ -17607,8 +17609,9 @@ static enum ggml_status llama_graph_compute(
         int n_threads,
         ggml_threadpool * threadpool) {
     if (lctx.backend_cpu != nullptr) {
-        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
-        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
+        auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
+        set_threadpool_fn(lctx.backend_cpu, threadpool);
     }
 
     // set the number of threads for all the backends
@@ -19525,6 +19528,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
 //
 
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
+        /*.devices =*/ nullptr,
         /*.n_gpu_layers =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
@@ -19646,7 +19650,11 @@ void llama_backend_init(void) {
 
 void llama_numa_init(enum ggml_numa_strategy numa) {
     if (numa != GGML_NUMA_STRATEGY_DISABLED) {
-        ggml_numa_init(numa);
+        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        GGML_ASSERT(dev && "CPU backend is not loaded");
+        auto * reg = ggml_backend_dev_backend_reg(dev);
+        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
+        numa_init_fn(numa);
     }
 }
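Call order matters here: the lookup asserts that a CPU backend device exists, so NUMA setup has to happen after the backends are registered. A short usage sketch:

    // Sketch: typical startup order in an application.
    llama_backend_init();
    llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE); // resolved to ggml_backend_cpu_numa_init at runtime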
@@ -19737,8 +19745,12 @@ struct llama_model * llama_load_model_from_file(
     }
 
     // create list of devices to use with this model
-    // currently, we use all available devices
-    // TODO: rework API to give user more control over device selection
-    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        switch (ggml_backend_dev_type(dev)) {
+    if (params.devices) {
+        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+            model->devices.push_back(*dev);
+        }
+    } else {
+        // use all available devices
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            switch (ggml_backend_dev_type(dev)) {
@@ -19752,6 +19764,7 @@ struct llama_model * llama_load_model_from_file(
                     break;
             }
         }
+    }
 
     // if using single GPU mode, remove all except the main GPU
     if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
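The hunk above closes the new else-branch; the single-GPU trim that follows applies to whichever list was built, explicit or enumerated. A config sketch for that mode (both fields appear in this commit's default-params hunk):

    // Sketch: keep the whole model on one GPU instead of splitting by layer.
    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
    mparams.main_gpu = 0; // index of the GPU to keep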
@@ -19920,9 +19933,6 @@ struct llama_context * llama_new_context_with_model(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
-    ctx->abort_callback = params.abort_callback;
-    ctx->abort_callback_data = params.abort_callback_data;
-
     ctx->logits_all = params.logits_all;
 
     // build worst-case graph for encoder if a model contains encoder
@@ -19971,7 +19981,7 @@ struct llama_context * llama_new_context_with_model(
     }
 
     // add CPU backend
-    ctx->backend_cpu = ggml_backend_cpu_init();
+    ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
     if (ctx->backend_cpu == nullptr) {
         LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
         llama_free(ctx);
@@ -19991,6 +20001,8 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
+
     if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
         LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
         llama_free(ctx);
@@ -20036,7 +20048,8 @@ struct llama_context * llama_new_context_with_model(
         std::vector<ggml_backend_t> backend_ptrs;
         for (auto & backend : ctx->backends) {
             auto * buft = ggml_backend_get_default_buffer_type(backend.get());
-            if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
+            auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+            if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
                 // use the host buffer of the first device CPU for faster transfer of the intermediate state
                 auto * dev = model->devices[0];
                 auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@@ -20064,7 +20077,8 @@ struct llama_context * llama_new_context_with_model(
         // pipeline parallelism requires support for async compute and events in all devices
         if (pipeline_parallel) {
             for (auto & backend : ctx->backends) {
-                if (ggml_backend_is_cpu(backend.get())) {
+                auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+                if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
                     // ignore CPU backend
                     continue;
                 }
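Both hunks above replace ggml_backend_is_cpu() with a device-type query, which also covers CPU backends loaded as dynamic modules. Factored into a helper for clarity (a sketch, not part of the commit):

    // Sketch: type-based check equivalent to the inlined tests above.
    static bool backend_is_cpu(ggml_backend_t backend) {
        return ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
    }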
@@ -20238,7 +20252,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN:
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
-        case LLM_ARCH_OLMO_1124:
+        case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
@@ -21628,6 +21642,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
+
+    for (auto & backend : ctx->backends) {
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+        auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+        if (set_abort_callback_fn) {
+            set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
+        }
+    }
 }
 
 void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
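Since the callback now reaches every backend that exposes "ggml_backend_set_abort_callback", a single llama_set_abort_callback() call can stop compute on all of them. A sketch of a deadline-based abort, assuming `ctx` is a live llama_context and that the callback contract is "return true to abort", as the propagation above implies:

    #include <chrono>

    // Sketch: abort backend compute once a wall-clock deadline passes.
    struct deadline {
        std::chrono::steady_clock::time_point t;
    };

    static bool abort_after_deadline(void * data) {
        auto * d = (deadline *) data;
        return std::chrono::steady_clock::now() > d->t; // true => abort
    }

    deadline d { std::chrono::steady_clock::now() + std::chrono::seconds(30) };
    llama_set_abort_callback(ctx, abort_after_deadline, &d);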
@@ -22369,32 +22391,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
 }
 
 const char * llama_print_system_info(void) {
-    ggml_cpu_init(); // some ARM features are detected at runtime
-
     static std::string s;
 
-    s = "";
-    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-    s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
-    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
-    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
-    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
-    s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
-    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-    s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
-    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-    s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
-    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-    s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }
 
     return s.c_str();
 }
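System info is now assembled per backend from a generic feature list rather than a fixed set of CPU flags, so dynamically loaded backends report their own features. Per the loop above, the output takes the form "CPU : AVX = 1 | AVX2 = 1 | ... |", one run per backend; the exact feature set depends on the build, and as written the static string is appended to on every call. A caller sketch:

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_backend_init();                      // backends must be registered first
        printf("%s\n", llama_print_system_info()); // "NAME : feature = value | ..." per backend
        llama_backend_free();
        return 0;
    }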