Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2026-05-08 01:41:37 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   docs/build-s390x.md
#   docs/ops.md
#   docs/ops/zDNN.csv
#   ggml/include/ggml-zdnn.h
#   ggml/src/ggml-sycl/binbcast.cpp
#   ggml/src/ggml-sycl/concat.cpp
#   ggml/src/ggml-sycl/conv.cpp
#   ggml/src/ggml-sycl/convert.cpp
#   ggml/src/ggml-sycl/cpy.cpp
#   ggml/src/ggml-sycl/dmmv.cpp
#   ggml/src/ggml-sycl/dpct/helper.hpp
#   ggml/src/ggml-sycl/element_wise.cpp
#   ggml/src/ggml-sycl/getrows.cpp
#   ggml/src/ggml-sycl/ggml-sycl.cpp
#   ggml/src/ggml-sycl/gla.cpp
#   ggml/src/ggml-sycl/im2col.cpp
#   ggml/src/ggml-sycl/mmq.cpp
#   ggml/src/ggml-sycl/mmvq.cpp
#   ggml/src/ggml-sycl/norm.cpp
#   ggml/src/ggml-sycl/rope.cpp
#   ggml/src/ggml-sycl/set_rows.cpp
#   ggml/src/ggml-sycl/softmax.cpp
#   ggml/src/ggml-sycl/tsembd.cpp
#   ggml/src/ggml-sycl/wkv.cpp
#   ggml/src/ggml-zdnn/ggml-zdnn-impl.h
#   ggml/src/ggml-zdnn/ggml-zdnn.cpp
#   tools/llama-bench/llama-bench.cpp
Commit 1dbd2fc259: 14 changed files with 406 additions and 68 deletions.
src/llama.cpp

```diff
@@ -89,6 +89,7 @@ bool llama_supports_gpu_offload(void) {
     return true;
 #else
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
            llama_supports_rpc();
 #endif
 }
```
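With the new `GGML_BACKEND_DEVICE_TYPE_IGPU` term, `llama_supports_gpu_offload()` now reports true on machines whose only GPU is an integrated one. A minimal standalone sketch of the same probe, assuming only the public `ggml-backend.h` API used in the hunk (`has_gpu_like_device` is an illustrative name, not a llama.cpp symbol):

```cpp
// Standalone sketch; requires ggml-backend.h and linking against a ggml build.
#include <cstdio>

#include "ggml-backend.h"

// illustrative helper, not part of llama.cpp
static bool has_gpu_like_device(void) {
    // either a discrete or an integrated GPU device is enough for offload
    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU)  != nullptr ||
           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr;
}

int main(void) {
    std::printf("gpu offload available: %s\n", has_gpu_like_device() ? "yes" : "no");
    return 0;
}
```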
```diff
@@ -215,8 +216,13 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
         // default device selection
-        // use all available devices
+
+        // build list of available devices
+        std::vector<ggml_backend_dev_t> gpus;
+        std::vector<ggml_backend_dev_t> igpus;
+        std::vector<ggml_backend_dev_t> rpc_servers;
+
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
             switch (ggml_backend_dev_type(dev)) {
```
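Instead of pushing every device straight into `model->devices`, the loader now classifies devices into three buckets in one pass and decides their final order afterwards. A hedged sketch of the same bucketing loop as a standalone program, using only the enumeration calls visible in the diff:

```cpp
// Standalone sketch of the enumeration/bucketing pass; assumes ggml-backend.h.
#include <cstdio>
#include <vector>

#include "ggml-backend.h"

int main(void) {
    std::vector<ggml_backend_dev_t> gpus;
    std::vector<ggml_backend_dev_t> igpus;

    // walk every registered backend device and bucket it by type
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        switch (ggml_backend_dev_type(dev)) {
            case GGML_BACKEND_DEVICE_TYPE_GPU:
                gpus.push_back(dev);
                break;
            case GGML_BACKEND_DEVICE_TYPE_IGPU:
                igpus.push_back(dev);
                break;
            default:
                // CPU/ACCEL devices are handled separately in llama.cpp
                break;
        }
    }

    std::printf("found %zu discrete and %zu integrated GPU device(s)\n",
                gpus.size(), igpus.size());
    return 0;
}
```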
```diff
@@ -225,19 +231,51 @@ static struct llama_model * llama_model_load_from_file_impl(
                     // skip CPU backends since they are handled separately
                     break;
 
-                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                     ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                     if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                         rpc_servers.push_back(dev);
                     } else {
-                        model->devices.push_back(dev);
+                        // check if there is already a GPU with the same device id
+                        ggml_backend_dev_props props;
+                        ggml_backend_dev_get_props(dev, &props);
+                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                            ggml_backend_dev_props d_props;
+                            ggml_backend_dev_get_props(d, &d_props);
+                            if (props.device_id && d_props.device_id) {
+                                return strcmp(props.device_id, d_props.device_id) == 0;
+                            }
+                            return false;
+                        });
+
+                        if (it != gpus.end()) {
+                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                __func__,
+                                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                props.device_id ? props.device_id : "unknown id",
+                                ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                        } else {
+                            gpus.push_back(dev);
+                        }
                     }
                     break;
+                }
+
+                case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                    igpus.push_back(dev);
+                    break;
             }
         }
-        // add RPC servers at the front of the list
-        if (!rpc_servers.empty()) {
-            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
-        }
+
+        // add RPC servers at the front of the list to minimize network transfers
+        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add GPUs
+        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+        // add integrated GPUs only if no other devices were found
+        if (model->devices.empty()) {
+            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
+        }
     }
```
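Two things happen in this hunk: GPUs whose `props.device_id` matches an already-seen device are logged and skipped (the same physical card can be exposed once per backend), and the final ordering becomes RPC servers first, then discrete GPUs, with integrated GPUs used only as a fallback when nothing else was found. A hedged sketch of that dedup policy as a reusable helper; `dedup_by_device_id` is a hypothetical name, not a llama.cpp function:

```cpp
// Standalone sketch of the first-wins dedup policy; assumes ggml-backend.h.
#include <algorithm>
#include <cstring>
#include <vector>

#include "ggml-backend.h"

// hypothetical helper, not part of llama.cpp
static std::vector<ggml_backend_dev_t> dedup_by_device_id(const std::vector<ggml_backend_dev_t> & devs) {
    std::vector<ggml_backend_dev_t> kept;
    for (ggml_backend_dev_t dev : devs) {
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        auto dup = std::find_if(kept.begin(), kept.end(), [&props](ggml_backend_dev_t d) {
            ggml_backend_dev_props d_props;
            ggml_backend_dev_get_props(d, &d_props);
            // a null device_id means the backend did not report one;
            // such devices are never treated as duplicates
            return props.device_id && d_props.device_id &&
                   strcmp(props.device_id, d_props.device_id) == 0;
        });
        if (dup == kept.end()) {
            kept.push_back(dev); // first device with a given id wins
        }
    }
    return kept;
}
```

Keeping the first occurrence matches the commit's behavior: the device enumerated first stays in `gpus`, and later devices with the same id are only logged and skipped.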
```diff
@@ -258,9 +296,12 @@ static struct llama_model * llama_model_load_from_file_impl(
     }
 
     for (auto * dev : model->devices) {
-        size_t free, total; // NOLINT
-        ggml_backend_dev_memory(dev, &free, &total);
-        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                props.device_id ? props.device_id : "unknown id",
+                props.memory_free/1024/1024);
     }
 
     const int status = llama_model_load(path_model, splits, *model, params);
```
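The logging loop now pulls both the device id and the free-memory figure from a single `ggml_backend_dev_get_props()` call instead of the separate `ggml_backend_dev_memory()` query. A standalone sketch under the same assumption (only `ggml-backend.h` and a ggml build to link against):

```cpp
// Standalone sketch of the per-device startup log line.
#include <cstdio>

#include "ggml-backend.h"

int main(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        // one props query yields both device_id and memory_free
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        std::printf("using device %s (%s) (%s) - %zu MiB free\n",
                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                    props.device_id ? props.device_id : "unknown id",
                    props.memory_free / 1024 / 1024);
    }
    return 0;
}
```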