Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2026-05-14 00:01:30 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp
#	ggml/src/ggml-hexagon/htp/act-ops.c
#	ggml/src/ggml-hexagon/htp/hvx-utils.c
#	ggml/src/ggml-hexagon/htp/main.c
#	src/llama-model.cpp
#	tools/server/README.md
commit 58eb5573de
46 changed files with 1288 additions and 1485 deletions
src/llama-model.cpp

@@ -2480,7 +2480,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }

     ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
+    int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);

 #if defined(GGML_USE_CLBLAST)
     printf("\nOpenCL GPU Offload Fallback...\n");
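As a minimal sketch of the changed arithmetic (illustrative values only, not taken from this commit), the `+ 1` lines up with the `std::min(n_gpu_layers, int(n_layer) + 1)` cap in the next hunk, i.e. the output layer is treated as one more offloadable layer:

// Illustrative sketch only: mirrors the old and new i_gpu_start expressions
// with hypothetical values for n_layer and n_gpu_layers.
#include <algorithm>
#include <cstdio>

int main() {
    const int n_layer      = 32; // hypothetical repeating-layer count
    const int n_gpu_layers = 20; // hypothetical offload request

    const int old_start = std::max(n_layer - n_gpu_layers, 0);     // old form: 12
    const int new_start = std::max(n_layer + 1 - n_gpu_layers, 0); // new form: 13, output layer uses one slot

    std::printf("i_gpu_start old=%d new=%d\n", old_start, new_start);
    return 0;
}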
@@ -2491,9 +2491,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     if (cpu_dev == nullptr) {
         throw std::runtime_error(format("%s: no CPU backend found", __func__));
     }
-    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
+    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
-        const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
+        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
             // LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
             return {cpu_dev, &pimpl->cpu_buft_list};
@@ -6852,10 +6852,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     if (llama_supports_gpu_offload()) {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
-        if (n_gpu_layers > (int) hparams.n_layer) {
+        int n_repeating = n_gpu;
+        if (n_repeating > 0) {
             LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+            n_repeating--;
         }
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);

         const int max_backend_supported_layers = hparams.n_layer + 1;
         const int max_offloadable_layers       = hparams.n_layer + 1;
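The last hunk replaces the two-log pattern with an `n_repeating` counter. A small self-contained sketch of the added control flow, with hypothetical inputs and plain printf standing in for LLAMA_LOG_INFO:

// Illustration of the added n_repeating flow; values are hypothetical and
// printf stands in for LLAMA_LOG_INFO.
#include <algorithm>
#include <cstdio>

int main() {
    const int n_layer      = 32;                              // hypothetical
    const int n_gpu_layers = 33;                              // hypothetical: every layer plus the output layer
    const int n_gpu        = std::min(n_gpu_layers, n_layer); // as in the unchanged context line

    int n_repeating = n_gpu;
    if (n_repeating > 0) {
        std::printf("offloading output layer to GPU\n");
        n_repeating--; // the output layer consumes one of the requested layers
    }
    std::printf("offloading %d repeating layers to GPU\n", n_repeating);
    return 0;
}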