Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2026-04-26 10:41:25 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	CODEOWNERS
#	common/CMakeLists.txt
#	ggml/CMakeLists.txt
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl
#	scripts/sync-ggml.last
#	tools/cli/cli.cpp
#	tools/llama-bench/llama-bench.cpp
#	tools/perplexity/perplexity.cpp
This commit is contained in: commit 19a12bb080
31 changed files with 1594 additions and 1172 deletions
@@ -2428,6 +2428,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_env("LLAMA_ARG_FIT"));
    add_opt(common_arg(
        { "-fitp", "--fit-print" }, "[on|off]",
        string_format("print the estimated required memory ('on' or 'off', default: '%s')", params.fit_params_print ? "on" : "off"),
        [](common_params & params, const std::string & value) {
            if (is_truthy(value)) {
                params.fit_params_print = true;
            } else if (is_falsey(value)) {
                params.fit_params_print = false;
            } else {
                throw std::runtime_error(
                    string_format("error: unknown value for --fit-print: '%s'\n", value.c_str()));
            }
        }
    ).set_examples({LLAMA_EXAMPLE_FIT_PARAMS}).set_env("LLAMA_ARG_FIT_ESTIMATE"));
    add_opt(common_arg(
        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
        string_format("target margin per device for --fit, comma-separated list of values, "
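The --fit-print handler above relies on shared truthy/falsey helpers that are not part of this hunk. As a rough sketch of what they are assumed to look like (any other spelling falls through to the runtime_error branch above):

static bool is_truthy(const std::string & value) { // assumed helper, not shown in this diff
    return value == "on" || value == "enabled" || value == "1";
}

static bool is_falsey(const std::string & value) { // assumed helper, not shown in this diff
    return value == "off" || value == "disabled" || value == "0";
}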
@@ -3,6 +3,7 @@

#include "build-info.h"
#include "common.h"
#include "fit.h"
#include "log.h"
#include "log.cpp"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
@@ -1153,7 +1154,7 @@ common_init_result::common_init_result(common_params & params) :

    if (params.fit_params) {
        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
        common_fit_params(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
            params.fit_params_target.data(),
@@ -421,11 +421,12 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
    bool    fit_params = true; // whether to fit unset model/context parameters to free device memory
    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
    bool    fit_params = true; // whether to fit unset model/context parameters to free device memory
    bool    fit_params_print = false; // print the estimated required memory to run the model
    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    // margin per device in bytes for fitting parameters to free memory:
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024 * 1024);
common/fit.cpp (new file, 951 lines)
@@ -0,0 +1,951 @@
#include "fit.h"
|
||||
|
||||
#include "log.h"
|
||||
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
#include <cinttypes>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// this enum is only used in common_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
|
||||
// enum to identify part of a layer for distributing its tensors:
|
||||
enum common_layer_fraction_t {
|
||||
LAYER_FRACTION_NONE = 0, // nothing
|
||||
LAYER_FRACTION_ATTN = 1, // attention
|
||||
LAYER_FRACTION_UP = 2, // attention + up
|
||||
LAYER_FRACTION_GATE = 3, // attention + up + gate
|
||||
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
|
||||
};
|
||||
|
||||
class common_params_fit_exception : public std::runtime_error {
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
static std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
ggml_log_level log_level) {
|
||||
struct user_data_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
void * user_data;
|
||||
} original_logger;
|
||||
ggml_log_level min_level; // prints below this log level go to debug log
|
||||
};
|
||||
user_data_t ud;
|
||||
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||
ud.min_level = log_level;
|
||||
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||
const user_data_t * ud = (const user_data_t *) user_data;
|
||||
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||
}, &ud);
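Note that the original logger captured here has to be restored manually on every exit path below (after a load failure, after a context failure, and at the end of the function). A small RAII guard would make that automatic; a hypothetical sketch, not part of this file:

struct llama_log_scope_guard { // hypothetical RAII helper (illustration only)
    ggml_log_callback callback;
    void *            user_data;
    llama_log_scope_guard()  { llama_log_get(&callback, &user_data); } // save the current logger
    ~llama_log_scope_guard() { llama_log_set(callback, user_data); }   // restore it on scope exit
};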
|
||||
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
mparams_copy.no_alloc = true;
|
||||
mparams_copy.use_mmap = false;
|
||||
mparams_copy.use_mlock = false;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||
if (model == nullptr) {
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to load model");
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_init_from_model(model, *cparams);
|
||||
if (ctx == nullptr) {
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to create llama_context from model");
|
||||
}
|
||||
|
||||
const size_t nd = llama_model_n_devices(model);
|
||||
std::vector<llama_device_memory_data> ret(nd + 1);
|
||||
|
||||
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
|
||||
|
||||
for (const auto & [buft, mb] : memory_breakdown) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
ret.back().mb.model += mb.model;
|
||||
ret.back().mb.context += mb.context;
|
||||
ret.back().mb.compute += mb.compute;
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (!dev) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
if (dev == llama_model_get_device(model, i)) {
|
||||
ret[i].mb.model += mb.model;
|
||||
ret[i].mb.context += mb.context;
|
||||
ret[i].mb.compute += mb.compute;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error("no CPU backend found");
|
||||
}
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
ret.back().free = free;
|
||||
ret.back().total = total;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
|
||||
|
||||
// devices can return 0 bytes for free and total memory if they do not
|
||||
// have any to report. in this case, we will use the host memory as a fallback
|
||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
||||
if (free == 0 && total == 0) {
|
||||
free = ret.back().free;
|
||||
total = ret.back().total;
|
||||
}
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
}
|
||||
|
||||
devs.clear();
|
||||
for (int i = 0; i < llama_model_n_devices(model); i++) {
|
||||
devs.push_back(llama_model_get_device(model, i));
|
||||
}
|
||||
|
||||
hp_ngl = llama_model_n_layer(model);
|
||||
hp_n_ctx_train = llama_model_n_ctx_train(model);
|
||||
hp_n_expert = llama_model_n_expert(model);
|
||||
|
||||
common_memory_breakdown_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void common_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
|
||||
throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
|
||||
}
|
||||
constexpr int64_t MiB = 1024*1024;
|
||||
typedef std::vector<llama_device_memory_data> dmds_t;
|
||||
const llama_model_params default_mparams = llama_model_default_params();
|
||||
|
||||
std::vector<ggml_backend_dev_t> devs;
|
||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||
|
||||
LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
margins.reserve(nd);
|
||||
if (nd == 0) {
|
||||
margins.push_back(margins_s[0]);
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
{
|
||||
dev_names.reserve(nd);
|
||||
size_t max_length = 0;
|
||||
for (const auto & dev : devs) {
|
||||
std::string name = ggml_backend_dev_name(dev);
|
||||
name += " (";
|
||||
name += ggml_backend_dev_description(dev);
|
||||
name += ")";
|
||||
dev_names.push_back(name);
|
||||
max_length = std::max(max_length, name.length());
|
||||
}
|
||||
for (std::string & dn : dev_names) {
|
||||
dn.insert(dn.end(), max_length - dn.length(), ' ');
|
||||
}
|
||||
}
|
||||
|
||||
int64_t sum_free = 0;
|
||||
int64_t sum_projected_free = 0;
|
||||
int64_t sum_projected_used = 0;
|
||||
int64_t sum_projected_model = 0;
|
||||
std::vector<int64_t> projected_free_per_device;
|
||||
projected_free_per_device.reserve(nd);
|
||||
|
||||
if (nd == 0) {
|
||||
sum_projected_used = dmds_full.back().mb.total();
|
||||
sum_free = dmds_full.back().total;
|
||||
sum_projected_free = sum_free - sum_projected_used;
|
||||
LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (sum_projected_free >= margins[0]) {
|
||||
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
|
||||
__func__, sum_projected_free/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
if (nd > 1) {
|
||||
LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LOG_INF("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// step 2: try reducing memory use by reducing the context size
|
||||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free;
|
||||
if (nd == 0) {
|
||||
global_surplus -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
}
|
||||
}
|
||||
if (global_surplus < 0) {
|
||||
if (nd <= 1) {
|
||||
LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
||||
} else {
|
||||
LOG_INF(
|
||||
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, -global_surplus/MiB);
|
||||
}
|
||||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
int64_t sum_used_target = sum_free;
|
||||
if (nd == 0) {
|
||||
sum_used_target -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
}
|
||||
}
|
||||
if (nd > 1) {
|
||||
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
||||
// - for dense models only whole layers can be assigned to devices
|
||||
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
|
||||
// - on average we expect a waste of 0.5 layers/tensors per device
|
||||
// - use slightly more than the expected average for nd devices to be safe
|
||||
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
|
||||
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
|
||||
}
|
||||
|
||||
int64_t sum_projected_used_min_ctx = 0;
|
||||
cparams->n_ctx = n_ctx_min;
|
||||
const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
if (nd == 0) {
|
||||
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
|
||||
}
|
||||
}
|
||||
if (sum_used_target > sum_projected_used_min_ctx) {
|
||||
// linear interpolation between minimum and maximum context size:
|
||||
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
|
||||
/ (sum_projected_used - sum_projected_used_min_ctx);
|
||||
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
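// Illustrative example with hypothetical numbers (not from a real run): with n_ctx_min = 4096,
// hp_nct = 131072, sum_used_target = 30 GiB, sum_projected_used_min_ctx = 20 GiB and
// sum_projected_used = 60 GiB, the interpolation gives 4096 + 126976 * 10/40 = 35840,
// which is already a multiple of 256 and is therefore unchanged by the rounding above.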
|
||||
|
||||
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
|
||||
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
|
||||
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
if (nd <= 1) {
|
||||
LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
|
||||
return;
|
||||
}
|
||||
LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
|
||||
} else {
|
||||
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
|
||||
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
}
|
||||
} else {
|
||||
if (n_ctx_min == UINT32_MAX) {
|
||||
LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
|
||||
} else {
|
||||
LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
||||
__func__, hp_nct, n_ctx_min);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nd == 0) {
|
||||
throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
|
||||
}
|
||||
|
||||
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
||||
throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
||||
}
|
||||
if (nd > 1) {
|
||||
if (!tensor_split) {
|
||||
throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
|
||||
}
|
||||
if (mparams->tensor_split) {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (mparams->tensor_split[id] != 0.0f) {
|
||||
throw common_params_fit_exception("model_params::tensor_split already set by user, abort");
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
|
||||
}
|
||||
}
|
||||
if (!tensor_buft_overrides) {
|
||||
throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
|
||||
}
|
||||
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
|
||||
throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
|
||||
}
|
||||
|
||||
// step 3: iteratively fill the devices back to front with "dense" layers
|
||||
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
|
||||
// - for a MoE model, same as dense model but with all MoE tensors in system memory
|
||||
|
||||
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
|
||||
auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * {
|
||||
constexpr size_t n_strings = 1000;
|
||||
if (il >= n_strings) {
|
||||
throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
|
||||
}
|
||||
switch (lf) {
|
||||
case LAYER_FRACTION_ATTN: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_UP: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_GATE: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_MOE: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
};
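// Example (illustrative): for il = 7, LAYER_FRACTION_UP yields "blk\.7\.ffn_(gate|gate_up|down).*",
// i.e. the attention and ffn_up tensors of layer 7 stay put while its gate/down tensors are
// overridden, and LAYER_FRACTION_MOE yields "blk\.7\.ffn_(up|down|gate_up|gate)_(ch|)exps",
// which only matches the sparse expert tensors (e.g. blk.7.ffn_up_exps).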
|
||||
|
||||
struct ngl_t {
|
||||
uint32_t n_layer = 0; // number of total layers
|
||||
uint32_t n_part = 0; // number of partial layers, <= n_layer
|
||||
|
||||
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
|
||||
common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
|
||||
|
||||
uint32_t n_full() const {
|
||||
assert(n_layer >= n_part);
|
||||
return n_layer - n_part;
|
||||
}
|
||||
};
|
||||
|
||||
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||
|
||||
// utility function to set n_gpu_layers and tensor_split
|
||||
auto set_ngl_tensor_split_tbo = [&](
|
||||
const std::vector<ngl_t> & ngl_per_device,
|
||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
|
||||
llama_model_params & mparams) {
|
||||
mparams.n_gpu_layers = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
|
||||
if (nd > 1) {
|
||||
tensor_split[id] = ngl_per_device[id].n_layer;
|
||||
}
|
||||
}
|
||||
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
|
||||
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
|
||||
|
||||
mparams.tensor_split = tensor_split;
|
||||
|
||||
size_t itbo = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
il0 += ngl_per_device[id].n_full();
|
||||
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
|
||||
if (itbo + 1 >= ntbo) {
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
tensor_buft_overrides[itbo].buft = nullptr;
|
||||
itbo++;
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||
throw common_params_fit_exception("llama_max_tensor_buft_overrides() == "
|
||||
+ std::to_string(ntbo) + " is insufficient for model");
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
|
||||
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
|
||||
itbo++;
|
||||
}
|
||||
il0 += ngl_per_device[id].n_part;
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
tensor_buft_overrides[itbo].buft = nullptr;
|
||||
itbo++;
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||
};
|
||||
|
||||
// utility function that returns the memory use per device for given numbers of layers per device
|
||||
auto get_memory_for_layers = [&](
|
||||
const char * func_name,
|
||||
const std::vector<ngl_t> & ngl_per_device,
|
||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
|
||||
|
||||
const dmds_t dmd_nl = common_get_device_memory_data(
|
||||
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
LOG_INF("%s: memory for test allocation by device:\n", func_name);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const ngl_t & n = ngl_per_device[id];
|
||||
LOG_INF(
|
||||
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
|
||||
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
|
||||
}
|
||||
|
||||
std::vector<int64_t> ret;
|
||||
ret.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
ret.push_back(dmd_nl[id].mb.total());
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
||||
int64_t global_surplus_cpu_moe = 0;
|
||||
if (hp_nex > 0) {
|
||||
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
|
||||
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
|
||||
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
|
||||
tensor_buft_overrides[1] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
|
||||
LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
||||
const dmds_t dmds_cpu_moe = common_get_device_memory_data(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
|
||||
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
|
||||
}
|
||||
|
||||
if (global_surplus_cpu_moe > 0) {
|
||||
LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
|
||||
__func__, global_surplus_cpu_moe/MiB);
|
||||
} else {
|
||||
LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
|
||||
__func__, -global_surplus_cpu_moe/MiB);
|
||||
}
|
||||
|
||||
// reset
|
||||
tensor_buft_overrides[0] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
}
|
||||
|
||||
std::vector<int64_t> targets; // maximum acceptable memory use per device
|
||||
targets.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
targets.push_back(dmds_full[id].free - margins[id]);
|
||||
LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
||||
}
|
||||
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
|
||||
overflow_bufts.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
|
||||
}
|
||||
|
||||
std::vector<ngl_t> ngl_per_device(nd);
|
||||
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
|
||||
|
||||
// optimize the number of layers per device using the method of false position:
|
||||
// - ngl_per_device has 0 layers for each device, lower bound
|
||||
// - try a "high" configuration where a device is given all unassigned layers
|
||||
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
|
||||
// - check memory use of our guess, replace either the low or high bound
|
||||
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
|
||||
// - the last device has the output layer, which cannot be a partial layer
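The search described above is essentially the method of false position (regula falsi) applied to a monotone integer function. As a standalone illustration with hypothetical names, not part of this file (assumes <functional> and <algorithm>):

// Find the largest n in [lo, hi) with cost(n) <= target, given cost(lo) <= target < cost(hi).
static uint32_t largest_fitting(uint32_t lo, uint32_t hi, int64_t target,
                                const std::function<int64_t(uint32_t)> & cost) {
    int64_t cost_lo = cost(lo);
    int64_t cost_hi = cost(hi);
    while (hi - lo > 1) {
        // interpolate where the target is expected to be crossed, clamp so progress is always made:
        uint32_t step = uint32_t(int64_t(hi - lo) * (target - cost_lo) / (cost_hi - cost_lo));
        step = std::min(std::max(step, uint32_t(1)), hi - lo - 1);
        const uint32_t mid      = lo + step;
        const int64_t  cost_mid = cost(mid);
        if (cost_mid <= target) {
            lo      = mid; // still fits, raise the lower bound
            cost_lo = cost_mid;
        } else {
            hi      = mid; // too big, lower the upper bound
            cost_hi = cost_mid;
        }
    }
    return lo;
}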
|
||||
if (hp_nex == 0) {
|
||||
LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
|
||||
} else {
|
||||
LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
|
||||
}
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
uint32_t n_unassigned = hp_ngl + 1;
|
||||
for (size_t jd = id + 1; jd < nd; ++jd) {
|
||||
assert(n_unassigned >= ngl_per_device[jd].n_layer);
|
||||
n_unassigned -= ngl_per_device[jd].n_layer;
|
||||
}
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
ngl_per_device_high[id].n_layer = n_unassigned;
|
||||
if (hp_nex > 0) {
|
||||
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
|
||||
}
|
||||
if (ngl_per_device_high[id].n_layer > 0) {
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
|
||||
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
step_size = std::min(step_size, delta - 1);
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
ngl_per_device_test[id].n_layer += step_size;
|
||||
if (hp_nex) {
|
||||
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
|
||||
step_size - 1 : step_size; // the first layer is the output layer which must always be full
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
|
||||
}
|
||||
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
}
|
||||
} else {
|
||||
assert(ngl_per_device_high[id].n_layer == n_unassigned);
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
||||
return;
|
||||
}
|
||||
|
||||
// step 4: for a MoE model where all dense tensors fit,
|
||||
// convert the dense-only layers in the back to full layers in the front until all devices are full
|
||||
// essentially the same procedure as for the dense-only layers except front-to-back
|
||||
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
|
||||
|
||||
size_t id_dense_start = nd;
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
if (ngl_per_device[id].n_layer > 0) {
|
||||
id_dense_start = id;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
assert(id_dense_start < nd);
|
||||
|
||||
LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
|
||||
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
for (size_t jd = id_dense_start; jd < nd; jd++) {
|
||||
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
|
||||
ngl_per_device_high[id].n_layer += n_layer_move;
|
||||
ngl_per_device_high[jd].n_layer -= n_layer_move;
|
||||
ngl_per_device_high[jd].n_part = 0;
|
||||
}
|
||||
size_t id_dense_start_high = nd - 1;
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
|
||||
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
step_size = std::min(step_size, delta - 1);
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
size_t id_dense_start_test = id_dense_start;
|
||||
uint32_t n_converted_test = 0;
|
||||
for (;id_dense_start_test < nd; id_dense_start_test++) {
|
||||
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
|
||||
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
|
||||
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
|
||||
ngl_per_device_test[id].n_layer += n_convert_jd;
|
||||
n_converted_test += n_convert_jd;
|
||||
|
||||
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
|
||||
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
id_dense_start_high = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
||||
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
|
||||
}
|
||||
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
|
||||
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
|
||||
}
|
||||
} else {
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
mem = mem_high;
|
||||
id_dense_start = id_dense_start_high;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
|
||||
// try to fit at least part of one more layer
|
||||
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
size_t id_dense_start_test = id_dense_start;
|
||||
ngl_per_device_test[id_dense_start_test].n_layer--;
|
||||
ngl_per_device_test[id_dense_start_test].n_part--;
|
||||
ngl_per_device_test[id].n_layer++;
|
||||
ngl_per_device_test[id].n_part++;
|
||||
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
|
||||
id_dense_start_test++;
|
||||
}
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
|
||||
if (id < nd - 1) {
|
||||
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
|
||||
}
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
|
||||
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
} else {
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
|
||||
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
|
||||
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
overflow_bufts = overflow_bufts_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
// print info for devices that were not changed during the conversion from dense only to full layers:
|
||||
for (size_t id = id_dense_start + 1; id < nd; id++) {
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LOG_INF(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
||||
}
|
||||
|
||||
enum common_params_fit_status common_fit_params(
|
||||
const char * path_model,
|
||||
llama_model_params * mparams,
|
||||
llama_context_params * cparams,
|
||||
float * tensor_split,
|
||||
llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins,
|
||||
uint32_t n_ctx_min,
|
||||
ggml_log_level log_level) {
|
||||
const int64_t t0_us = llama_time_us();
|
||||
common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
|
||||
try {
|
||||
common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
|
||||
LOG_INF("%s: successfully fit params to free device memory\n", __func__);
|
||||
} catch (const common_params_fit_exception & e) {
|
||||
LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
||||
status = COMMON_PARAMS_FIT_STATUS_FAILURE;
|
||||
} catch (const std::runtime_error & e) {
|
||||
LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
|
||||
status = COMMON_PARAMS_FIT_STATUS_ERROR;
|
||||
}
|
||||
const int64_t t1_us = llama_time_us();
|
||||
LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
||||
return status;
|
||||
}
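For reference, a caller wires this up roughly as in the common.cpp hunk earlier in this commit; the trailing arguments below (minimum context and log level) are assumed from the declaration in fit.h rather than copied from the truncated call site:

// Hedged sketch of a call site; field names follow common_params as shown above.
llama_model_params   mparams = common_model_params_to_llama(params);
llama_context_params cparams = common_context_params_to_llama(params);
if (params.fit_params) {
    const enum common_params_fit_status status = common_fit_params(
        params.model.path.c_str(), &mparams, &cparams,
        params.tensor_split,                 // writable, >= llama_max_devices() floats
        params.tensor_buft_overrides.data(), // writable, >= llama_max_tensor_buft_overrides() entries
        params.fit_params_target.data(),     // per-device margins in bytes
        params.fit_params_min_ctx,           // assumed: minimum context size to fall back to
        GGML_LOG_LEVEL_INFO);                // assumed log level
    if (status == COMMON_PARAMS_FIT_STATUS_ERROR) {
        // hard error, e.g. the model file could not be loaded
    }
}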
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx) {
|
||||
//const auto & devices = ctx->get_model().devices;
|
||||
const auto * model = llama_get_model(ctx);
|
||||
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
for (int i = 0; i < llama_model_n_devices(model); i++) {
|
||||
devices.push_back(llama_model_get_device(model, i));
|
||||
}
|
||||
|
||||
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
|
||||
|
||||
std::vector<std::array<std::string, 9>> table_data;
|
||||
table_data.reserve(devices.size());
|
||||
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
|
||||
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
|
||||
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
|
||||
|
||||
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
|
||||
|
||||
constexpr size_t MiB = 1024 * 1024;
|
||||
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
|
||||
|
||||
// track seen buffer types to avoid double counting:
|
||||
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
|
||||
|
||||
// accumulative memory breakdown for each device and for host:
|
||||
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
|
||||
llama_memory_breakdown_data mb_host;
|
||||
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
mb_host.model += mb.model;
|
||||
mb_host.context += mb.context;
|
||||
mb_host.compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (dev) {
|
||||
int i_dev = -1;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (devices[i] == dev) {
|
||||
i_dev = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i_dev != -1) {
|
||||
mb_dev[i_dev].model += mb.model;
|
||||
mb_dev[i_dev].context += mb.context;
|
||||
mb_dev[i_dev].compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// print memory breakdown for each device:
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
ggml_backend_dev_t dev = devices[i];
|
||||
llama_memory_breakdown_data mb = mb_dev[i];
|
||||
|
||||
const std::string name = ggml_backend_dev_name(dev);
|
||||
std::string desc = ggml_backend_dev_description(dev);
|
||||
for (const std::string & prefix : desc_prefixes_strip) {
|
||||
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
|
||||
desc = desc.substr(prefix.length());
|
||||
}
|
||||
}
|
||||
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
const size_t unaccounted = total - self - free;
|
||||
|
||||
table_data.push_back({
|
||||
template_gpu,
|
||||
" - " + name + " (" + desc + ")",
|
||||
std::to_string(total / MiB),
|
||||
std::to_string(free / MiB),
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
std::to_string(unaccounted / MiB)});
|
||||
}
|
||||
|
||||
// print memory breakdown for host:
|
||||
{
|
||||
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - Host",
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb_host.model / MiB),
|
||||
std::to_string(mb_host.context / MiB),
|
||||
std::to_string(mb_host.compute / MiB),
|
||||
""}); // unaccounted
|
||||
}
|
||||
|
||||
// print memory breakdown for all remaining buffer types:
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (seen_buffer_types.count(buft) == 1) {
|
||||
continue;
|
||||
}
|
||||
const std::string name = ggml_backend_buft_name(buft);
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - " + name,
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
""}); // unaccounted
|
||||
seen_buffer_types.insert(buft);
|
||||
}
|
||||
|
||||
for (size_t j = 1; j < table_data[0].size(); j++) {
|
||||
size_t max_len = 0;
|
||||
for (const auto & td : table_data) {
|
||||
max_len = std::max(max_len, td[j].length());
|
||||
}
|
||||
for (auto & td : table_data) {
|
||||
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
|
||||
}
|
||||
}
|
||||
for (const auto & td : table_data) {
|
||||
LOG_INF(td[0].c_str(),
|
||||
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
|
||||
td[6].c_str(), td[7].c_str(), td[8].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void common_fit_print(
|
||||
const char * path_model,
|
||||
llama_model_params * mparams,
|
||||
llama_context_params * cparams) {
|
||||
std::vector<ggml_backend_dev_t> devs;
|
||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
|
||||
GGML_ASSERT(dmd.size() == devs.size() + 1);
|
||||
|
||||
for (size_t id = 0; id < devs.size(); id++) {
|
||||
printf("%s ", ggml_backend_dev_name(devs[id]));
|
||||
printf("%zu ", dmd[id].mb.model/1024/1024);
|
||||
printf("%zu ", dmd[id].mb.context/1024/1024);
|
||||
printf("%zu ", dmd[id].mb.compute/1024/1024);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Host ");
|
||||
printf("%zu ", dmd.back().mb.model/1024/1024);
|
||||
printf("%zu ", dmd.back().mb.context/1024/1024);
|
||||
printf("%zu ", dmd.back().mb.compute/1024/1024);
|
||||
printf("\n");
|
||||
}
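The output is one space-separated line per device followed by a Host line, each giving model, context and compute memory in MiB. A hypothetical example of its shape (numbers made up, device name depends on the backend):

CUDA0 4562 512 296
Host 134 0 72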
common/fit.h (new file, 32 lines)
@@ -0,0 +1,32 @@
#pragma once

#include "ggml.h"

enum common_params_fit_status {
    COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
    COMMON_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
    COMMON_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
};

// fits mparams and cparams to free device memory (assumes system memory is unlimited)
// - returns COMMON_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified,
//   with the exception of the context size which is modified if and only if it is equal to 0
enum common_params_fit_status common_fit_params(
    const char                              * path_model,
    struct llama_model_params               * mparams,
    struct llama_context_params             * cparams,
    float                                   * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
    struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
    size_t                                  * margins,               // margins of memory to leave per device in bytes
    uint32_t                                  n_ctx_min,             // minimum context size to set when trying to reduce memory use
    enum ggml_log_level                       log_level);            // minimum log level to print during fitting, lower levels go to debug log

// print estimated memory to stdout
void common_fit_print(
    const char                  * path_model,
    struct llama_model_params   * mparams,
    struct llama_context_params * cparams);

void common_memory_breakdown_print(const struct llama_context * ctx);
@@ -1,10 +1,12 @@
#include "sampling.h"

#include "common.h"
#include "ggml.h"
#include "fit.h"
#include "log.h"
#include "reasoning-budget.h"

#include "ggml.h"

#include <algorithm>
#include <cctype>
#include <climits>
@@ -511,7 +513,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
        LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);

        llama_memory_breakdown_print(ctx);
        common_memory_breakdown_print(ctx);
    }
}

@@ -1133,7 +1133,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
|
||||
t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
|
||||
if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
|
||||
GGML_ASSERT(ne[split_dim] != 0 && tensor->ne[split_dim] != 0);
|
||||
GGML_ASSERT(tensor->ne[split_dim] != 0);
|
||||
const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
|
||||
GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);
|
||||
|
||||
|
|
@@ -1170,6 +1170,28 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
simple_tensors.push_back(t_ij);
|
||||
}
|
||||
|
||||
// If one of the sources has a zero-sized slice, disable the computation:
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (tensor->src[i] == nullptr || !ggml_backend_buffer_is_meta(tensor->src[i]->buffer)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const ggml_backend_meta_split_state split_state_src = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
|
||||
if (split_state_src.axis < 0 || split_state_src.axis >= GGML_MAX_DIMS) {
|
||||
continue;
|
||||
}
|
||||
for (size_t j = 0; j < n_simple_bufs; j++) {
|
||||
int64_t ne_sum = 0;
|
||||
for (size_t s = 0; s < split_state_src.n_segments; s++) {
|
||||
ne_sum += split_state_src.ne[s*n_simple_bufs + j];
|
||||
}
|
||||
if (ne_sum == 0) {
|
||||
simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buf_ctx->simple_tensors[tensor] = simple_tensors;
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
|
@@ -1442,17 +1464,20 @@ struct ggml_backend_meta_context {
struct backend_config {
|
||||
ggml_backend_t backend;
|
||||
|
||||
std::vector<cgraph_config> cgraphs;
|
||||
std::vector<ggml_tensor *> nodes;
|
||||
ggml_backend_buffer_ptr buf;
|
||||
std::vector<cgraph_config> cgraphs;
|
||||
std::vector<ggml_tensor *> nodes;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
backend_config(ggml_backend_t backend) : backend(backend) {}
|
||||
backend_config(ggml_backend_t backend, const size_t n_reduce_steps) : backend(backend) {
|
||||
bufs.resize(n_reduce_steps);
|
||||
}
|
||||
};
|
||||
std::string name;
|
||||
std::vector<backend_config> backend_configs;
|
||||
ggml_context_ptr ctx;
|
||||
std::vector<ggml_cgraph *> cgraphs_aux;
|
||||
std::vector<ggml_tensor *> nodes_aux;
|
||||
size_t n_reduce_steps;
|
||||
int max_nnodes = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
size_t max_subgraphs = 0;
|
||||
|
|
@@ -1464,6 +1489,7 @@ struct ggml_backend_meta_context {
|
||||
ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
|
||||
const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
|
||||
n_reduce_steps = std::ceil(std::log2(n_devs));
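// Illustrative (not part of the diff): the generic fallback reduction below works in pairwise
// steps, so n_reduce_steps = ceil(log2(n_devs)), e.g. 2 devices -> 1 step, 4 -> 2 steps, 8 -> 3 steps.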
|
||||
name = "Meta(";
|
||||
std::vector<ggml_backend_t> simple_backends;
|
||||
backend_configs.reserve(n_devs);
|
||||
|
|
@@ -1475,7 +1501,7 @@ struct ggml_backend_meta_context {
}
|
||||
name += ggml_backend_dev_name(simple_dev);
|
||||
simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
|
||||
backend_configs.emplace_back(simple_backends.back());
|
||||
backend_configs.emplace_back(simple_backends.back(), n_reduce_steps);
|
||||
}
|
||||
name += ")";
|
||||
|
||||
|
|
@@ -1505,10 +1531,6 @@ struct ggml_backend_meta_context {
ggml_backend_free(bc.backend);
|
||||
}
|
||||
}
|
||||
|
||||
size_t n_reduce_steps() const {
|
||||
return std::ceil(std::log2(backend_configs.size()));
|
||||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_meta_get_name(ggml_backend_t backend) {
|
||||
|
|
@@ -1661,6 +1683,36 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
ggml_tensor * node = cgraph->nodes[id];
|
||||
int32_t n_used = ggml_node_get_use_count(cgraph, id);
|
||||
|
||||
// Skip MIRRORED nodes that don't consume node
|
||||
auto skip_unrelated = [&]() {
|
||||
while (id + 1 < cgraph->n_nodes) {
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (ggml_backend_meta_get_split_state(next, false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
break;
|
||||
}
|
||||
bool safe = true;
|
||||
for (int s = 0; s < GGML_MAX_SRC; s++) {
|
||||
if (next->src[s] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (next->src[s] == node) {
|
||||
safe = false;
|
||||
break;
|
||||
}
|
||||
if (ggml_backend_meta_get_split_state(next->src[s], false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
safe = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!safe) {
|
||||
break;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
};
|
||||
|
||||
skip_unrelated();
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
|
|
@@ -1675,10 +1727,12 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
}
|
||||
}
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
{
|
||||
// Chain of MULs with MIRRORED src[1]
|
||||
while (true) {
|
||||
skip_unrelated();
|
||||
if (id + 1 >= cgraph->n_nodes) {
|
||||
return idr;
|
||||
}
|
||||
ggml_tensor * next = cgraph->nodes[id+1];
|
||||
if (next->op == GGML_OP_MUL && next->src[0] == node &&
|
||||
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
|
|
@@ -1686,6 +1740,8 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
id++;
|
||||
idr = id;
|
||||
n_used = ggml_node_get_use_count(cgraph, id);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@@ -1754,16 +1810,17 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
if (max_tmp_size > backend_ctx->max_tmp_size) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
|
||||
for (size_t i = 0; i < backend_ctx->n_reduce_steps; i++) {
|
||||
bcj.bufs[i].reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
|
||||
}
|
||||
}
|
||||
backend_ctx->max_tmp_size = max_tmp_size;
|
||||
}
|
||||
|
||||
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
|
||||
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
|
||||
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
|
||||
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
|
||||
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
|
||||
const size_t n_nodes_per_device = 3 * backend_ctx->n_reduce_steps; // tmp + ADD (+zeroing) graph per step and device
|
||||
const size_t n_cgraphs_per_device = 2 * backend_ctx->n_reduce_steps; // ADD ( + zeroing) graph per step and device
|
||||
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
|
||||
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
|
||||
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
|
||||
|
|
@@ -1812,11 +1869,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
size_t iga = 0; // i graph aux
|
||||
size_t ina = 0; // i node aux
|
||||
|
||||
// FIXME usage_counts
|
||||
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
|
||||
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
|
||||
return ret;
|
||||
};
|
||||
auto get_node_aux = [&](ggml_tensor * t) -> ggml_tensor * {
|
||||
ggml_tensor * ret = backend_ctx->nodes_aux[ina++];
|
||||
memset(ret, 0, sizeof(ggml_tensor));
|
||||
|
|
@ -1828,75 +1880,110 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
|||
}
|
||||
return ret;
|
||||
};
|
||||
auto set_tmp_data = [&](ggml_tensor * tensor, const size_t j, const size_t i_buf) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
ggml_backend_buffer_ptr & buf_ptr = bcj.bufs[i_buf];
|
||||
if (!buf_ptr || ggml_backend_buffer_get_size(buf_ptr.get()) < backend_ctx->max_tmp_size) {
|
||||
buf_ptr.reset(ggml_backend_alloc_buffer(bcj.backend, backend_ctx->max_tmp_size));
|
||||
}
|
||||
tensor->buffer = buf_ptr.get();
|
||||
tensor->data = ggml_backend_buffer_get_base(buf_ptr.get());
|
||||
};
|
||||
// FIXME usage_counts
|
||||
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
|
||||
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
|
||||
return ret;
|
||||
};
|
||||
|
||||
// Preferentially use backend-specific allreduce_tensor_async (e.g. NCCL for CUDA), use a generic fallback if unavailable:
|
||||
auto allreduce_fallback = [&](size_t i) -> ggml_status {
|
||||
std::vector<ggml_cgraph *> step_cgraphs(n_backends, nullptr);
|
||||
|
||||
for (size_t offset_j = 1; offset_j < n_backends; offset_j *= 2) {
|
||||
// Zero out nodes that were disabled due to having a zero-sized slice:
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
ggml_tensor * node = bcj.cgraphs[i].cgraph_main->nodes[bcj.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
if (node->flags & GGML_TENSOR_FLAG_COMPUTE) {
|
||||
continue;
|
||||
}
|
||||
ggml_tensor * node_zero = get_node_aux(node);
|
||||
node_zero->op = GGML_OP_SCALE; // FIXME 0.0f * NaN == NaN
|
||||
node_zero->src[0] = node;
|
||||
ggml_set_op_params_f32(node_zero, 0, 0.0f);
|
||||
node_zero->data = node->data;
|
||||
node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
|
||||
step_cgraphs[j] = get_cgraph_aux();
|
||||
step_cgraphs[j]->nodes[0] = node_zero;
|
||||
step_cgraphs[j]->n_nodes = 1;
|
||||
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, step_cgraphs[j]);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
|
||||
|
||||
auto push_data = [&](const size_t j_src, const size_t j_dst, const size_t i_buf) {
|
||||
assert(step_cgraphs[j_dst] == nullptr);
|
||||
auto & bcj_src = backend_ctx->backend_configs[j_src];
|
||||
auto & bcj_dst = backend_ctx->backend_configs[j_dst];
|
||||
|
||||
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
GGML_ASSERT(ggml_is_contiguous(node_src));
|
||||
GGML_ASSERT(ggml_is_contiguous(node_dst));
|
||||
|
||||
ggml_tensor * node_tmp = get_node_aux(node_dst);
|
||||
set_tmp_data(node_tmp, j_dst, i_buf);
|
||||
|
||||
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_tmp);
|
||||
|
||||
ggml_tensor * node_red = get_node_aux(node_dst);
|
||||
node_red->view_src = node_dst->view_src == nullptr ? node_dst : node_dst->view_src;
|
||||
node_red->view_offs = node_dst->view_offs;
|
||||
node_red->op = GGML_OP_ADD;
|
||||
node_red->src[0] = node_dst;
|
||||
node_red->src[1] = node_tmp;
|
||||
node_red->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
ggml_backend_view_init(node_red);
|
||||
|
||||
ggml_cgraph * cgraph_aux = get_cgraph_aux();
|
||||
cgraph_aux->nodes[0] = node_red;
|
||||
cgraph_aux->n_nodes = 1;
|
||||
step_cgraphs[j_dst] = cgraph_aux;
|
||||
};
|
||||
|
||||
size_t offset_j = n_backends/2;
|
||||
while ((offset_j & (offset_j - 1)) != 0) {
|
||||
offset_j--;
|
||||
}
|
||||
const size_t offset_j_max = offset_j;
|
||||
size_t i_buf = 0;
|
||||
|
||||
// If n_backends is not a power of 2, fold in the excess prior to butterfly reduction:
|
||||
for (size_t j_src = 2*offset_j_max; j_src < n_backends; j_src++) {
|
||||
const size_t j_dst = j_src - 2*offset_j_max;
|
||||
push_data(j_src, j_dst, i_buf);
|
||||
const ggml_status status = ggml_backend_graph_compute_async(backend_ctx->backend_configs[j_dst].backend, step_cgraphs[j_dst]);
|
||||
if (status != GGML_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
i_buf = 1;
|
||||
}
|
||||
|
||||
// Butterfly reduction:
|
||||
for (; offset_j >= 1; offset_j /= 2) {
|
||||
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
for (size_t j = 0; j < 2*offset_j_max; j++) {
|
||||
const size_t j_other = j ^ offset_j;
|
||||
if (j_other > j) {
|
||||
if (j_other >= n_backends) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto & bcj1 = backend_ctx->backend_configs[j];
|
||||
auto & bcj2 = backend_ctx->backend_configs[j_other];
|
||||
|
||||
ggml_tensor * node1 = bcj1.cgraphs[i].cgraph_main->nodes[bcj1.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_tensor * node2 = bcj2.cgraphs[i].cgraph_main->nodes[bcj2.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
GGML_ASSERT(ggml_is_contiguous(node1));
|
||||
GGML_ASSERT(ggml_is_contiguous(node2));
|
||||
|
||||
// Tmp tensors to receive P2P copies
|
||||
ggml_tensor * node_tmp_1 = get_node_aux(node1);
|
||||
node_tmp_1->buffer = bcj1.buf.get();
|
||||
node_tmp_1->data = ggml_backend_buffer_get_base(bcj1.buf.get());
|
||||
|
||||
ggml_tensor * node_tmp_2 = get_node_aux(node2);
|
||||
node_tmp_2->buffer = bcj2.buf.get();
|
||||
node_tmp_2->data = ggml_backend_buffer_get_base(bcj2.buf.get());
|
||||
|
||||
// 2 P2P copies: exchange full buffers
|
||||
ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
|
||||
ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
|
||||
|
||||
// Local ADD: node1 += tmp1 (in-place via view)
|
||||
ggml_tensor * node_red_1 = get_node_aux(node1);
|
||||
node_red_1->view_src = node1->view_src == nullptr ? node1 : node1->view_src;
|
||||
node_red_1->view_offs = node1->view_offs;
|
||||
node_red_1->op = GGML_OP_ADD;
|
||||
node_red_1->src[0] = node1;
|
||||
node_red_1->src[1] = node_tmp_1;
|
||||
node_red_1->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
ggml_backend_view_init(node_red_1);
|
||||
|
||||
// Local ADD: node2 += tmp2 (in-place via view)
|
||||
ggml_tensor * node_red_2 = get_node_aux(node2);
|
||||
node_red_2->view_src = node2->view_src == nullptr ? node2 : node2->view_src;
|
||||
node_red_2->view_offs = node2->view_offs;
|
||||
node_red_2->op = GGML_OP_ADD;
|
||||
node_red_2->src[0] = node2;
|
||||
node_red_2->src[1] = node_tmp_2;
|
||||
node_red_2->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
ggml_backend_view_init(node_red_2);
|
||||
|
||||
// Build 1-node cgraphs for the ADD ops
|
||||
ggml_cgraph * cgraph_aux_1 = get_cgraph_aux();
|
||||
cgraph_aux_1->nodes[0] = node_red_1;
|
||||
cgraph_aux_1->n_nodes = 1;
|
||||
step_cgraphs[j] = cgraph_aux_1;
|
||||
|
||||
ggml_cgraph * cgraph_aux_2 = get_cgraph_aux();
|
||||
cgraph_aux_2->nodes[0] = node_red_2;
|
||||
cgraph_aux_2->n_nodes = 1;
|
||||
step_cgraphs[j_other] = cgraph_aux_2;
|
||||
push_data(j, j_other, i_buf);
|
||||
}
|
||||
|
||||
// Execute local ADDs for this step
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
for (size_t j = 0; j < 2*offset_j_max; j++) {
|
||||
if (step_cgraphs[j] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -1906,7 +1993,20 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
|||
return status;
|
||||
}
|
||||
}
|
||||
i_buf++;
|
||||
}
|
||||
assert(i_buf == backend_ctx->n_reduce_steps);
|
||||
|
||||
// If n_backends is not a power of 2, copy back the reduced tensors to the excess:
|
||||
for (size_t j = 2*offset_j_max; j < n_backends; j++) {
|
||||
auto & bcj_src = backend_ctx->backend_configs[j - 2*offset_j_max];
|
||||
auto & bcj_dst = backend_ctx->backend_configs[j];
|
||||
|
||||
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
|
||||
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_dst);
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
};
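
The fallback above implements a butterfly (recursive-doubling) all-reduce: after folding any excess ranks into the leading power of two, each step pairs rank j with rank j ^ offset_j, exchanges buffers, and adds them in, so log2(n) steps leave every rank with the full sum. A tiny self-contained sketch of the pairing on plain integers (illustrative only, power-of-two case):

    #include <cstdio>
    #include <vector>

    // Butterfly all-reduce on ints: after log2(n) exchange+add steps every rank
    // holds the sum of all initial values (n assumed to be a power of two here).
    static void butterfly_allreduce(std::vector<int> & vals) {
        const size_t n = vals.size();
        for (size_t offset = n / 2; offset >= 1; offset /= 2) {
            std::vector<int> recv(vals); // stands in for the P2P copies above
            for (size_t j = 0; j < n; j++) {
                vals[j] += recv[j ^ offset]; // partner at distance `offset`
            }
        }
    }

    int main() {
        std::vector<int> vals = {1, 2, 3, 4};
        butterfly_allreduce(vals);
        for (int v : vals) {
            printf("%d ", v); // prints: 10 10 10 10
        }
        printf("\n");
    }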
|
||||
|
||||
|
|
|
|||
|
|
@ -73,7 +73,6 @@
|
|||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
|
|
|
|||
|
|
@ -151,8 +151,6 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
const block_q1_0 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv = vdupq_n_f32(0.0f);
|
||||
|
||||
|
|
@ -212,31 +210,13 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
}
|
||||
}
|
||||
|
||||
sumf = vaddvq_f32(sumv);
|
||||
*s = vaddvq_f32(sumv);
|
||||
#else
|
||||
// Scalar fallback
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
// Process 4 Q8_0 blocks
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const int bit_index = k * QK8_0 + j;
|
||||
const int byte_index = bit_index / 8;
|
||||
const int bit_offset = bit_index % 8;
|
||||
|
||||
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
|
||||
sumi += xi * y[i*4 + k].qs[j];
|
||||
}
|
||||
sumf += d0 * d1 * sumi;
|
||||
}
|
||||
}
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -275,6 +275,18 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
|
|||
}
|
||||
#endif
|
||||
#elif defined(__SSSE3__)
static inline __m128i bytes_from_bits_16(const uint8_t * x) {
uint16_t x16;
memcpy(&x16, x, sizeof(uint16_t));

const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
__m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask);
const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe);
bytes = _mm_or_si128(bytes, bit_mask);

return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1));
}
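
bytes_from_bits_16 above broadcasts the two input bytes across eight lanes each, ORs in the complement of each lane's own bit, and compares against all-ones, so bit i of the input ends up as byte i set to 0xFF (bit set) or 0x00 (bit clear). A scalar reference of the same mapping (hedged sketch, not part of the patch):

    #include <cstdint>
    #include <cstring>

    // Scalar reference for bytes_from_bits_16: bit i of the 16-bit input becomes
    // byte i of the output, 0xFF if the bit is set and 0x00 otherwise.
    static void bytes_from_bits_16_ref(const uint8_t * x, uint8_t out[16]) {
        uint16_t x16;
        memcpy(&x16, x, sizeof(uint16_t));
        for (int i = 0; i < 16; i++) {
            out[i] = ((x16 >> i) & 1) ? 0xFF : 0x00;
        }
    }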
|
||||
|
||||
// horizontally add 4x4 floats
|
||||
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
|
||||
__m128 res_0 =_mm_hadd_ps(a, b);
|
||||
|
|
@ -541,6 +553,152 @@ static inline __m128i get_scale_shuffle(int i) {
|
|||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK1_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q1_0 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
#if defined(__AVX2__)
|
||||
const __m256i ones_8 = _mm256_set1_epi8(1);
|
||||
const __m256i ones_16 = _mm256_set1_epi16(1);
|
||||
const __m256i byte_shuf = _mm256_setr_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
|
||||
const __m256i bit_masks = _mm256_setr_epi8(
|
||||
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128,
|
||||
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128);
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
|
||||
const uint32_t * GGML_RESTRICT qs32 = (const uint32_t *) x[ib].qs;
|
||||
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
|
||||
|
||||
__m256 acc_block;
|
||||
{
|
||||
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[0].qs);
|
||||
const __m256i sm = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[0]), byte_shuf), bit_masks), zero);
|
||||
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
|
||||
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
|
||||
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), _mm256_cvtepi32_ps(s32));
|
||||
}
|
||||
for (int K = 1; K < 4; ++K) {
|
||||
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[K].qs);
|
||||
const __m256i sm = _mm256_cmpeq_epi8(
|
||||
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[K]), byte_shuf), bit_masks), zero);
|
||||
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
|
||||
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
|
||||
acc_block = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), _mm256_cvtepi32_ps(s32), acc_block);
|
||||
}
|
||||
acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
const __m128i ones_8 = _mm_set1_epi8(1);
|
||||
const __m128i ones_16 = _mm_set1_epi16(1);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
|
||||
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
|
||||
__m256 acc_block;
|
||||
{
|
||||
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[0]);
|
||||
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
|
||||
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
|
||||
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[0]);
|
||||
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[16]);
|
||||
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
|
||||
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
|
||||
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
|
||||
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
|
||||
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
|
||||
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
|
||||
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
|
||||
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
|
||||
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
|
||||
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), q);
|
||||
}
|
||||
for(int K = 1; K < 4; ++K) {
|
||||
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[(K) * 4]);
|
||||
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
|
||||
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
|
||||
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[0]);
|
||||
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[16]);
|
||||
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
|
||||
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
|
||||
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
|
||||
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
|
||||
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
|
||||
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
|
||||
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
|
||||
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
|
||||
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
|
||||
acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(K)].d)), q));
|
||||
}
|
||||
#undef Q1_AVX_BLOCK
|
||||
|
||||
acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__SSSE3__)
|
||||
const __m128i ones_8 = _mm_set1_epi8(1);
|
||||
const __m128i ones_16 = _mm_set1_epi16(1);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128 acc_0 = _mm_setzero_ps();
|
||||
__m128 acc_1 = _mm_setzero_ps();
|
||||
__m128 acc_2 = _mm_setzero_ps();
|
||||
__m128 acc_3 = _mm_setzero_ps();
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
|
||||
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
|
||||
|
||||
#define Q1_SSSE3_BLOCK(QS_OFF, Y_IDX, ACC) \
|
||||
{ \
|
||||
const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 0]); \
|
||||
const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 2]); \
|
||||
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[0]); \
|
||||
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[16]); \
|
||||
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
|
||||
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
|
||||
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
|
||||
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
|
||||
const __m128i sum_0 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_0), ones_16); \
|
||||
const __m128i sum_1 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_1), ones_16); \
|
||||
const __m128 q = _mm_cvtepi32_ps(_mm_add_epi32(sum_0, sum_1)); \
|
||||
(ACC) = _mm_add_ps((ACC), _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(Y_IDX)].d))), q)); \
|
||||
}
|
||||
Q1_SSSE3_BLOCK(0, 0, acc_0)
|
||||
Q1_SSSE3_BLOCK(4, 1, acc_1)
|
||||
Q1_SSSE3_BLOCK(8, 2, acc_2)
|
||||
Q1_SSSE3_BLOCK(12, 3, acc_3)
|
||||
#undef Q1_SSSE3_BLOCK
|
||||
}
|
||||
|
||||
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
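
All three SIMD paths above apply the 1-bit weights without a multiply: each bit is expanded to a byte mask m of 0x00 or 0xFF and the Q8 value is mapped through (q XOR m) - m, which leaves q unchanged for m = 0x00 and negates it for m = 0xFF, matching the ±1 factors of the scalar kernel. A small check of that identity (standalone, illustrative):

    #include <cassert>
    #include <cstdint>

    // (q ^ m) - m == +q for m == 0x00 and == -q for m == 0xFF (two's complement),
    // the trick behind _mm_sub_epi8(_mm_xor_si128(qy, sign_mask), sign_mask) above.
    static int8_t apply_sign_mask(int8_t q, uint8_t m) {
        return (int8_t) ((q ^ (int8_t) m) - (int8_t) m);
    }

    int main() {
        for (int q = -127; q <= 127; q++) {
            assert(apply_sign_mask((int8_t) q, 0x00) == (int8_t) q);
            assert(apply_sign_mask((int8_t) q, 0xFF) == (int8_t) -q);
        }
    }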
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
|
|||
|
|
@ -137,22 +137,28 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
float sumf = 0.0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
float sumi = 0.0f;
|
||||
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
|
||||
|
||||
const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
|
||||
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
|
||||
int sumi_block = 0;
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const int bit_index = k * QK8_0 + j;
|
||||
const int byte_index = bit_index / 8;
|
||||
const int bit_offset = bit_index % 8;
|
||||
const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4];
|
||||
const int8_t * GGML_RESTRICT qy = yb->qs;
|
||||
|
||||
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
|
||||
sumi_block += xi * y[i*4 + k].qs[j];
|
||||
for (int b = 0; b < 4; ++b, qy += 8) {
|
||||
const unsigned mask = bits[b];
|
||||
sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
|
||||
+ ((mask & 0x02) ? qy[1] : -qy[1])
|
||||
+ ((mask & 0x04) ? qy[2] : -qy[2])
|
||||
+ ((mask & 0x08) ? qy[3] : -qy[3])
|
||||
+ ((mask & 0x10) ? qy[4] : -qy[4])
|
||||
+ ((mask & 0x20) ? qy[5] : -qy[5])
|
||||
+ ((mask & 0x40) ? qy[6] : -qy[6])
|
||||
+ ((mask & 0x80) ? qy[7] : -qy[7]);
|
||||
}
|
||||
|
||||
sumi += d1 * sumi_block;
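
The rewritten scalar kernel above walks the k-th group of 4 mask bytes and eight Q8 values per byte instead of recomputing a global bit index; both index schemes address exactly the same bytes and bits. A tiny check of that equivalence (illustrative, using the same constants as above):

    #include <cassert>

    int main() {
        const int QK8_0 = 32;
        for (int k = 0; k < 4; k++) {
            for (int j = 0; j < QK8_0; j++) {
                const int bit_index  = k*QK8_0 + j;   // old scheme
                const int byte_index = bit_index / 8;
                const int bit_offset = bit_index % 8;
                const int b = j / 8;                  // new scheme: byte b of &qs[k*4], bit j%8
                assert(byte_index == k*4 + b);
                assert(bit_offset == j % 8);
            }
        }
    }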
|
||||
|
|
|
|||
|
|
@ -370,15 +370,21 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
|||
}
|
||||
|
||||
~ggml_cuda_pool_leg() {
|
||||
clear_pool();
|
||||
GGML_ASSERT(pool_size == 0);
|
||||
}
|
||||
|
||||
void clear_pool() {
|
||||
ggml_cuda_set_device(device);
|
||||
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
||||
ggml_cuda_buffer & b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
CUDA_CHECK(cudaFree(b.ptr));
|
||||
pool_size -= b.size;
|
||||
b.ptr = nullptr;
|
||||
b.size = 0;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(pool_size == 0);
|
||||
}
|
||||
|
||||
void * alloc(size_t size, size_t * actual_size) override {
|
||||
|
|
@ -425,7 +431,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
|||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
ggml_cuda_set_device(device);
|
||||
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
|
||||
cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
|
||||
if (err == cudaErrorMemoryAllocation) {
|
||||
(void)cudaGetLastError();
|
||||
const size_t cached_bytes = pool_size;
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, flushing %.2f MiB of cached buffers and retrying\n",
|
||||
device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
clear_pool();
|
||||
err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
|
||||
if (err == cudaSuccess) {
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
|
||||
}
|
||||
}
|
||||
CUDA_CHECK(err);
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
return ptr;
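
The hunk above teaches the legacy pool to treat cudaErrorMemoryAllocation as recoverable: clear the error, synchronize, drop the cached buffers via clear_pool(), and retry the allocation once before letting CUDA_CHECK fail. The pattern in isolation (generic sketch; raw_alloc and clear_cache are placeholders, not CUDA API):

    #include <cstddef>
    #include <stdexcept>

    // Generic "flush the cache and retry once" allocation, mirroring the retry above.
    template <typename Alloc, typename Flush>
    void * alloc_with_retry(size_t size, Alloc raw_alloc, Flush clear_cache) {
        void * ptr = raw_alloc(size);
        if (ptr == nullptr) {      // first attempt failed, e.g. out of memory
            clear_cache();         // release cached but unused buffers
            ptr = raw_alloc(size); // retry with the freed memory
        }
        if (ptr == nullptr) {
            throw std::runtime_error("allocation failed even after flushing the pool");
        }
        return ptr;
    }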
|
||||
|
|
@ -1203,6 +1222,13 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
|||
// For small tensors, simply reduce them as FP32.
|
||||
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
|
||||
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
if ((tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, ggml_nbytes(tensors[i]), cuda_ctx->stream()));
|
||||
}
|
||||
}
|
||||
NCCL_CHECK(ncclGroupStart());
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
|
||||
|
|
@ -1224,7 +1250,11 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
|||
tmp[i].alloc(ne);
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
|
||||
if (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) {
|
||||
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
|
||||
} else {
|
||||
CUDA_CHECK(cudaMemsetAsync(tmp[i].get(), 0, ne * sizeof(nv_bfloat16), cuda_ctx->stream()));
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
|
|
|
|||
1 ggml/src/ggml-cuda/vendors/hip.h (vendored)
|
|
@ -58,6 +58,7 @@
|
|||
#define cudaDeviceProp hipDeviceProp_t
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
#define cudaError_t hipError_t
|
||||
#define cudaErrorMemoryAllocation hipErrorOutOfMemory
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
|
|
|
|||
1 ggml/src/ggml-cuda/vendors/musa.h (vendored)
|
|
@ -42,6 +42,7 @@
|
|||
#define cudaDeviceProp musaDeviceProp
|
||||
#define cudaDeviceSynchronize musaDeviceSynchronize
|
||||
#define cudaError_t musaError_t
|
||||
#define cudaErrorMemoryAllocation musaErrorMemoryAllocation
|
||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||
|
|
|
|||
|
|
@ -804,6 +804,7 @@ struct vk_device_struct {
|
|||
vk_pipeline pipeline_arange_f32;
|
||||
|
||||
vk_pipeline pipeline_fill_f32;
|
||||
vk_pipeline pipeline_fill_f16;
|
||||
|
||||
vk_pipeline pipeline_geglu[2];
|
||||
vk_pipeline pipeline_reglu[2];
|
||||
|
|
@ -4589,6 +4590,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fill_f16, "fill_f16", fill_f16_len, fill_f16_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
#define CREATE_GLU(name) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
|
|
@ -9878,6 +9880,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||
if (dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_fill_f32;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
return ctx->device->pipeline_fill_f16;
|
||||
}
|
||||
return nullptr;
|
||||
default:
|
||||
return nullptr;
|
||||
|
|
@ -15747,8 +15752,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
|| (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32)
|
||||
|| (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16);
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_FILL:
|
||||
return op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_FILL:
|
||||
return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
|
||||
case GGML_OP_SCALE:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_PAD:
|
||||
|
|
|
|||
|
|
@ -906,6 +906,7 @@ void process_shaders() {
|
|||
string_to_spv("add1_f32_f32", "add1.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("arange_f32", "arange.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("fill_f32", "fill.comp", {{"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("fill_f16", "fill.comp", {{"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("step_f16", "step.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("step_f32", "step.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("round_f16", "round.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
|
|
|
|||
|
|
@ -2629,7 +2629,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
llama_log_set(log_callback_off, nullptr);
|
||||
}
|
||||
fit_params_target[0] = taxmb*1024*1024;
|
||||
bool success = (llama_params_fit(kcpp_data->model_filename.c_str(), &model_params, &llama_ctx_params,
|
||||
bool success = (common_fit_params(kcpp_data->model_filename.c_str(), &model_params, &llama_ctx_params,
|
||||
tensor_split_temp, tenos.data(), fit_params_target.data(), kcpp_data->n_ctx,
|
||||
GGML_LOG_LEVEL_NONE)==0);
|
||||
if(!dospam)
|
||||
|
|
|
|||
|
|
@ -514,27 +514,6 @@ extern "C" {
|
|||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
enum llama_params_fit_status {
|
||||
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
||||
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
|
||||
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
|
||||
};
|
||||
|
||||
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
|
||||
// - returns true if the parameters could be successfully modified to fit device memory
|
||||
// - this function is NOT thread safe because it modifies the global llama logger state
|
||||
// - only parameters that have the same value as in llama_default_model_params are modified
|
||||
// with the exception of the context size which is modified if and only if equal to 0
|
||||
LLAMA_API enum llama_params_fit_status llama_params_fit(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t * margins, // margins of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
|
||||
LLAMA_API int64_t llama_time_us(void);
|
||||
|
||||
LLAMA_API size_t llama_max_devices(void);
|
||||
|
|
@ -1549,9 +1528,6 @@ extern "C" {
|
|||
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
||||
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
||||
|
||||
// print a breakdown of per-device memory use via LLAMA_LOG:
|
||||
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// training
|
||||
//
|
||||
|
|
|
|||
|
|
@ -2646,7 +2646,7 @@ void llama_context::perf_reset() {
|
|||
n_reused = 0;
|
||||
}
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
|
||||
llama_memory_breakdown llama_context::memory_breakdown() const {
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
|
||||
for (const auto & [buft, size] : model.memory_breakdown()) {
|
||||
ret[buft].model += size;
|
||||
|
|
@ -3503,142 +3503,6 @@ void llama_perf_context_reset(llama_context * ctx) {
|
|||
ctx->perf_reset();
|
||||
}
|
||||
|
||||
void llama_memory_breakdown_print(const struct llama_context * ctx) {
|
||||
const auto & devices = ctx->get_model().devices;
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||
|
||||
std::vector<std::array<std::string, 9>> table_data;
|
||||
table_data.reserve(devices.size());
|
||||
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
|
||||
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
|
||||
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
|
||||
|
||||
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
|
||||
|
||||
constexpr size_t MiB = 1024 * 1024;
|
||||
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
|
||||
|
||||
// track seen buffer types to avoid double counting:
|
||||
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
|
||||
|
||||
// accumulative memory breakdown for each device and for host:
|
||||
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
|
||||
llama_memory_breakdown_data mb_host;
|
||||
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
mb_host.model += mb.model;
|
||||
mb_host.context += mb.context;
|
||||
mb_host.compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (dev) {
|
||||
int i_dev = -1;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (devices[i].dev == dev) {
|
||||
i_dev = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i_dev != -1) {
|
||||
mb_dev[i_dev].model += mb.model;
|
||||
mb_dev[i_dev].context += mb.context;
|
||||
mb_dev[i_dev].compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// print memory breakdown for each device:
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
ggml_backend_dev_t dev = devices[i].dev;
|
||||
llama_memory_breakdown_data mb = mb_dev[i];
|
||||
|
||||
const std::string name = ggml_backend_dev_name(dev);
|
||||
std::string desc = ggml_backend_dev_description(dev);
|
||||
for (const std::string & prefix : desc_prefixes_strip) {
|
||||
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
|
||||
desc = desc.substr(prefix.length());
|
||||
}
|
||||
}
|
||||
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
const size_t unaccounted = total - self - free;
|
||||
|
||||
table_data.push_back({
|
||||
template_gpu,
|
||||
" - " + name + " (" + desc + ")",
|
||||
std::to_string(total / MiB),
|
||||
std::to_string(free / MiB),
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
std::to_string(unaccounted / MiB)});
|
||||
}
|
||||
|
||||
// print memory breakdown for host:
|
||||
{
|
||||
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - Host",
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb_host.model / MiB),
|
||||
std::to_string(mb_host.context / MiB),
|
||||
std::to_string(mb_host.compute / MiB),
|
||||
""}); // unaccounted
|
||||
}
|
||||
|
||||
// print memory breakdown for all remaining buffer types:
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (seen_buffer_types.count(buft) == 1) {
|
||||
continue;
|
||||
}
|
||||
const std::string name = ggml_backend_buft_name(buft);
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - " + name,
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
""}); // unaccounted
|
||||
seen_buffer_types.insert(buft);
|
||||
}
|
||||
|
||||
for (size_t j = 1; j < table_data[0].size(); j++) {
|
||||
size_t max_len = 0;
|
||||
for (const auto & td : table_data) {
|
||||
max_len = std::max(max_len, td[j].length());
|
||||
}
|
||||
for (auto & td : table_data) {
|
||||
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
|
||||
}
|
||||
}
|
||||
for (const auto & td : table_data) {
|
||||
LLAMA_LOG_INFO(td[0].c_str(),
|
||||
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
|
||||
td[6].c_str(), td[7].c_str(), td[8].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// training
|
||||
//
|
||||
|
|
@ -3669,3 +3533,11 @@ void llama_opt_epoch(
|
|||
callback_train,
|
||||
callback_eval);
|
||||
}
|
||||
|
||||
//
|
||||
// ext
|
||||
//
|
||||
|
||||
llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx) {
|
||||
return ctx->memory_breakdown();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-ext.h"
|
||||
#include "llama-cparams.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-adapter.h"
|
||||
|
|
@ -22,17 +23,6 @@ class llama_io_write_i;
|
|||
struct llama_memory_i;
|
||||
struct llama_memory_context_i;
|
||||
|
||||
// "memory" as in physical memory for a buffer type, in bytes
|
||||
struct llama_memory_breakdown_data {
|
||||
size_t model = 0; // memory allocated for the model
|
||||
size_t context = 0; // memory allocated for the context
|
||||
size_t compute = 0; // memory allocated for temporary compute buffers
|
||||
|
||||
size_t total() const {
|
||||
return model + context + compute;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_context {
|
||||
// init scheduler and compute buffers, reserve worst-case graphs
|
||||
llama_context(
|
||||
|
|
@ -172,7 +162,7 @@ struct llama_context {
|
|||
llama_perf_context_data perf_get_data() const;
|
||||
void perf_reset();
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
|
||||
llama_memory_breakdown memory_breakdown() const;
|
||||
|
||||
//
|
||||
// training
|
||||
|
|
|
|||
|
|
@ -1,8 +1,12 @@
|
|||
#pragma once
|
||||
|
||||
// this is a staging header for new llama.cpp API
|
||||
// breaking changes and C++ are allowed. everything here should be considered WIP
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
|
||||
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
|
||||
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
||||
|
|
@ -14,7 +18,6 @@ LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
|||
// Get the default ggml_type for a given ftype.
|
||||
LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);
|
||||
|
||||
// Quantization state.
|
||||
struct quantize_state_impl;
|
||||
|
||||
LLAMA_API quantize_state_impl * llama_quant_init(
|
||||
|
|
@ -54,3 +57,34 @@ LLAMA_API void llama_quant_compute_types(
|
|||
ggml_tensor ** tensors,
|
||||
ggml_type * result_types,
|
||||
size_t n_tensors);
|
||||
|
||||
//
// device memory querying
//

// "memory" as in physical memory for a buffer type, in bytes
struct llama_memory_breakdown_data {
size_t model = 0; // memory allocated for the model
size_t context = 0; // memory allocated for the context
size_t compute = 0; // memory allocated for temporary compute buffers

size_t total() const {
return model + context + compute;
}
};

struct llama_device_memory_data {
int64_t total;
int64_t free;
llama_memory_breakdown_data mb;
};

// TODO: convert to C-style data structure
using llama_memory_breakdown = std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data>;

LLAMA_API int32_t llama_model_n_expert (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);

LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);

LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
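
Because this staging header is C++ (the breakdown is a std::map keyed by buffer type), callers can aggregate it directly. A hedged usage sketch of the new accessor, assuming a valid llama_context and that llama-ext.h is included:

    // Sum the per-buffer-type breakdown into one total, in bytes.
    static size_t breakdown_total_bytes(const llama_context * ctx) {
        size_t total = 0;
        for (const auto & [buft, mb] : llama_get_memory_breakdown(ctx)) {
            (void) buft;         // ggml_backend_buffer_type_t key, unused here
            total += mb.total(); // model + context + compute
        }
        return total;
    }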
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "llama-model.h"
|
||||
|
||||
#include "llama-arch.h"
|
||||
#include "llama-ext.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-mmap.h"
|
||||
|
|
@ -192,11 +193,23 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
|||
const ggml_tensor * tensor_axis_0;
|
||||
|
||||
uint32_t il;
|
||||
size_t rotation;
|
||||
size_t rotation; // when assigning tensor slices, rotate how the rounding is done for more even allocation
|
||||
};
|
||||
|
||||
auto get_tensor_config_impl = [&](
const ggml_backend_meta_split_axis axis, const std::string & suffix = "", const std::string & suffix_fallback = "") -> tensor_config {
// the layers in a tensor can be inhomogeneous, if the pattern is cleanly divided by the number of GPUs there can be aliasing effects,
// count only the same type of previous layers to avoid this
auto get_il_eff = [&](const size_t il){
size_t ret = 0;
const bool il_is_recurrent = hparams.is_recurrent(il);
const bool il_is_swa = hparams.is_swa(il);
for (size_t il_prev = 0; il_prev < il; il_prev++) {
ret += hparams.is_recurrent(il_prev) == il_is_recurrent && hparams.is_swa(il_prev) == il_is_swa;
}
return ret;
};
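
get_il_eff above counts only the preceding layers of the same kind (recurrent and SWA flags both matching), so rotating the slice assignment by the effective index stays balanced even when the layer pattern's period divides the device count. The idea in isolation, with a hypothetical per-layer kind array (hedged sketch):

    #include <cstddef>
    #include <vector>

    // Effective layer index: number of earlier layers with the same kind as layer `il`.
    // Rotating by (il_eff % n_devices) avoids always starting the same kind of layer
    // on the same device when kinds repeat with a period that divides n_devices.
    static size_t effective_layer_index(const std::vector<int> & kind, size_t il) {
        size_t ret = 0;
        for (size_t prev = 0; prev < il; prev++) {
            if (kind[prev] == kind[il]) {
                ret++;
            }
        }
        return ret;
    }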
|
||||
|
||||
uint32_t il;
|
||||
std::string prefix;
|
||||
size_t rotation;
|
||||
|
|
@ -205,13 +218,13 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
|||
GGML_ASSERT(length_prefix != std::string::npos);
|
||||
prefix = tensor_name.substr(0, length_prefix + 1);
|
||||
il = std::stoull(tensor_name.substr(4, length_prefix));
|
||||
rotation = il % ud->n_devices;
|
||||
rotation = get_il_eff(il) % ud->n_devices;
|
||||
} else if (tensor_name.substr(0, 6) == "cache_") {
|
||||
const size_t layer_index_start = tensor_name.find("_l", 6);
|
||||
GGML_ASSERT(layer_index_start != std::string::npos);
|
||||
il = std::stoull(tensor_name.substr(layer_index_start + 2));
|
||||
prefix = "blk." + std::to_string(il) + ".";
|
||||
rotation = il % ud->n_devices;
|
||||
rotation = get_il_eff(il) % ud->n_devices;
|
||||
} else {
|
||||
il = 0;
|
||||
rotation = hparams.n_layer % ud->n_devices;
|
||||
|
|
@ -9596,3 +9609,18 @@ bool llama_model_is_diffusion(const llama_model * model) {
|
|||
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
|
||||
return model->tensors_by_name;
|
||||
}
|
||||
|
||||
int32_t llama_model_n_expert(const struct llama_model * model) {
|
||||
return model->hparams.n_expert;
|
||||
}
|
||||
|
||||
int32_t llama_model_n_devices(const struct llama_model * model) {
|
||||
return (int32_t)model->devices.size();
|
||||
}
|
||||
|
||||
ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i) {
|
||||
if (i < 0 || i >= (int)model->devices.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
return model->devices[i].dev;
|
||||
}
|
||||
|
|
|
|||
761 src/llama.cpp
|
|
@ -25,6 +25,7 @@ static bool old_mixtral_warning_showed = false;
|
|||
#include "llama-graph.cpp"
|
||||
#include "llama-io.cpp"
|
||||
#include "llama-memory.cpp"
|
||||
#include "fit.cpp"
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpp.h"
|
||||
|
|
@ -70,766 +71,6 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
|
|||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
struct llama_device_memory_data {
|
||||
int64_t total;
|
||||
int64_t free;
|
||||
llama_memory_breakdown_data mb;
|
||||
};
|
||||
|
||||
static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||
const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
|
||||
std::vector<llama_device> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
|
||||
const ggml_log_level log_level) {
|
||||
struct user_data_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
void * user_data;
|
||||
} original_logger;
|
||||
ggml_log_level min_level; // prints below this log level go to debug log
|
||||
};
|
||||
user_data_t ud;
|
||||
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||
ud.min_level = log_level;
|
||||
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||
const user_data_t * ud = (const user_data_t *) user_data;
|
||||
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||
}, &ud);
|
||||
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
mparams_copy.no_alloc = true;
|
||||
mparams_copy.use_mmap = false;
|
||||
mparams_copy.use_mlock = false;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||
if (model == nullptr) {
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to load model");
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_init_from_model(model, *cparams);
|
||||
if (ctx == nullptr) {
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to create llama_context from model");
|
||||
}
|
||||
|
||||
const size_t nd = model->n_devices();
|
||||
std::vector<llama_device_memory_data> ret(nd + 1);
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||
|
||||
for (const auto & [buft, mb] : memory_breakdown) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
ret.back().mb.model += mb.model;
|
||||
ret.back().mb.context += mb.context;
|
||||
ret.back().mb.compute += mb.compute;
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (!dev) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
if (model->devices[i].dev == dev) {
|
||||
ret[i].mb.model += mb.model;
|
||||
ret[i].mb.context += mb.context;
|
||||
ret[i].mb.compute += mb.compute;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (cpu_dev == nullptr) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(cpu_dev, &free, &total);
|
||||
ret.back().free = free;
|
||||
ret.back().total = total;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
|
||||
|
||||
// devices can return 0 bytes for free and total memory if they do not
|
||||
// have any to report. in this case, we will use the host memory as a fallback
|
||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
||||
if (free == 0 && total == 0) {
|
||||
free = ret.back().free;
|
||||
total = ret.back().total;
|
||||
}
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
}
|
||||
|
||||
devs = model->devices;
|
||||
hp_ngl = model->hparams.n_layer;
|
||||
hp_n_ctx_train = model->hparams.n_ctx_train;
|
||||
hp_n_expert = model->hparams.n_expert;
|
||||
|
||||
llama_memory_breakdown_print(ctx); // goes to debug log
|
||||
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// enum to identify part of a layer for distributing its tensors:
|
||||
enum layer_fraction_t {
|
||||
LAYER_FRACTION_NONE = 0, // nothing
|
||||
LAYER_FRACTION_ATTN = 1, // attention
|
||||
LAYER_FRACTION_UP = 2, // attention + up
|
||||
LAYER_FRACTION_GATE = 3, // attention + up + gate
|
||||
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
|
||||
};
|
||||
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
|
||||
|
||||
class llama_params_fit_exception : public std::runtime_error {
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
static void llama_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
|
||||
throw llama_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
|
||||
}
|
||||
constexpr int64_t MiB = 1024*1024;
|
||||
typedef std::vector<llama_device_memory_data> dmds_t;
|
||||
const llama_model_params default_mparams = llama_model_default_params();
|
||||
|
||||
std::vector<llama_device> devs;
|
||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
|
||||
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
margins.reserve(nd);
|
||||
if (nd == 0) {
|
||||
margins.push_back(margins_s[0]);
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
margins.push_back(margins_s[id]);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
{
|
||||
dev_names.reserve(nd);
|
||||
size_t max_length = 0;
|
||||
for (const llama_device & dev : devs) {
|
||||
std::string name = ggml_backend_dev_name(dev.dev);
|
||||
name += " (";
|
||||
name += ggml_backend_dev_description(dev.dev);
|
||||
name += ")";
|
||||
dev_names.push_back(name);
|
||||
max_length = std::max(max_length, name.length());
|
||||
}
|
||||
for (std::string & dn : dev_names) {
|
||||
dn.insert(dn.end(), max_length - dn.length(), ' ');
|
||||
}
|
||||
}
|
||||
|
||||
int64_t sum_free = 0;
|
||||
int64_t sum_projected_free = 0;
|
||||
int64_t sum_projected_used = 0;
|
||||
int64_t sum_projected_model = 0;
|
||||
std::vector<int64_t> projected_free_per_device;
|
||||
projected_free_per_device.reserve(nd);
|
||||
|
||||
if (nd == 0) {
|
||||
sum_projected_used = dmds_full.back().mb.total();
|
||||
sum_free = dmds_full.back().total;
|
||||
sum_projected_free = sum_free - sum_projected_used;
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (sum_projected_free >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
|
||||
__func__, sum_projected_free/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
projected_free_per_device.push_back(projected_free);
|
||||
|
||||
sum_free += dmd.free;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
sum_projected_model += dmd.mb.model;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
|
||||
}
|
||||
}
|
||||
assert(sum_free >= 0 && sum_projected_used >= 0);
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_free/MiB);
|
||||
if (nd == 1) {
|
||||
if (projected_free_per_device[0] >= margins[0]) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bool changes_needed = false;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (projected_free_per_device[id] < margins[id]) {
|
||||
changes_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!changes_needed) {
|
||||
LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// step 2: try reducing memory use by reducing the context size
|
||||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free;
|
||||
if (nd == 0) {
|
||||
global_surplus -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
global_surplus -= margins[id];
|
||||
}
|
||||
}
|
||||
if (global_surplus < 0) {
|
||||
if (nd <= 1) {
|
||||
LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
|
||||
__func__, margins[0]/MiB, -global_surplus/MiB);
|
||||
} else {
|
||||
LLAMA_LOG_INFO(
|
||||
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, -global_surplus/MiB);
|
||||
}
|
||||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
int64_t sum_used_target = sum_free;
|
||||
if (nd == 0) {
|
||||
sum_used_target -= margins[0];
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_used_target -= margins[id];
|
||||
}
|
||||
}
|
||||
if (nd > 1) {
|
||||
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
|
||||
// - for dense models only whole layers can be assigned to devices
|
||||
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
|
||||
// - on average we expect a waste of 0.5 layers/tensors per device
|
||||
// - use slightly more than the expected average for nd devices to be safe
|
||||
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
|
||||
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
|
||||
}
|
||||
|
||||
int64_t sum_projected_used_min_ctx = 0;
|
||||
cparams->n_ctx = n_ctx_min;
|
||||
const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
if (nd == 0) {
|
||||
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
|
||||
} else {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
|
||||
}
|
||||
}
|
||||
if (sum_used_target > sum_projected_used_min_ctx) {
|
||||
// linear interpolation between minimum and maximum context size:
|
||||
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
|
||||
/ (sum_projected_used - sum_projected_used_min_ctx);
|
||||
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend

                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
                        LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                        if (nd <= 1) {
                            LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                            return;
                        }
                        LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
                    } else {
                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
                        LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                    }
                } else {
                    if (n_ctx_min == UINT32_MAX) {
                        LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
                    } else {
                        LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
                            __func__, hp_nct, n_ctx_min);
                    }
                }
            } else {
                LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
            }
        }
    }
    if (nd == 0) {
        throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
    }

    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
        throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
    }
    if (nd > 1) {
        if (!tensor_split) {
            throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
        }
        if (mparams->tensor_split) {
            for (size_t id = 0; id < nd; id++) {
                if (mparams->tensor_split[id] != 0.0f) {
                    throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
                }
            }
        }
        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
            throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
        }
    }
    if (!tensor_buft_overrides) {
        throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
    }
    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
        throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
    }

    // step 3: iteratively fill the devices back to front with "dense" layers
    // - for a dense model simply fill full layers, giving each device a contiguous slice of the model
    // - for a MoE model, same as dense model but with all MoE tensors in system memory

    // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
    auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
        constexpr size_t n_strings = 1000;
        if (il >= n_strings) {
            throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
        }
        switch (lf) {
            case LAYER_FRACTION_ATTN: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_UP: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_GATE: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_MOE: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
                }
                return patterns[il].c_str();
            }
            default:
                GGML_ABORT("fatal error");
        }
    };
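    // examples of the patterns produced above (illustrative): for il == 3, LAYER_FRACTION_MOE yields
    // "blk\.3\.ffn_(up|down|gate_up|gate)_(ch|)exps" (only the MoE expert tensors overflow), while
    // LAYER_FRACTION_ATTN yields "blk\.3\.ffn_(gate|up|gate_up|down).*" (the whole FFN overflows and
    // only the attention tensors of the layer stay on the device).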

    struct ngl_t {
        uint32_t n_layer = 0; // number of total layers
        uint32_t n_part = 0; // number of partial layers, <= n_layer

        // for the first partial layer, varying parts can overflow; all further partial layers use LAYER_FRACTION_MOE:
        layer_fraction_t overflow_type = LAYER_FRACTION_MOE;

        uint32_t n_full() const {
            assert(n_layer >= n_part);
            return n_layer - n_part;
        }
    };
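    // example (illustrative): ngl_t{/*n_layer =*/ 10, /*n_part =*/ 3} describes a device holding 10 layers,
    // 7 of them full (n_full() == 7) and 3 with their overflowing tensors redirected elsewhere.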

    const size_t ntbo = llama_max_tensor_buft_overrides();

    // utility function to set n_gpu_layers and tensor_split
    auto set_ngl_tensor_split_tbo = [&](
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
            llama_model_params & mparams) {
        mparams.n_gpu_layers = 0;
        for (size_t id = 0; id < nd; id++) {
            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
            if (nd > 1) {
                tensor_split[id] = ngl_per_device[id].n_layer;
            }
        }
        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides

        mparams.tensor_split = tensor_split;

        size_t itbo = 0;
        for (size_t id = 0; id < nd; id++) {
            il0 += ngl_per_device[id].n_full();
            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
                if (itbo + 1 >= ntbo) {
                    tensor_buft_overrides[itbo].pattern = nullptr;
                    tensor_buft_overrides[itbo].buft = nullptr;
                    itbo++;
                    mparams.tensor_buft_overrides = tensor_buft_overrides;
                    throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
                        + std::to_string(ntbo) + " is insufficient for model");
                }
                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
                tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
                itbo++;
            }
            il0 += ngl_per_device[id].n_part;
        }
        tensor_buft_overrides[itbo].pattern = nullptr;
        tensor_buft_overrides[itbo].buft = nullptr;
        itbo++;
        mparams.tensor_buft_overrides = tensor_buft_overrides;
    };
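    // usage sketch (hypothetical numbers): for ngl_per_device = {{16, 0}, {14, 2}} on two devices this sets
    // n_gpu_layers = 30, tensor_split = {16, 14}, and writes tensor_buft_overrides entries for the 2 partial
    // layers of the second device (plus the terminating {nullptr, nullptr} entry).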

    // utility function that returns the memory use per device for given numbers of layers per device
    auto get_memory_for_layers = [&](
            const char * func_name,
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
        llama_model_params mparams_copy = *mparams;
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);

        const dmds_t dmd_nl = llama_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
        for (size_t id = 0; id < nd; id++) {
            const ngl_t & n = ngl_per_device[id];
            LLAMA_LOG_DEBUG(
                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
        }

        std::vector<int64_t> ret;
        ret.reserve(nd);
        for (size_t id = 0; id < nd; id++) {
            ret.push_back(dmd_nl[id].mb.total());
        }
        return ret;
    };

    int64_t global_surplus_cpu_moe = 0;
    if (hp_nex > 0) {
        const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
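        // e.g. tensor names like "blk.17.ffn_down_exps" or "blk.5.ffn_gate_chexps" match this pattern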
        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
        tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
        tensor_buft_overrides[1] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;

        LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
        const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        for (size_t id = 0; id < nd; id++) {
            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
        }

        if (global_surplus_cpu_moe > 0) {
            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
                __func__, global_surplus_cpu_moe/MiB);
        } else {
            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
                __func__, -global_surplus_cpu_moe/MiB);
        }

        // reset
        tensor_buft_overrides[0] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;
    }

    std::vector<int64_t> targets; // maximum acceptable memory use per device
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        targets.push_back(dmds_full[id].free - margins[id]);
        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
    overflow_bufts.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
    }

    std::vector<ngl_t> ngl_per_device(nd);
    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);

    // optimize the number of layers per device using the method of false position:
    // - ngl_per_device has 0 layers for each device, lower bound
    // - try a "high" configuration where a device is given all unassigned layers
    // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
    // - check memory use of our guess, replace either the low or high bound
    // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
    // - the last device has the output layer, which cannot be a partial layer
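    // numeric sketch of one false-position step (hypothetical values): if 0 layers use 1 GiB (mem), all 40
    // unassigned layers would use 41 GiB (mem_high), and the target is 21 GiB, the first guess is
    // 40 * (21 - 1) / (41 - 1) = 20 layers; the guess then replaces either the lower or the upper bound.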
    if (hp_nex == 0) {
        LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
    } else {
        LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
    }
    for (int id = nd - 1; id >= 0; id--) {
        uint32_t n_unassigned = hp_ngl + 1;
        for (size_t jd = id + 1; jd < nd; ++jd) {
            assert(n_unassigned >= ngl_per_device[jd].n_layer);
            n_unassigned -= ngl_per_device[jd].n_layer;
        }

        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        ngl_per_device_high[id].n_layer = n_unassigned;
        if (hp_nex > 0) {
            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
        }
        if (ngl_per_device_high[id].n_layer > 0) {
            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
            if (mem_high[id] > targets[id]) {
                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
                while (delta > 1) {
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                    step_size = std::max(step_size, uint32_t(1));
                    step_size = std::min(step_size, delta - 1);

                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                    ngl_per_device_test[id].n_layer += step_size;
                    if (hp_nex) {
                        ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
                            step_size - 1 : step_size; // the first layer is the output layer which must always be full
                    }
                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

                    if (mem_test[id] <= targets[id]) {
                        ngl_per_device = ngl_per_device_test;
                        mem = mem_test;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    } else {
                        ngl_per_device_high = ngl_per_device_test;
                        mem_high = mem_test;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
                    }
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                }
            } else {
                assert(ngl_per_device_high[id].n_layer == n_unassigned);
                ngl_per_device = ngl_per_device_high;
                mem = mem_high;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
    }
    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
        return;
    }

    // step 4: for a MoE model where all dense tensors fit,
    // convert the dense-only layers in the back to full layers in the front until all devices are full
    // essentially the same procedure as for the dense-only layers except front-to-back
    // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
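    // e.g. (hypothetical): a device whose budget held 10 dense-only layers might only hold 4 of them as full
    // layers; the remaining layers keep their MoE tensors in system memory or overflow to the next device.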

    size_t id_dense_start = nd;
    for (int id = nd - 1; id >= 0; id--) {
        if (ngl_per_device[id].n_layer > 0) {
            id_dense_start = id;
            continue;
        }
        break;
    }
    assert(id_dense_start < nd);

    LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
            ngl_per_device_high[id].n_layer += n_layer_move;
            ngl_per_device_high[jd].n_layer -= n_layer_move;
            ngl_per_device_high[jd].n_part = 0;
        }
        size_t id_dense_start_high = nd - 1;
        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);

        if (mem_high[id] > targets[id]) {
            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
            while (delta > 1) {
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                step_size = std::max(step_size, uint32_t(1));
                step_size = std::min(step_size, delta - 1);

                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                size_t id_dense_start_test = id_dense_start;
                uint32_t n_converted_test = 0;
                for (;id_dense_start_test < nd; id_dense_start_test++) {
                    const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
                    ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
                    ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
                    ngl_per_device_test[id].n_layer += n_convert_jd;
                    n_converted_test += n_convert_jd;

                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
                        break;
                    }
                }
                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

                if (mem_test[id] <= targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                } else {
                    ngl_per_device_high = ngl_per_device_test;
                    mem_high = mem_test;
                    id_dense_start_high = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
            }
        } else {
            ngl_per_device = ngl_per_device_high;
            mem = mem_high;
            id_dense_start = id_dense_start_high;
            LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
        }

        // try to fit at least part of one more layer
        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
            size_t id_dense_start_test = id_dense_start;
            ngl_per_device_test[id_dense_start_test].n_layer--;
            ngl_per_device_test[id_dense_start_test].n_part--;
            ngl_per_device_test[id].n_layer++;
            ngl_per_device_test[id].n_part++;
            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
                id_dense_start_test++;
            }
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
            if (id < nd - 1) {
                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1].dev);
            }
            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                ngl_per_device = ngl_per_device_test;
                overflow_bufts = overflow_bufts_test;
                mem = mem_test;
                id_dense_start = id_dense_start_test;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);

                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
                    overflow_bufts = overflow_bufts_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
                    overflow_bufts = overflow_bufts_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            }
        }
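        // note on the order of attempts above (summary, not new logic): LAYER_FRACTION_UP keeps attention
        // plus ffn_up of the extra layer on the device; if that fits, LAYER_FRACTION_GATE (only ffn_down
        // overflows) is tried as a larger option, otherwise LAYER_FRACTION_ATTN (only attention stays)
        // is tried as a smaller one.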

        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }

    // print info for devices that were not changed during the conversion from dense-only to full layers:
    for (size_t id = id_dense_start + 1; id < nd; id++) {
        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }

    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
}

enum llama_params_fit_status llama_params_fit(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
        size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    const int64_t t0_us = llama_time_us();
    llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
    try {
        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
        LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
    } catch (const llama_params_fit_exception & e) {
        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
        status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
    } catch (const std::runtime_error & e) {
        LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
        status = LLAMA_PARAMS_FIT_STATUS_ERROR;
    }
    const int64_t t1_us = llama_time_us();
    LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
    return status;
}

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
@ -1,14 +1,12 @@
#include "llama.h"
#include "../src/llama-ext.h"

#include "arg.h"
#include "common.h"
#include "fit.h"
#include "log.h"

#include <chrono>
#include <cinttypes>
#include <thread>

using namespace std::chrono_literals;

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -19,49 +17,58 @@ int main(int argc, char ** argv) {
    common_init();

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FIT_PARAMS)) {
        return 1;
    }

    llama_backend_init();
    llama_numa_init(params.numa);

    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);
    const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
        params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
        LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
        exit(1);
    }

    LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__);
    common_log_flush(common_log_main());
    printf("-c %" PRIu32 " -ngl %" PRIi32, cparams.n_ctx, mparams.n_gpu_layers);
    if (!params.fit_params_print) {
        const common_params_fit_status status = common_fit_params(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
        if (status != COMMON_PARAMS_FIT_STATUS_SUCCESS) {
            LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
            exit(1);
        }

    size_t nd = llama_max_devices();
    while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
        nd--;
    }
    if (nd > 1) {
        for (size_t id = 0; id < nd; id++) {
            if (id == 0) {
                printf(" -ts ");
        LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__);
        common_log_flush(common_log_main());
        printf("-c %" PRIu32 " -ngl %" PRIi32, cparams.n_ctx, mparams.n_gpu_layers);

        size_t nd = llama_max_devices();
        while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
            nd--;
        }
        if (nd > 1) {
            for (size_t id = 0; id < nd; id++) {
                if (id == 0) {
                    printf(" -ts ");
                }
                printf("%s%" PRIu32, id > 0 ? "," : "", uint32_t(mparams.tensor_split[id]));
            }
            printf("%s%" PRIu32, id > 0 ? "," : "", uint32_t(mparams.tensor_split[id]));
        }
        }

    const size_t ntbo = llama_max_tensor_buft_overrides();
    bool any_tbo = false;
    for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) {
        if (itbo == 0) {
            printf(" -ot \"");
        const size_t ntbo = llama_max_tensor_buft_overrides();
        bool any_tbo = false;
        for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) {
            if (itbo == 0) {
                printf(" -ot \"");
            }
            printf("%s%s=%s", itbo > 0 ? "," : "", mparams.tensor_buft_overrides[itbo].pattern, ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft));
            any_tbo = true;
        }
        printf("%s%s=%s", itbo > 0 ? "," : "", mparams.tensor_buft_overrides[itbo].pattern, ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft));
        any_tbo = true;
        printf("%s\n", any_tbo ? "\"" : "");
    } else {
        LOG_INF("%s: printing estimated memory in MiB to stdout (device, model, context, compute) ...\n", __func__);
        common_log_flush(common_log_main());

        common_fit_print(params.model.path.c_str(), &mparams, &cparams);
    }
    printf("%s\n", any_tbo ? "\"" : "");

    return 0;
}
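// example of the printed output (hypothetical values): a fitted run might emit something like
//   -c 32768 -ngl 45 -ts 25,20 -ot "blk\.40\.ffn_(gate|gate_up|down).*=CUDA1,blk\.41\.ffn_(up|down|gate_up|gate)_(ch|)exps=CPU"
// which can then be passed to other llama.cpp tools to reuse the fitted configuration.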
|
||||
|
|
|
|||
|
|
@ -33,10 +33,16 @@ struct mtmd_bitmap {
|
|||
bool is_audio = false; // true if the bitmap is audio
|
||||
};
|
||||
|
||||
// position indexing for decoder model
|
||||
enum mtmd_pos_type {
|
||||
MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens
|
||||
MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes
|
||||
};
|
||||
|
||||
struct mtmd_image_tokens {
|
||||
uint32_t nx; // number of tokens in x direction
|
||||
uint32_t ny; // number of tokens in y direction
|
||||
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
|
||||
mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
|
||||
uint32_t n_tokens() const { return nx * ny; }
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
std::string id; // optional user-defined ID, useful for KV cache tracking
|
||||
|
|
@ -45,7 +51,7 @@ struct mtmd_image_tokens {
|
|||
return mtmd_image_tokens{
|
||||
nx,
|
||||
ny,
|
||||
use_mrope_pos,
|
||||
pos,
|
||||
batch_f32.clone(),
|
||||
id
|
||||
};
|
||||
|
|
@ -131,6 +137,7 @@ struct mtmd_context {
|
|||
int n_threads;
|
||||
std::string media_marker;
|
||||
const int n_embd_text;
|
||||
mtmd_pos_type pos_type;
|
||||
|
||||
// these are not token, but strings used to mark the beginning and end of image/audio embeddings
|
||||
std::string img_beg;
|
||||
|
|
@ -177,6 +184,22 @@ struct mtmd_context {
|
|||
throw std::runtime_error("media_marker must not be empty");
|
||||
}
|
||||
|
||||
auto decoder_rope_type = llama_model_rope_type(text_model);
|
||||
switch (decoder_rope_type) {
|
||||
case LLAMA_ROPE_TYPE_NORM:
|
||||
case LLAMA_ROPE_TYPE_NEOX:
|
||||
{
|
||||
pos_type = MTMD_POS_TYPE_NORMAL;
|
||||
} break;
|
||||
case LLAMA_ROPE_TYPE_MROPE:
|
||||
case LLAMA_ROPE_TYPE_IMROPE:
|
||||
{
|
||||
pos_type = MTMD_POS_TYPE_MROPE;
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("unsupported decoder rope type: %d\n", decoder_rope_type));
|
||||
}
|
||||
|
||||
clip_context_params ctx_clip_params {
|
||||
/* use_gpu */ ctx_params.use_gpu,
|
||||
/* flash_attn_type */ mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type),
|
||||
|
|
@ -777,12 +800,12 @@ struct mtmd_tokenizer {
|
|||
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
||||
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
|
||||
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
|
||||
image_tokens->use_mrope_pos = true;
|
||||
} else {
|
||||
// other models, we only need the total number of tokens
|
||||
image_tokens->nx = n_tokens;
|
||||
image_tokens->ny = 1;
|
||||
}
|
||||
image_tokens->pos = ctx->pos_type;
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
image_tokens->id = bitmap->id; // optional
|
||||
|
||||
|
|
@ -1014,7 +1037,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
|||
return ctx->image_embd_v.data();
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
auto proj_type = ctx->proj_type_v();
|
||||
if (chunk && chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
proj_type = ctx->proj_type_a();
|
||||
|
|
@ -1028,32 +1051,19 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx, const mtmd_input_chunk * chu
|
|||
}
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
||||
if (ctx->ctx_v == nullptr && ctx->proj_type_a() == PROJECTOR_TYPE_QWEN3A) {
|
||||
// qwen3-asr
|
||||
return true;
|
||||
}
|
||||
switch (ctx->proj_type_v()) {
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
bool mtmd_decode_use_mrope(const mtmd_context * ctx) {
|
||||
return ctx->pos_type == MTMD_POS_TYPE_MROPE;
|
||||
}
|
||||
|
||||
bool mtmd_support_vision(mtmd_context * ctx) {
|
||||
bool mtmd_support_vision(const mtmd_context * ctx) {
|
||||
return ctx->ctx_v != nullptr;
|
||||
}
|
||||
|
||||
bool mtmd_support_audio(mtmd_context * ctx) {
|
||||
bool mtmd_support_audio(const mtmd_context * ctx) {
|
||||
return ctx->ctx_a != nullptr;
|
||||
}
|
||||
|
||||
int mtmd_get_audio_sample_rate(mtmd_context * ctx) {
|
||||
int mtmd_get_audio_sample_rate(const mtmd_context * ctx) {
|
||||
if (!ctx->ctx_a) {
|
||||
return -1;
|
||||
}
|
||||
|
|
@ -1248,12 +1258,24 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
|
|||
|
||||
mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i) {
|
||||
mtmd_decoder_pos pos;
|
||||
// M-RoPE logic
|
||||
// TODO: support other types of position encoding if needed
|
||||
pos.t = pos_0;
|
||||
pos.x = pos_0 + (i % image_tokens->nx);
|
||||
pos.y = pos_0 + (i / image_tokens->nx);
|
||||
pos.z = 0; // unused for now
|
||||
switch (image_tokens->pos) {
|
||||
case MTMD_POS_TYPE_MROPE:
|
||||
{
|
||||
pos.t = pos_0;
|
||||
pos.x = pos_0 + (i % image_tokens->nx);
|
||||
pos.y = pos_0 + (i / image_tokens->nx);
|
||||
pos.z = 0; // unused for now
|
||||
} break;
|
||||
case MTMD_POS_TYPE_NORMAL:
|
||||
{
|
||||
pos.t = pos_0 + i;
|
||||
pos.x = pos_0 + i;
|
||||
pos.y = pos_0 + i;
|
||||
pos.z = pos_0 + i;
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("invalid position type");
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
|
@ -1262,12 +1284,14 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
|
|||
}
|
||||
|
||||
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
|
||||
if (image_tokens->use_mrope_pos) {
|
||||
// for M-RoPE, temporal dimension = max(t,h,w)
|
||||
// t is omitted as we don't support video input
|
||||
return std::max(image_tokens->nx, image_tokens->ny);
|
||||
switch (image_tokens->pos) {
|
||||
case MTMD_POS_TYPE_MROPE:
|
||||
return std::max(image_tokens->nx, image_tokens->ny);
|
||||
case MTMD_POS_TYPE_NORMAL:
|
||||
return image_tokens->n_tokens();
|
||||
default:
|
||||
GGML_ABORT("invalid position type");
|
||||
}
|
||||
return image_tokens->n_tokens();
|
||||
}
|
||||
|
||||
// test function
|
||||
|
|
|
|||
|
|
@ -112,20 +112,20 @@ MTMD_API void mtmd_free(mtmd_context * ctx);
|
|||
|
||||
// whether we need to set non-causal mask before llama_decode
|
||||
// if chunk is nullptr, we assume the default case where chunk is an image chunk
|
||||
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx, const mtmd_input_chunk * chunk);
|
||||
MTMD_API bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk);
|
||||
|
||||
// whether the current model use M-RoPE for llama_decode
|
||||
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
|
||||
MTMD_API bool mtmd_decode_use_mrope(const mtmd_context * ctx);
|
||||
|
||||
// whether the current model supports vision input
|
||||
MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
|
||||
MTMD_API bool mtmd_support_vision(const mtmd_context * ctx);
|
||||
|
||||
// whether the current model supports audio input
|
||||
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
|
||||
MTMD_API bool mtmd_support_audio(const mtmd_context * ctx);
|
||||
|
||||
// get audio sample rate in Hz, for example 16000 for Whisper
|
||||
// return -1 if audio is not supported
|
||||
MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
|
||||
MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
|
||||
|
||||
// mtmd_bitmap
|
||||
//
|
||||
|
|
|
|||
|
|
@ -3653,34 +3653,6 @@ void server_routes::init_routes() {
|
|||
return res;
|
||||
};
|
||||
|
||||
this->get_api_show = [this](const server_http_req &) {
|
||||
auto res = create_response();
|
||||
std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
|
||||
json data = {
|
||||
{
|
||||
"model_info", {
|
||||
{ "llama.context_length", meta->slot_n_ctx },
|
||||
}
|
||||
},
|
||||
{"modelfile", ""},
|
||||
{"parameters", ""},
|
||||
{"template", tmpl_default},
|
||||
{"details", {
|
||||
{"parent_model", ""},
|
||||
{"format", "gguf"},
|
||||
{"family", ""},
|
||||
{"families", {""}},
|
||||
{"parameter_size", ""},
|
||||
{"quantization_level", ""}
|
||||
}},
|
||||
{"model_info", ""},
|
||||
{"capabilities", meta->has_mtmd ? json({"completion","multimodal"}) : json({"completion"})}
|
||||
};
|
||||
|
||||
res->ok(data);
|
||||
return res;
|
||||
};
|
||||
|
||||
this->post_infill = [this](const server_http_req & req) {
|
||||
auto res = create_response();
|
||||
// check model compatibility
|
||||
|
|
|
|||
|
|
@ -105,7 +105,6 @@ struct server_routes {
|
|||
server_http_context::handler_t post_slots;
|
||||
server_http_context::handler_t get_props;
|
||||
server_http_context::handler_t post_props;
|
||||
server_http_context::handler_t get_api_show;
|
||||
server_http_context::handler_t post_infill;
|
||||
server_http_context::handler_t post_completions;
|
||||
server_http_context::handler_t post_completions_oai;
|
||||
|
|
|
|||
|
|
@ -143,7 +143,6 @@ bool server_http_context::init(const common_params & params) {
|
|||
"/v1/health",
|
||||
"/models",
|
||||
"/v1/models",
|
||||
"/api/tags",
|
||||
"/",
|
||||
"/index.html",
|
||||
"/bundle.js",
|
||||
|
|
|
|||
|
|
@ -1147,7 +1147,7 @@ server_http_proxy::server_http_proxy(
|
|||
|
||||
// setup Client
|
||||
cli->set_follow_location(true);
|
||||
cli->set_connection_timeout(5, 0); // 5 seconds
|
||||
cli->set_connection_timeout(timeout_read, 0); // use --timeout value instead of hardcoded 5 s
|
||||
cli->set_write_timeout(timeout_read, 0); // reversed for cli (client) vs srv (server)
|
||||
cli->set_read_timeout(timeout_write, 0);
|
||||
this->status = 500; // to be overwritten upon response
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#include "arg.h"
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "fit.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
|
||||
|
|
@ -141,7 +142,6 @@ int main(int argc, char ** argv) {
|
|||
// note: routes.get_health stays the same
|
||||
routes.get_metrics = models_routes->proxy_get;
|
||||
routes.post_props = models_routes->proxy_post;
|
||||
routes.get_api_show = models_routes->proxy_get;
|
||||
routes.post_completions = models_routes->proxy_post;
|
||||
routes.post_completions_oai = models_routes->proxy_post;
|
||||
routes.post_chat_completions = models_routes->proxy_post;
|
||||
|
|
@ -174,16 +174,13 @@ int main(int argc, char ** argv) {
|
|||
ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
|
||||
ctx_http.get ("/props", ex_wrapper(routes.get_props));
|
||||
ctx_http.post("/props", ex_wrapper(routes.post_props));
|
||||
ctx_http.post("/api/show", ex_wrapper(routes.get_api_show));
|
||||
ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check)
|
||||
ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check)
|
||||
ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check)
|
||||
ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy
|
||||
ctx_http.post("/completions", ex_wrapper(routes.post_completions));
|
||||
ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai));
|
||||
ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions));
|
||||
ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
|
||||
ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
|
||||
ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai));
|
||||
ctx_http.post("/responses", ex_wrapper(routes.post_responses_oai));
|
||||
ctx_http.post("/v1/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai));
|
||||
|
|
@ -348,7 +345,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
auto * ll_ctx = ctx_server.get_llama_context();
|
||||
if (ll_ctx != nullptr) {
|
||||
llama_memory_breakdown_print(ll_ctx);
|
||||
common_memory_breakdown_print(ll_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||