support different window sizes

Repository: https://github.com/Lizonghang/prima.cpp.git (mirror)
Commit: 76a7fc7527
Parent: 5685cb87ed
6 changed files with 200 additions and 127 deletions

@@ -677,9 +677,36 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
     ).set_env("LLAMA_ARG_RANK"));
     add_opt(llama_arg(
         {"-lw", "--layer-window", "--n-layer-window"}, "N",
-        format("number of layers to process in each compute (default: %d)", params.n_layer_window),
-        [](gpt_params & params, int value) {
-            params.n_layer_window = value;
+        format("number of layers to process in each compute (e.g., 16,16)"),
+        [](gpt_params & params, const std::string & value) {
+            uint32_t result[32] = {0};
+            size_t index = 0;
+            std::stringstream ss(value);
+            std::string item;
+
+            while (std::getline(ss, item, ',')) {
+                try {
+                    int num = std::stoi(item);
+
+                    if (num <= 0) {
+                        throw std::runtime_error("All values in --n-layer-window must be non-zero positive integers");
+                    }
+
+                    if (index >= 32) {
+                        throw std::runtime_error("Too many values in --n-layer-window (maximum is 32)");
+                    }
+
+                    result[index++] = static_cast<uint32_t>(num);
+                } catch (const std::invalid_argument &) {
+                    throw std::runtime_error("Non-integer value found in --n-layer-window");
+                }
+            }
+
+            if (index == 0) {
+                throw std::runtime_error("Input cannot be empty");
+            }
+
+            std::copy(std::begin(result), std::end(result), params.n_layer_window);
         }
     ).set_env("LLAMA_ARG_N_LAYER_WINDOW"));
     add_opt(llama_arg(
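
Note: the window sizes are now given as a comma-separated list, one entry per node (at most 32), and every entry must be a positive integer. Below is a minimal standalone sketch of that parsing behavior, not taken from the repository; parse_layer_window is a hypothetical helper used only for illustration.

// Illustrative only: mirrors the parsing rules of --n-layer-window shown above.
#include <cstdint>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

static void parse_layer_window(const std::string & value, uint32_t (&out)[32]) {
    size_t index = 0;
    std::stringstream ss(value);
    std::string item;
    while (std::getline(ss, item, ',')) {
        int num = std::stoi(item);                       // throws std::invalid_argument on non-numeric input
        if (num <= 0)    { throw std::runtime_error("values must be positive"); }
        if (index >= 32) { throw std::runtime_error("at most 32 values"); }
        out[index++] = static_cast<uint32_t>(num);
    }
    if (index == 0) { throw std::runtime_error("input cannot be empty"); }
}

int main() {
    uint32_t window[32] = {0};
    parse_layer_window("16,16", window);                 // e.g. what -lw 16,16 would pass in
    std::cout << window[0] << " " << window[1] << " " << window[2] << "\n";  // prints: 16 16 0
}
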
@@ -858,7 +858,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
         if (!ok) {
             llama_free_model(model);
 
             return iparams;
         }
     }
@@ -986,7 +985,6 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     }
     mparams.n_world = params.n_world;
     mparams.rank = params.rank;
-    mparams.n_layer_window = params.n_layer_window;
     mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
@@ -994,6 +992,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1038,8 +1037,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 
     cparams.n_world = params.n_world;
     cparams.rank = params.rank;
-    cparams.n_layer_window = params.n_layer_window;
     cparams.unload = params.unload;
+    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
 
     if (cparams.master_ip != nullptr) {
         delete[] cparams.master_ip;
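
Note: the two hunks above replace plain assignment with std::copy because n_layer_window is now a fixed-size array, and C arrays cannot be assigned with '='. A minimal illustration follows; the struct and function names are placeholders, not prima.cpp types.

#include <algorithm>
#include <cstdint>
#include <iterator>

struct params_t  { uint32_t n_layer_window[32]; };
struct cparams_t { uint32_t n_layer_window[32]; };

void copy_window(const params_t & p, cparams_t & c) {
    // c.n_layer_window = p.n_layer_window;   // does not compile: arrays are not assignable
    std::copy(std::begin(p.n_layer_window), std::end(p.n_layer_window), c.n_layer_window);
}
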
@@ -144,7 +144,7 @@ struct gpt_sampler_params {
 struct gpt_params {
     int32_t n_world = 1; // number of devices to use
     int32_t rank = 0; // my rank for distributed inference
-    int32_t n_layer_window = 32; // number of layers to process in each compute
+    uint32_t n_layer_window[32] = {32}; // layer window size on each node
     std::string master_ip = "localhost"; // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload = false; // unload layer weights after use or not
@@ -144,7 +144,18 @@ int main(int argc, char ** argv) {
     }
     const uint32_t n_world = params.n_world;
     const uint32_t my_rank = params.rank;
 
+    // check if --n-layer-window and --world is matched
+    uint32_t non_zero_count = 0;
+    size_t size = sizeof(params.n_layer_window) / sizeof(params.n_layer_window[0]);
+    for (size_t i = 0; i < size; ++i) {
+        if (params.n_layer_window[i] != 0) {
+            ++non_zero_count;
+        }
+    }
+
     GGML_ASSERT(!(n_world == 1 && my_rank > 0));
+    GGML_ASSERT(non_zero_count == n_world && "Number of non-zero values in --n-layer-window must equal --world");
+
     gpt_init();
 
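
Note: the new check above requires the number of non-zero window entries to equal the device count. A small self-contained sketch of that rule, with hypothetical values:

#include <cassert>
#include <cstdint>

int main() {
    const uint32_t n_world = 2;
    uint32_t n_layer_window[32] = {16, 16};   // e.g. --world 2 -lw 16,16

    uint32_t non_zero_count = 0;
    for (uint32_t w : n_layer_window) {
        if (w != 0) { ++non_zero_count; }
    }
    assert(non_zero_count == n_world);        // -lw 16,16,8 with --world 2 would trip this
}
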
@@ -278,7 +278,7 @@ extern "C" {
     struct llama_model_params {
         uint32_t n_world; // number of nodes
         uint32_t rank; // my node rank
-        uint32_t n_layer_window; // number of layers to kept each time
+        uint32_t n_layer_window[32]; // number of layers to kept each time
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
@@ -317,7 +317,7 @@ extern "C" {
     struct llama_context_params {
         uint32_t n_world; // world size
         uint32_t rank; // my rank
-        uint32_t n_layer_window; // number of layers to process in each compute
+        uint32_t n_layer_window[32]; // number of layers to process in each compute
         bool unload; // whether to unload layer weights after use
         char * master_ip; // ip address of the master node
         char * next_node_ip; // ip address of the next node

src/llama.cpp (170 changes)

@@ -2565,7 +2565,7 @@ static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams m
 struct llama_cparams {
     uint32_t n_world;
     uint32_t rank;
-    uint32_t n_layer_window;
+    uint32_t n_layer_window[32];
     bool unload;
     uint32_t n_ctx; // context size used during inference
     uint32_t n_batch;
@@ -3619,25 +3619,52 @@ static bool this_layer_is_mine(
         uint32_t layer_id,
         uint32_t n_world,
         uint32_t my_rank,
-        uint32_t n_layer_window) {
-    return (layer_id / n_layer_window) % n_world == my_rank;
+        const uint32_t * n_layer_window) {
+    uint32_t cumulative_layers = 0;
+    uint32_t rank = 0;
+    while (true) {
+        cumulative_layers += n_layer_window[rank];
+        if (layer_id < cumulative_layers) {
+            return rank == my_rank;
+        }
+        rank = (rank + 1) % n_world;
+    }
 }
 
-static int64_t map_layer_to_local_id(
-        int64_t layer_id,
+static int32_t map_layer_to_local_id(
+        uint32_t layer_id,
         uint32_t n_world,
         uint32_t my_rank,
-        uint32_t n_layer_window) {
+        const uint32_t* n_layer_window)
+{
     if (!this_layer_is_mine(layer_id, n_world, my_rank, n_layer_window)) {
         return -1;
     }
-    // map layer_id to kvcache_id.
-    // example: For n_world=2 and n_layer_window=4, rank 0 handles layers 0-3, 8-11, 16-19, while rank 1 handles layers 4-7, 12-15, 20-23.
-    // on rank 0, layer_id should map to kvcache_id as follows: 0-3 -> 0-3, 8-11 -> 4-7, 16-19 -> 8-11.
-    // on rank 1, layer_id should map to kvcache_id as follows: 4-7 -> 0-3, 12-15 -> 4-7, 20-23 -> 8-11.
-    int64_t cycle_size = n_world * n_layer_window;
-    int64_t local_offset = (layer_id / cycle_size) * n_layer_window;
-    return (layer_id % cycle_size) % n_layer_window + local_offset;
+
+    uint32_t cycle_size = 0;
+    for (uint32_t i = 0; i < n_world; ++i) {
+        cycle_size += n_layer_window[i];
+    }
+
+    uint32_t cycle_offset = layer_id % cycle_size;
+    uint32_t cumulative_layers = 0;
+    uint32_t local_offset = (layer_id / cycle_size) * n_layer_window[my_rank];
+
+    for (uint32_t rank = 0; rank < n_world; ++rank) {
+        uint32_t window_size = n_layer_window[rank];
+
+        if (cycle_offset < cumulative_layers + window_size) {
+            if (rank == my_rank) {
+                return cycle_offset - cumulative_layers + local_offset;
+            } else {
+                return -1;
+            }
+        }
+
+        cumulative_layers += window_size;
+    }
+
+    return -1;
 }
 
 //
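
Note: to make the new per-rank mapping concrete, here is an illustrative, self-contained sketch (not part of the commit) that mirrors the two functions above and prints the owning rank and local KV-cache index of every layer, for a hypothetical two-node run with windows {4, 2} and 12 layers.

#include <cstdint>
#include <cstdio>

// Mirrors the updated this_layer_is_mine(): walk the per-rank windows cyclically
// until the cumulative layer count exceeds layer_id.
static bool layer_is_mine(uint32_t layer_id, uint32_t n_world, uint32_t my_rank,
                          const uint32_t * n_layer_window) {
    uint32_t cumulative_layers = 0;
    uint32_t rank = 0;
    while (true) {
        cumulative_layers += n_layer_window[rank];
        if (layer_id < cumulative_layers) {
            return rank == my_rank;
        }
        rank = (rank + 1) % n_world;
    }
}

// Mirrors the updated map_layer_to_local_id(): position inside the current cycle,
// offset by the layers this rank already owns from earlier full cycles.
static int32_t layer_to_local_id(uint32_t layer_id, uint32_t n_world, uint32_t my_rank,
                                 const uint32_t * n_layer_window) {
    if (!layer_is_mine(layer_id, n_world, my_rank, n_layer_window)) {
        return -1;
    }
    uint32_t cycle_size = 0;
    for (uint32_t i = 0; i < n_world; ++i) {
        cycle_size += n_layer_window[i];
    }
    uint32_t cycle_offset      = layer_id % cycle_size;
    uint32_t cumulative_layers = 0;
    uint32_t local_offset      = (layer_id / cycle_size) * n_layer_window[my_rank];
    for (uint32_t rank = 0; rank < n_world; ++rank) {
        uint32_t window_size = n_layer_window[rank];
        if (cycle_offset < cumulative_layers + window_size) {
            return rank == my_rank ? (int32_t)(cycle_offset - cumulative_layers + local_offset) : -1;
        }
        cumulative_layers += window_size;
    }
    return -1;
}

int main() {
    const uint32_t n_layer_window[32] = {4, 2};  // hypothetical: rank 0 keeps 4 layers per cycle, rank 1 keeps 2
    const uint32_t n_world = 2, n_layer = 12;
    for (uint32_t il = 0; il < n_layer; ++il) {
        uint32_t owner = layer_is_mine(il, n_world, 0, n_layer_window) ? 0u : 1u;
        printf("layer %2u -> rank %u, local id %d\n", il, owner,
               layer_to_local_id(il, n_world, owner, n_layer_window));
    }
    // layers 0-3 and 6-9 land on rank 0 (local ids 0-7); layers 4-5 and 10-11 land on rank 1 (local ids 0-3)
}
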
@@ -3657,7 +3684,7 @@ static bool llama_kv_cache_init(
     const int64_t n_layer = hparams.n_layer;
     const uint32_t n_world = cparams.n_world;
     const uint32_t my_rank = cparams.rank;
-    const uint32_t n_layer_window = cparams.n_layer_window;
+    const uint32_t * n_layer_window = cparams.n_layer_window;
 
     cache.has_shift = false;
     cache.recurrent = llama_model_is_recurrent(&model);
@@ -3672,20 +3699,24 @@ static bool llama_kv_cache_init(
 
     // count used buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
-    int64_t local_i;
+    int32_t local_i;
     uint32_t my_layers = 0;
 
     for (int64_t i = 0; i < n_layer; ++i) {
         if (!this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
             continue;
         }
-        my_layers++;
         local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
+        GGML_ASSERT(local_i != -1);
+
         if (offload) {
             buft_layer_count[model.buft_layer[local_i].buft]++;
         } else {
             buft_layer_count[llama_default_buffer_type_cpu(model, true)]++;
         }
+
+        my_layers++;
     }
 
     // create a context for each buffer type
@@ -3714,13 +3745,14 @@ static bool llama_kv_cache_init(
             continue;
         }
         int local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
+        GGML_ASSERT(local_i != -1);
 
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
         struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[local_i].buft) : cache.ctxs.front();
-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa * kv_size);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa * kv_size);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
@@ -7026,7 +7058,7 @@ static bool llm_load_tensors(
         llama_model & model,
         uint32_t n_world,
         uint32_t my_rank,
-        uint32_t n_layer_window,
+        const uint32_t * n_layer_window,
         int n_gpu_layers,
         enum llama_split_mode split_mode,
         int main_gpu,
@@ -7045,13 +7077,9 @@ static bool llm_load_tensors(
     model.split_mode = split_mode;
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
-    const int n_layer = hparams.n_layer;
+    int n_layer = hparams.n_layer;
     bool use_mmap_buffer = true;
 
-    // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    model.buft_input = llama_default_buffer_type_cpu(model, true);
-
     int my_layers = 0;
     for (int i = 0; i < n_layer; ++i) {
         if (this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
@@ -7062,8 +7090,11 @@ static bool llm_load_tensors(
 
     for (int i = 0; i < n_layer; ++i) {
         if (this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
-            int local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
-            if (i % (int)n_layer_window >= (int)n_layer_window - n_gpu_layers) {
+            int32_t local_i = map_layer_to_local_id(i, n_world, my_rank, n_layer_window);
+            int32_t window_size = static_cast<int32_t>(n_layer_window[my_rank]);
+            GGML_ASSERT(local_i != -1);
+
+            if (local_i % window_size >= window_size - n_gpu_layers) {
                 LLAMA_LOG_INFO("Layer %i assigned to gpu (cache index %i)\n", i, local_i);
                 model.buft_layer[local_i] = llama_default_buffer_type_offload(model, main_gpu);
             } else {
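
Note: the rewritten condition above puts the last n_gpu_layers layers of each local window on the GPU of the current rank. A small sketch of which local indices that selects, using hypothetical numbers:

#include <cstdint>
#include <cstdio>

int main() {
    const int32_t window_size  = 8;   // hypothetical n_layer_window[my_rank]
    const int32_t n_gpu_layers = 3;   // hypothetical
    for (int32_t local_i = 0; local_i < 2 * window_size; ++local_i) {
        bool on_gpu = (local_i % window_size) >= (window_size - n_gpu_layers);
        printf("local layer %2d -> %s\n", local_i, on_gpu ? "gpu" : "cpu");
    }
    // within every window of 8, local positions 5, 6 and 7 go to the GPU; 0-4 stay on the CPU
}
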
@@ -7073,14 +7104,19 @@ static bool llm_load_tensors(
         }
     }
 
-    // assign the output layer
-    if (my_rank == 0 && n_gpu_layers > (int)n_layer_window) {
+    // assign the output layer (locate on node 0 only)
+    if (my_rank == 0) {
+        // there is very little benefit to offloading the input layer, so always keep it on the CPU
+        model.buft_input = llama_default_buffer_type_cpu(model, true);
+
+        if (n_gpu_layers > (int)n_layer_window[0]) {
             LLAMA_LOG_INFO("Layer output assigned to gpu\n");
             model.buft_output = llama_default_buffer_type_offload(model, main_gpu);
         } else {
             LLAMA_LOG_INFO("Layer output assigned to cpu\n");
             model.buft_output = llama_default_buffer_type_cpu(model, true);
         }
+    }
 
     // count used buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
@@ -7099,7 +7135,7 @@ static bool llm_load_tensors(
     size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
 
     // for moe merged tensors
-    ctx_size += ggml_tensor_overhead()*n_layer*3;
+    ctx_size += ggml_tensor_overhead() * my_layers * 3;
 
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
@@ -7149,8 +7185,8 @@ static bool llm_load_tensors(
         ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
     }
 
-    auto ctx_for_layer = [&](int local_i) { return ctx_map.at(model.buft_layer[local_i].buft); };
-    auto ctx_for_layer_split = [&](int local_i) { return ctx_map.at(model.buft_layer[local_i].buft_matrix); };
+    auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
+    auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
 
     model.layers.resize(my_layers);
 
@@ -7202,12 +7238,11 @@ static bool llm_load_tensors(
             layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
             layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
             layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
 
             if (n_expert == 0) {
                 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-                layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
 
                 // optional MLP bias
@@ -7216,10 +7251,9 @@ static bool llm_load_tensors(
                 layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
             } else {
                 layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
                 layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 if (layer.ffn_gate_exps) {
-                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
                     layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
                 } else {
                     // merge split expert into a single tensor for compatibility with older models
@@ -7240,9 +7274,9 @@ static bool llm_load_tensors(
 
                     for (uint32_t x = 0; x < n_expert; ++x) {
                         // the individual experts are loaded into a view of the merged tensor
-                        ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
-                        ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
-                        ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
+                        ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_gate_exps->nb[2] * x);
+                        ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {n_ff, n_embd}, layer.ffn_down_exps->nb[2] * x);
+                        ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, layer.ffn_up_exps->nb[2] * x);
                     }
                 }
             }
@@ -9191,7 +9225,7 @@ static void llm_build_kv_store(
     const int64_t n_ctx = cparams.n_ctx;
     const uint32_t n_world = cparams.n_world;
     const uint32_t my_rank = cparams.rank;
-    const uint32_t n_layer_window = cparams.n_layer_window;
+    const uint32_t * n_layer_window = cparams.n_layer_window;
     const int local_il = map_layer_to_local_id(il, n_world, my_rank, n_layer_window);
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
@@ -9554,7 +9588,7 @@ static struct ggml_tensor * llm_build_kqv(
     const llama_cparams & cparams = lctx.cparams;
     const uint32_t n_world = cparams.n_world;
     const uint32_t my_rank = cparams.rank;
-    const uint32_t n_layer_window = cparams.n_layer_window;
+    const uint32_t * n_layer_window = cparams.n_layer_window;
 
     const int local_il = map_layer_to_local_id(il, n_world, my_rank, n_layer_window);
     const int64_t n_ctx = cparams.n_ctx;
@@ -10515,7 +10549,7 @@ struct llm_build_context {
 
         const uint32_t n_world = this->cparams.n_world;
         const uint32_t my_rank = this->cparams.rank;
-        const uint32_t n_layer_window = this->cparams.n_layer_window;
+        const uint32_t * n_layer_window = this->cparams.n_layer_window;
 
         if (my_rank == 0) {
             // inp_embd - contains the input embedding
@@ -16365,16 +16399,16 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
     return result;
 }
 
-static uint32_t map_layer_to_subgf_id(uint32_t i, uint32_t my_rank, uint32_t n_world, uint32_t n_layer, uint32_t n_layer_window) {
-    uint32_t global_layer_offset = my_rank * n_layer_window;
-    uint32_t step = n_world * n_layer_window;
-    if (i < n_layer) {
-        uint32_t relative_layer = i % step;
-        if (relative_layer >= global_layer_offset && relative_layer < global_layer_offset + n_layer_window) {
-            return i / step;
-        }
-    }
+static int32_t map_layer_to_subgf_id(uint32_t i, uint32_t my_rank, uint32_t n_world, const uint32_t * n_layer_window) {
+    if (!this_layer_is_mine(i, n_world, my_rank, n_layer_window)) {
     return -1;
+    }
+
+    uint32_t total_window_size = 0;
+    for (uint32_t rank = 0; rank < n_world; ++rank) {
+        total_window_size += n_layer_window[rank];
+    }
+    return i / total_window_size;
 }
 
 static std::vector<struct ggml_cgraph *> llama_build_graph(
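
Note: with per-rank windows, a layer's sub-graph index is simply the number of completed full cycles (one cycle being the sum of all window sizes) before that layer. A quick illustration under the same hypothetical {4, 2} split:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_layer_window[32] = {4, 2};  // hypothetical per-rank windows
    const uint32_t n_world = 2;

    uint32_t total_window_size = 0;
    for (uint32_t rank = 0; rank < n_world; ++rank) {
        total_window_size += n_layer_window[rank];
    }
    for (uint32_t i = 0; i < 12; ++i) {
        // sub-graph id on the layer's owning rank (the ownership check is omitted here)
        printf("layer %2u -> sub-graph %u\n", i, i / total_window_size);
    }
    // layers 0-5 fall into sub-graph 0 and layers 6-11 into sub-graph 1 on their respective ranks
}
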
@@ -16384,7 +16418,7 @@ static std::vector<struct ggml_cgraph *> llama_build_graph(
     const auto & model = lctx.model;
     const uint32_t n_world = lctx.cparams.n_world;
     const uint32_t my_rank = lctx.cparams.rank;
-    const uint32_t n_layer_window = lctx.cparams.n_layer_window;
+    const uint32_t * n_layer_window = lctx.cparams.n_layer_window;
     const uint32_t n_layer = lctx.model.hparams.n_layer;
 
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
@@ -16392,7 +16426,8 @@ static std::vector<struct ggml_cgraph *> llama_build_graph(
         int sub_gf_id = 0;
         if (il >= 0) {
             ggml_format_name(cur, "%s-%d", name, il);
-            sub_gf_id = map_layer_to_subgf_id(il, my_rank, n_world, n_layer, n_layer_window);
+            sub_gf_id = map_layer_to_subgf_id(il, my_rank, n_world, n_layer_window);
+            GGML_ASSERT(sub_gf_id != -1);
         } else {
             ggml_set_name(cur, name);
         }
|
@ -16406,7 +16441,7 @@ static std::vector<struct ggml_cgraph *> llama_build_graph(
|
||||||
|
|
||||||
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
|
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
|
||||||
// FIXME: fix in ggml_backend_sched
|
// FIXME: fix in ggml_backend_sched
|
||||||
const bool full_offload = lctx.model.n_gpu_layers > (int)n_layer_window;
|
const bool full_offload = lctx.model.n_gpu_layers > (int)n_layer_window[0];
|
||||||
if (batch.n_tokens < 32 || full_offload) {
|
if (batch.n_tokens < 32 || full_offload) {
|
||||||
if (il != -1 && strcmp(name, "norm") == 0) {
|
if (il != -1 && strcmp(name, "norm") == 0) {
|
||||||
for (auto * backend : lctx.backends) {
|
for (auto * backend : lctx.backends) {
|
||||||
|
@ -18250,6 +18285,8 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
||||||
|
|
||||||
// apply K-shift if needed
|
// apply K-shift if needed
|
||||||
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
|
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
|
||||||
|
throw std::runtime_error("shift not supported\n");
|
||||||
|
|
||||||
if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
|
if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
|
||||||
GGML_ABORT("Deepseek2 does not support K-shift");
|
GGML_ABORT("Deepseek2 does not support K-shift");
|
||||||
}
|
}
|
||||||
|
@ -18290,6 +18327,8 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
||||||
|
|
||||||
// reserve a worst case graph again
|
// reserve a worst case graph again
|
||||||
if (need_reserve) {
|
if (need_reserve) {
|
||||||
|
throw std::runtime_error("reserve not supported\n");
|
||||||
|
|
||||||
// TODO: extract to a function
|
// TODO: extract to a function
|
||||||
// build worst-case graph
|
// build worst-case graph
|
||||||
uint32_t n_seqs = 1; // TODO: worst-case number of sequences
|
uint32_t n_seqs = 1; // TODO: worst-case number of sequences
|
||||||
|
@ -18299,7 +18338,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
||||||
std::vector<ggml_cgraph *> gf = llama_build_graph(lctx, ubatch, true);
|
std::vector<ggml_cgraph *> gf = llama_build_graph(lctx, ubatch, true);
|
||||||
|
|
||||||
// initialize scheduler with the worst-case graph
|
// initialize scheduler with the worst-case graph
|
||||||
ggml_backend_sched_reset(lctx.sched.at(0)); // todo.
|
ggml_backend_sched_reset(lctx.sched[0]); // todo.
|
||||||
|
|
||||||
bool ok = true;
|
bool ok = true;
|
||||||
GGML_ASSERT(lctx.sched.size() == gf.size());
|
GGML_ASSERT(lctx.sched.size() == gf.size());
|
||||||
|
@ -19405,7 +19444,7 @@ struct llama_model_params llama_model_default_params() {
|
||||||
struct llama_model_params result = {
|
struct llama_model_params result = {
|
||||||
/*.n_world =*/ 1,
|
/*.n_world =*/ 1,
|
||||||
/*.rank =*/ 0,
|
/*.rank =*/ 0,
|
||||||
/*.n_layer_window =*/ 32,
|
/*.n_layer_window =*/ {32},
|
||||||
/*.n_gpu_layers =*/ 0,
|
/*.n_gpu_layers =*/ 0,
|
||||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||||||
/*.main_gpu =*/ 0,
|
/*.main_gpu =*/ 0,
|
||||||
|
@ -19432,7 +19471,7 @@ struct llama_context_params llama_context_default_params() {
|
||||||
struct llama_context_params result = {
|
struct llama_context_params result = {
|
||||||
/*.n_world =*/ 1,
|
/*.n_world =*/ 1,
|
||||||
/*.rank =*/ 0,
|
/*.rank =*/ 0,
|
||||||
/*.n_layer_window =*/ 32,
|
/*.n_layer_window =*/ {32},
|
||||||
/*.unload =*/ false,
|
/*.unload =*/ false,
|
||||||
/*.master_ip =*/ nullptr,
|
/*.master_ip =*/ nullptr,
|
||||||
/*.next_node_ip =*/ nullptr,
|
/*.next_node_ip =*/ nullptr,
|
||||||
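
Note: with the array field, the brace-initialized default {32} sets only the first element to 32 and zero-fills the remaining 31 entries, which is what the non-zero-count check shown earlier relies on. A one-line illustration:

#include <cstdint>
#include <cstdio>

int main() {
    uint32_t n_layer_window[32] = {32};  // same form as the defaults above
    printf("%u %u %u\n", n_layer_window[0], n_layer_window[1], n_layer_window[31]);  // prints: 32 0 0
}
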
|
@ -19736,7 +19775,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
|
|
||||||
cparams.n_world = params.n_world;
|
cparams.n_world = params.n_world;
|
||||||
cparams.rank = params.rank;
|
cparams.rank = params.rank;
|
||||||
cparams.n_layer_window = params.n_layer_window;
|
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
|
||||||
cparams.unload = params.unload;
|
cparams.unload = params.unload;
|
||||||
cparams.n_seq_max = std::max(1u, params.n_seq_max);
|
cparams.n_seq_max = std::max(1u, params.n_seq_max);
|
||||||
cparams.n_threads = params.n_threads;
|
cparams.n_threads = params.n_threads;
|
||||||
|
@ -19808,21 +19847,19 @@ struct llama_context * llama_new_context_with_model(
|
||||||
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
|
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ctx->master_ip = params.master_ip;
|
||||||
|
ctx->next_node_ip = params.next_node_ip;
|
||||||
|
|
||||||
LLAMA_LOG_INFO("\n");
|
LLAMA_LOG_INFO("\n");
|
||||||
LLAMA_LOG_INFO("%s: n_world = %u\n", __func__, cparams.n_world);
|
LLAMA_LOG_INFO("%s: n_world = %u\n", __func__, cparams.n_world);
|
||||||
LLAMA_LOG_INFO("%s: rank = %u\n", __func__, cparams.rank);
|
LLAMA_LOG_INFO("%s: rank = %u\n", __func__, cparams.rank);
|
||||||
LLAMA_LOG_INFO("%s: n_layer_win= %u\n", __func__, cparams.n_layer_window);
|
LLAMA_LOG_INFO("%s: win_size = %u\n", __func__, cparams.n_layer_window[cparams.rank]);
|
||||||
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
||||||
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
||||||
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
||||||
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
||||||
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
||||||
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
||||||
|
|
||||||
ctx->master_ip = params.master_ip;
|
|
||||||
ctx->next_node_ip = params.next_node_ip;
|
|
||||||
|
|
||||||
LLAMA_LOG_INFO("\n");
|
|
||||||
LLAMA_LOG_INFO("%s: master_ip = %s\n", __func__, ctx->master_ip.c_str());
|
LLAMA_LOG_INFO("%s: master_ip = %s\n", __func__, ctx->master_ip.c_str());
|
||||||
LLAMA_LOG_INFO("%s: next_node_ip = %s\n", __func__, ctx->next_node_ip.c_str());
|
LLAMA_LOG_INFO("%s: next_node_ip = %s\n", __func__, ctx->next_node_ip.c_str());
|
||||||
|
|
||||||
|
@@ -20041,8 +20078,7 @@ struct llama_context * llama_new_context_with_model(
         }
 
         // graph outputs buffer, reserve for rank 0 only
-        const uint32_t my_rank = params.rank;
-        if (my_rank == 0) {
+        if (params.rank == 0) {
             // resized during inference when a batch uses more outputs
             if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
                 LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
@@ -20118,7 +20154,7 @@ struct llama_context * llama_new_context_with_model(
         ctx->sched.resize(gf.size());
 
         // prefetch the first subgraph weights
-        manage_graph_tensors(gf.front(), POSIX_MADV_WILLNEED, true);
+        manage_graph_tensors(gf.front(), POSIX_MADV_WILLNEED, false);
 
         // initialize scheduler with the worst-case graph
         bool ok = true;
@@ -20138,7 +20174,7 @@ struct llama_context * llama_new_context_with_model(
 
             size_t total_size = 0;
             for (size_t j = 0; j < ctx->sched.size(); j++) {
-                total_size += ggml_backend_sched_get_buffer_size(ctx->sched.at(j), backend);
+                total_size += ggml_backend_sched_get_buffer_size(ctx->sched[j], backend);
             }
             if (total_size > 1) {
                 LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB (in total)\n", __func__,