Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-06 08:59:09 +00:00)
fix kappa and memory bounds, account for look-up table and input/output layer delay

commit f3dd5776eb, parent 9e4ba4f06a
2 changed files with 34 additions and 34 deletions
@@ -832,7 +832,7 @@ std::string fs_get_cache_file(const std::string & filename) {
     return cache_directory + filename;
 }
 
-static bool assign_device(
+static bool assign_layers_to_device(
     uint32_t n_world,
     uint32_t my_rank,
     const device_info * dev_info_set,
@@ -857,6 +857,7 @@ static bool assign_device(
     // model-specific constants
     const int n_embd_k_gqa = llama_model_n_embd_k_gqa(model);
     const int n_embd_v_gqa = llama_model_n_embd_v_gqa(model);
+    const int n_vocab      = llama_n_vocab(model);
     const int n_kv         = cparams.n_ctx;
 
     const int64_t b = dev_info_set[0].model_bytes.nb_layer;
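Note: n_vocab enters the accounting because the input embedding matrix (bi bytes) is
accessed as a look-up table, so decoding touches only one of its n_vocab rows per token.
The memory and delay terms below therefore charge bi / n_vocab + bo bytes for the
input/output layers on the head device instead of the full bi + bo. A minimal sketch of
that accounting (hypothetical helper, not part of the commit; names mirror the diff):

    // bi: bytes of the full input embedding matrix; bo: bytes of the output layer.
    // With a look-up table, only one embedding row (bi / n_vocab bytes) is read
    // per token, so only that row plus the output layer is charged to the head.
    static int64_t io_bytes_charged(int64_t bi, int64_t bo, int n_vocab) {
        return bi / n_vocab + bo;
    }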
@@ -876,7 +877,7 @@ static bool assign_device(
     // -------- Compute alpha[m], beta[m], xi[m] --------
     for (uint32_t m = 0; m < n_world; ++m) {
         // alpha[m]
-        const device_info &dev = dev_info_set[m];
+        const device_info & dev = dev_info_set[m];
         float t_calc_cpu = (
             master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
             master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
@@ -931,7 +932,7 @@ static bool assign_device(
     // - $d_m^{\text{avail}}+d_m^{\text{swapout}}$ for Android
     // and $n_m$ is initialized to 0.
     for (uint32_t m = 0; m < n_world; ++m) {
-        const device_info &dev = dev_info_set[m];
+        const device_info & dev = dev_info_set[m];
         GGML_ASSERT(dev.device_os != nullptr);
 
         bool is_macos = strcmp(dev.device_os, "macOS") == 0;
@@ -968,7 +969,7 @@ static bool assign_device(
     // stores the actual read bandwidth (GB/s) for each device
     std::vector<float> disk_speed(n_world, 0.0f);
     for (uint32_t m = 0; m < n_world; ++m) {
-        const device_info &dev = dev_info_set[m];
+        const device_info & dev = dev_info_set[m];
         GGML_ASSERT(dev.device_os != nullptr);
         bool is_linux = strcmp(dev.device_os, "Linux") == 0;
 
@@ -1010,7 +1011,7 @@ static bool assign_device(
     M1.clear(), M2.clear(), M3.clear(), M4.clear();
 
     for (uint32_t m = 0; m < n_world; ++m) {
-        const device_info &dev = dev_info_set[m];
+        const device_info & dev = dev_info_set[m];
 
         GGML_ASSERT(dev.device_os != nullptr);
         bool is_macos = strcmp(dev.device_os, "macOS") == 0;
@@ -1023,9 +1024,9 @@ static bool assign_device(
 
         int l_m     = w[m] * k; // total number of layers assigned to device m
         int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
-        bool condition1 = l_m * b + (bi + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] > mem_budget[m] * GIGABYTE;
-        bool condition2 = l_m * b + (bi + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] + c_gpu[m] > mem_budget[m] * GIGABYTE;
-        bool condition3 = (l_m - l_m_gpu) * b_prime + (bi + bo) * int(m == 0) + c_cpu[m] > mem_budget[m] * GIGABYTE;
+        bool condition1 = l_m * b + (bi / n_vocab + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] > mem_budget[m] * GIGABYTE;
+        bool condition2 = l_m * b + (bi / n_vocab + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] + c_gpu[m] > mem_budget[m] * GIGABYTE;
+        bool condition3 = (l_m - l_m_gpu) * b_prime + (bi / n_vocab + bo) * int(m == 0) + c_cpu[m] > mem_budget[m] * GIGABYTE;
         bool is_slow_disk = disk_speed[m] < min_disk_read_speed;
 
         if (is_macos && !dev.gpu_support.metal && condition1 && !is_slow_disk) {
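In math form (a sketch, reusing the $...$ notation of the comments elsewhere in this
function; $d_m$ is the memory budget of device $m$ in bytes and $h_k$, $h_v$ stand for
n_embd_k_gqa and n_embd_v_gqa), condition1 is

    $l_m b + (b_i / n_{vocab} + b_o) \cdot [m = 0] + 2 (h_k + h_v) n_{kv} l_m + c_m^{cpu} > d_m$

condition2 adds the GPU compute buffer $c_m^{gpu}$ on the left-hand side, and condition3
keeps only the CPU-resident layers $(l_m - l_m^{gpu}) b'$ plus the same input/output term.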
@@ -1083,13 +1084,26 @@ static bool assign_device(
 
     // update kappa
     for (uint32_t m = 0; m < n_world; ++m) {
-        const device_info &dev = dev_info_set[m];
+        const device_info & dev = dev_info_set[m];
         GGML_ASSERT(dev.device_os != nullptr);
         bool is_android = strcmp(dev.device_os, "Android") == 0;
 
-        if (m == 0 && !in_set(m, M4)) {
-            kappa = (bi + bo) / (disk_speed[m] * 1e9) * 1000; // in ms
+        if (m == 0) {
+            kappa = (
+                dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
+                dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
+                dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
+                dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
+                dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
+                dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+
+            kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
+
+            if (!in_set(m, M4)) {
+                kappa += (bi / n_vocab + bo) / (disk_speed[m] * 1e9) * 1000; // in ms
+            }
         }
 
         if (in_set(m, M3)) {
             kappa += (c_cpu[m] - dev.memory.available_physical * GIGABYTE - dev.memory.used_can_swap * GIGABYTE * int(is_android)) / (disk_speed[m] * 1e9) * 1000; // in ms
         }
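The reworked kappa models the head device's constant per-token delay in three parts:
compute time of the input/output layers across each weight type, RAM read time for the
bytes those layers touch, and, unless device 0 is in M4 (weights fully resident), a
disk-load term for the same bytes. As a sketch (assumed notation: $F_t$ = input/output
layer FLOPs of type $t$, $C_t$ = measured CPU throughput for $t$, $s^{ram}$ and
$s^{disk}$ = RAM and disk read bandwidth of the head device):

    $\kappa = \sum_t F_t / (C_t + \epsilon) + (b_i / n_{vocab} + b_o) / s^{ram} + [0 \notin M_4] \cdot (b_i / n_{vocab} + b_o) / s^{disk}$

with $t$ ranging over {f32xf32, f16xf32, q4k, q5k, q6k, q80} and the result converted to
ms in the code; the M3 swap-out term below is unchanged.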
@@ -1128,23 +1142,11 @@ static bool assign_device(
     // -------------------------------------------------------------
     // Construct vectors vz, vz_gpu
     // -------------------------------------------------------------
-    // z and z_gpu are used to express memory constraints:
-    // for z:
-    //   - M1: (d_m^{avail} - b_cio) / (L*b')
-    //   - M2: (d_m^{total} - b_cio - c_gpu) / (L*b')
-    //   - M3: (d_m^{avail}+d_m^{swapout} - b_cio) / (L*b')
-    //   - M4: - (d_m^{avail} - b_cio) / (L*b') on macOS without Metal,
-    //     or - (d_m^{total} - b_cio - c_gpu) / (L*b') on macOS with Metal,
-    //     or - (d_m^{avail}+d_m^{swapout} - b_cio) / (L*b') on Linux or Android
-    //
-    // for z_gpu:
-    //   - M1: (d_{m,cuda}^{avail} - c_gpu) / (L*b'),
-    //     d_{m,cuda}^{avail} is non-zero only if the device supports CUDA
     std::vector<float> vec_z(n_world, 0.0f), vec_z_gpu(n_world, 0.0f);
     std::vector<int>   dev_gpu(n_world, 0);
 
     for (uint32_t m = 0; m < n_world; ++m) {
-        const device_info &dev = dev_info_set[m];
+        const device_info & dev = dev_info_set[m];
 
         GGML_ASSERT(dev.device_os != nullptr);
         bool is_macos = strcmp(dev.device_os, "macOS") == 0;
@@ -1152,7 +1154,7 @@ static bool assign_device(
         bool is_windows = strcmp(dev.device_os, "Windows") == 0;
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
 
-        int64_t b_cio = (bi + bo) * int(m == 0) + c_cpu[m];
+        int64_t b_cio = (bi / n_vocab + bo) * int(m == 0) + c_cpu[m];
 
         if (in_set(m, M1)) {
             vec_z[m] = (double)(dev.memory.available_physical * GIGABYTE - b_cio) / (double)(n_layer * b_prime);
@@ -1174,7 +1176,7 @@ static bool assign_device(
             float reserved_mem = 0.1f; // reserved shared memory to avoid potential OOM, set to 100 MiB by default
             vec_z_gpu[m] = (double)((dev.gpu_props.memory_free - reserved_mem) * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
             if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
-                vec_z_gpu[m] -= (double)(bi + bo) / (double)(n_layer * b_prime);
+                vec_z_gpu[m] -= (double)bo / (double)(n_layer * b_prime);
             }
             dev_gpu[m] = 1;
         } else {
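Since the token embedding now lives in ordinary RAM as a look-up table (see the loader
change at the end of this diff), only the output weights bo still have to be pinned in
Metal shared memory when keep_out_in_metal is set, so the reserved headroom shrinks from
bi + bo to bo layer-equivalents.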
@@ -1269,7 +1271,7 @@ static bool assign_device(
 
     // constraint coefficients 3: RAM constraint for each device
     for (uint32_t m = 0; m < n_world; ++m) {
-        const device_info &dev = dev_info_set[m];
+        const device_info & dev = dev_info_set[m];
         GGML_ASSERT(dev.device_os != nullptr);
         bool is_macos = strcmp(dev.device_os, "macOS") == 0;
         int cons_row = constraint_idx + m;
@@ -1373,10 +1375,11 @@ static bool assign_device(
         GGML_ASSERT(final_solution[m] == w[m] && final_solution[m + n_world] == n[m]);
         LOG_INF("\n%s:\n", device_name);
         LOG_INF("  - Device Index   : %d\n", m);
+        LOG_INF("  - Assignment Set : %s\n", in_set(m, M1) ? "M1" : in_set(m, M2) ? "M2" : in_set(m, M3) ? "M3" : "M4");
         LOG_INF("  - N Layer Window : %d\n", w[m]);
         LOG_INF("  - N GPU Layers   : %d\n", n[m]);
     }
-    LOG_INF("\nTotal Latency: %.3f ms\n", final_objective);
+    LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
     LOG_INF("------------------------------------------");
 
 #else
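For illustration, the report printed by this block would now read roughly as follows
(device name and values hypothetical):

    my-macbook.local:
      - Device Index   : 0
      - Assignment Set : M1
      - N Layer Window : 16
      - N GPU Layers   : 16

    Estimated Latency: 123.456 ms
    ------------------------------------------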
@@ -1478,7 +1481,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};
     if (my_rank == 0) {
         // automatically determine n_layer_window and n_gpu_layers
-        if (!assign_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
+        if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
            LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
            llama_free(lctx);
            llama_free_model(model);
@@ -5255,14 +5255,11 @@ struct llama_model_loader {
        const auto & mapping = mappings.at(idx);
        *addr = mapping->addr;
 
-        auto merge_tensor_range = [&](ggml_context * context, bool keep_only_inp_out) {
+        auto merge_tensor_range = [&](ggml_context * context, bool keep_output) {
            for (ggml_tensor * tensor = ggml_get_first_tensor(context); tensor; tensor = ggml_get_next_tensor(context, tensor)) {
                try {
                    const char * tname = ggml_get_name(tensor);
-                    if (keep_only_inp_out && !(
-                        // strcmp(tname, "token_embd.weight") == 0 ||  // lookup table is used so we do not need to keep it in metal memory
-                        strcmp(tname, "output_norm.weight") == 0 ||
-                        strcmp(tname, "output.weight") == 0)) {
+                    if (keep_output && !(strcmp(tname, "output_norm.weight") == 0 || strcmp(tname, "output.weight") == 0)) {
                        continue;
                    }
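This is the loader half of the look-up-table fix: token_embd.weight is no longer
force-kept in Metal memory, and the keep_only_inp_out flag becomes keep_output,
retaining just the two output tensors. A standalone sketch of the resulting keep
predicate (hypothetical refactoring, not part of the commit):

    #include <cstring>

    // Decide whether a tensor stays mapped. When keep_output is set, only the
    // output-side tensors survive; the token embedding is dropped because it
    // is served from the RAM look-up table instead of Metal memory.
    static bool keep_tensor(const char * tname, bool keep_output) {
        if (!keep_output) {
            return true; // keep everything
        }
        return strcmp(tname, "output_norm.weight") == 0 ||
               strcmp(tname, "output.weight") == 0;
    }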