mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-06 16:09:05 +00:00
add bounds n[m]<=0 for devices without GPUs
This commit is contained in:
parent
ac5d63b09e
commit
1e2b934d69
1 changed file with 65 additions and 32 deletions
|
@ -832,6 +832,20 @@ std::string fs_get_cache_file(const std::string & filename) {
|
|||
return cache_directory + filename;
|
||||
}
|
||||
|
||||
// Render a vector's elements as a bracketed, comma-separated string,
// e.g. {1, 2, 3} -> "[1, 2, 3]" and {} -> "[]". Used for logging.
template <typename T>
static std::string vec_to_str(const std::vector<T> & vec) {
    std::ostringstream out;
    out << "[";
    const char * sep = "";           // empty before the first element, ", " afterwards
    for (const auto & item : vec) {
        out << sep << item;
        sep = ", ";
    }
    out << "]";
    return out.str();
}
|
||||
|
||||
static bool assign_layers_to_device(
|
||||
uint32_t n_world,
|
||||
uint32_t my_rank,
|
||||
|
@ -840,7 +854,7 @@ static bool assign_layers_to_device(
|
|||
uint32_t * n_gpu_layers,
|
||||
struct llama_model * model,
|
||||
const struct llama_context_params cparams,
|
||||
float min_disk_read_speed = 0.5f) { // minimum disk I/O speed: 500 MB/s
|
||||
float min_disk_read_speed = 0.1f) { // minimum disk I/O speed: 100 MB/s
|
||||
GGML_ASSERT(dev_info_set != nullptr);
|
||||
GGML_ASSERT(n_layer_window != nullptr);
|
||||
GGML_ASSERT(my_rank == 0);
|
||||
|
@ -1082,6 +1096,9 @@ static bool assign_layers_to_device(
|
|||
|
||||
if (!assign_sets(cur_k)) break;
|
||||
|
||||
LOG_INF("Set assignment: M1: %s, M2: %s, M3: %s, M4: %s\n",
|
||||
vec_to_str(M1).c_str(), vec_to_str(M2).c_str(), vec_to_str(M3).c_str(), vec_to_str(M4).c_str());
|
||||
|
||||
// update kappa
|
||||
for (uint32_t m = 0; m < n_world; ++m) {
|
||||
const device_info & dev = dev_info_set[m];
|
||||
|
@ -1109,6 +1126,14 @@ static bool assign_layers_to_device(
|
|||
}
|
||||
}
|
||||
|
||||
std::vector<int> dev_gpu(n_world, 0);
|
||||
for (uint32_t m = 0; m < n_world; ++m) {
|
||||
const device_info & dev = dev_info_set[m];
|
||||
if (dev.gpu_support.cuda || dev.gpu_support.metal) {
|
||||
dev_gpu[m] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------
|
||||
// Construct vectors va, vb, vc
|
||||
// -------------------------------------------------------------
|
||||
|
@ -1118,7 +1143,7 @@ static bool assign_layers_to_device(
|
|||
// - M3: a[m] = alpha[m] + b' / s_m^{disk}, b[m] = beta[m] - b'/ s_m^{disk}, c[m] = xi[m]
|
||||
// - M4: a[m] = alpha[m], b[m] = beta[m], c[m] = xi[m]
|
||||
std::vector<float> vec_a(n_world, 0.0f), vec_b(n_world, 0.0f), vec_c(n_world, 0.0f);
|
||||
|
||||
|
||||
for (uint32_t m = 0; m < n_world; ++m) {
|
||||
if (in_set(m, M1)) {
|
||||
vec_a[m] = alpha[m] + b / (disk_speed[m] * 1e9) * 1000; // in ms
|
||||
|
@ -1130,11 +1155,15 @@ static bool assign_layers_to_device(
|
|||
vec_c[m] = xi[m];
|
||||
} else if (in_set(m, M3)) {
|
||||
vec_a[m] = alpha[m] + b_prime / (disk_speed[m] * 1e9) * 1000; // in ms
|
||||
vec_b[m] = beta[m] - b_prime / (disk_speed[m] * 1e9) * 1000; // in ms
|
||||
if (dev_gpu[m]) {
|
||||
vec_b[m] = beta[m] - b_prime / (disk_speed[m] * 1e9) * 1000; // in ms
|
||||
}
|
||||
vec_c[m] = xi[m];
|
||||
} else {
|
||||
vec_a[m] = alpha[m];
|
||||
vec_b[m] = beta[m];
|
||||
if (dev_gpu[m]) {
|
||||
vec_b[m] = beta[m];
|
||||
}
|
||||
vec_c[m] = xi[m];
|
||||
}
|
||||
}
|
||||
|
@ -1143,7 +1172,6 @@ static bool assign_layers_to_device(
|
|||
// Construct vectors vz, vz_gpu
|
||||
// -------------------------------------------------------------
|
||||
std::vector<float> vec_z(n_world, 0.0f), vec_z_gpu(n_world, 0.0f);
|
||||
std::vector<int> dev_gpu(n_world, 0);
|
||||
|
||||
for (uint32_t m = 0; m < n_world; ++m) {
|
||||
const device_info & dev = dev_info_set[m];
|
||||
|
@ -1168,25 +1196,19 @@ static bool assign_layers_to_device(
|
|||
} else if (is_macos && dev.gpu_support.metal) {
|
||||
vec_z[m] = - (double)(dev.gpu_props.memory_free * GIGABYTE - b_cio - c_gpu[m]) / (double)(n_layer * b_prime);
|
||||
} else {
|
||||
vec_z[m] = - (double)(dev.memory.available_physical * GIGABYTE + dev.memory.used_can_swap * GIGABYTE * int(is_android) - b_cio) / (double)(n_layer * b_prime);
|
||||
vec_z[m] = - (double)((dev.memory.available_physical + dev.memory.used_can_swap * int(is_android)) * GIGABYTE - b_cio) / (double)(n_layer * b_prime);
|
||||
}
|
||||
}
|
||||
|
||||
if (dev.gpu_support.cuda || dev.gpu_support.metal) {
|
||||
if (dev_gpu[m]) {
|
||||
float reserved_mem = 0.1f; // reserved shared memory to avoid potential OOM, set to 100 MiB by default
|
||||
vec_z_gpu[m] = (double)((dev.gpu_props.memory_free - reserved_mem) * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
|
||||
if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
|
||||
vec_z_gpu[m] -= (double)bo / (double)(n_layer * b_prime);
|
||||
}
|
||||
dev_gpu[m] = 1;
|
||||
} else {
|
||||
vec_z_gpu[m] = -(double)c_gpu[m] / (double)(n_layer * b_prime);
|
||||
}
|
||||
}
|
||||
|
||||
// count the number of cuda devices
|
||||
int num_dev_gpu = std::accumulate(dev_gpu.begin(), dev_gpu.end(), 0);
|
||||
|
||||
// -------------------------------------------------------------
|
||||
// Build and solve the optimization model
|
||||
// -------------------------------------------------------------
|
||||
|
@ -1203,7 +1225,7 @@ static bool assign_layers_to_device(
|
|||
|
||||
// define the number of decision variables and constraints
|
||||
model.lp_.num_col_ = n_world * 2; // number of decision variables
|
||||
model.lp_.num_row_ = 1 + 2 * n_world + num_dev_gpu; // number of constraints
|
||||
model.lp_.num_row_ = 1 + 3 * n_world; // number of constraints
|
||||
|
||||
// define the objective: k * sum(a[m] * w[m] + b[m] * n[m]) + kappa + k * sum(c[m])
|
||||
model.lp_.sense_ = ObjSense::kMinimize;
|
||||
|
@ -1246,10 +1268,8 @@ static bool assign_layers_to_device(
|
|||
|
||||
// constraint bound 4: CUDA/shared memory constraint for CUDA/Metal devices
|
||||
for (uint32_t m = 0; m < n_world; ++m) {
|
||||
if (dev_gpu[m]) {
|
||||
model.lp_.row_upper_[constraint_idx] = W * vec_z_gpu[m];
|
||||
constraint_idx++;
|
||||
}
|
||||
model.lp_.row_upper_[constraint_idx] = W * vec_z_gpu[m];
|
||||
constraint_idx++;
|
||||
}
|
||||
|
||||
// define the constraint matrix
|
||||
|
@ -1278,15 +1298,14 @@ static bool assign_layers_to_device(
|
|||
|
||||
if (in_set(m, M1) || in_set(m, M2)) { // in sets M1 and M2
|
||||
A[cons_row][m] = -1.0; // coefficient for w[m]
|
||||
A[cons_row][m + n_world] = 0.0; // coefficient for n[m]
|
||||
} else if (in_set(m, M3)) { // in set M3
|
||||
A[cons_row][m] = -1.0; // coefficient for w[m]
|
||||
A[cons_row][m + n_world] = 1.0; // coefficient for n[m]
|
||||
if (dev_gpu[m]) {
|
||||
A[cons_row][m + n_world] = 1.0; // coefficient for n[m]
|
||||
}
|
||||
} else { // in set M4
|
||||
A[cons_row][m] = 1.0; // coefficient for w[m]
|
||||
if (is_macos) {
|
||||
A[cons_row][m + n_world] = 0.0; // coefficient for n[m]
|
||||
} else {
|
||||
if (!is_macos && dev_gpu[m]) {
|
||||
A[cons_row][m + n_world] = -1.0; // coefficient for n[m]
|
||||
}
|
||||
}
|
||||
|
@ -1295,11 +1314,8 @@ static bool assign_layers_to_device(
|
|||
|
||||
// constraint coefficients 4: CUDA/shared memory constraint for CUDA/Metal devices
|
||||
for (uint32_t m = 0; m < n_world; ++m) {
|
||||
if (dev_gpu[m]) {
|
||||
A[constraint_idx][m] = 0.0; // coefficient for w[m]
|
||||
A[constraint_idx][m + n_world] = 1.0; // coefficient for n[m]
|
||||
constraint_idx++;
|
||||
}
|
||||
A[constraint_idx][m + n_world] = 1.0; // coefficient for n[m]
|
||||
constraint_idx++;
|
||||
}
|
||||
|
||||
// translate the constraint matrix A into the LP model
|
||||
|
@ -1353,6 +1369,13 @@ static bool assign_layers_to_device(
|
|||
best_k = k;
|
||||
best_solution = solution.col_value;
|
||||
}
|
||||
|
||||
LOG_INF("k = %2d, obj = %7.1f, solution: %s | best_k = %2d, best_obj = %7.1f, best_solution: %s\n",
|
||||
k, objective_value, vec_to_str(solution.col_value).c_str(), best_k, best_objective, vec_to_str(best_solution).c_str());
|
||||
}
|
||||
|
||||
if (best_objective > final_objective) {
|
||||
break; // avoid oscillation between two set assignments
|
||||
}
|
||||
|
||||
// update w[m] and n[m]
|
||||
|
@ -1382,19 +1405,29 @@ static bool assign_layers_to_device(
|
|||
LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
|
||||
LOG_INF("------------------------------------------");
|
||||
|
||||
// copy value from w and n to n_layer_window and n_gpu_layers, respectively
|
||||
std::copy(w.begin(), w.end(), n_layer_window);
|
||||
std::copy(n.begin(), n.end(), n_gpu_layers);
|
||||
|
||||
#else
|
||||
(void)bi;
|
||||
(void)bo;
|
||||
(void)kappa;
|
||||
(void)cparams;
|
||||
(void)min_disk_read_speed;
|
||||
(void)n_vocab;
|
||||
(void)GIGABYTE;
|
||||
|
||||
std::copy(w.begin(), w.end(), n_layer_window);
|
||||
for (uint32_t m = 0; m < n_world; ++m) {
|
||||
const device_info & dev = dev_info_set[m];
|
||||
if (dev.gpu_support.cuda || dev.gpu_support.metal) {
|
||||
n_gpu_layers[m] = w[m];
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// copy value from w and n to n_layer_window and n_gpu_layers, respectively
|
||||
std::copy(w.begin(), w.end(), n_layer_window);
|
||||
std::copy(n.begin(), n.end(), n_gpu_layers);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue