use highs to solve the allocation program

Repository: https://github.com/Lizonghang/prima.cpp.git
Commit: 5d9aadf3d5 (parent: b577c10d25)

6 changed files with 614 additions and 86 deletions

Makefile | 8
Makefile

@@ -264,11 +264,11 @@ MK_CXXFLAGS = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
 
 ifeq ($(UNAME_S),Darwin)
-    MK_CPPFLAGS += -I/opt/homebrew/include
-    MK_LDFLAGS += -L/opt/homebrew/lib -lzmq
+    MK_CPPFLAGS += -isystem /opt/homebrew/include -isystem /opt/homebrew/include/highs
+    MK_LDFLAGS += -L/opt/homebrew/lib -lzmq -lhighs
else ifeq ($(UNAME_S),Linux)
-    MK_CPPFLAGS += -I/usr/local/include
-    MK_LDFLAGS += -L/usr/local/lib -lzmq
+    MK_CPPFLAGS += -isystem /usr/local/include -isystem /usr/local/include/highs
+    MK_LDFLAGS += -L/usr/local/lib -lzmq -lhighs
endif
 
 ifdef LLAMA_NO_CCACHE
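With -lhighs added above, the new allocation code later in this diff can formulate its layer-assignment problem as a small mixed-integer program and hand it to the HiGHS solver. The standalone program below is a minimal sketch of the same column-wise HighsModel interface that assign_device() uses; it is not part of the commit. The toy objective, the file name, and the build line are illustrative, and the include/library paths are assumed to match the Makefile above.

// build (illustrative): g++ -std=c++11 highs_demo.cpp -I/usr/local/include/highs -L/usr/local/lib -lhighs
#include "Highs.h"
#include <cstdio>
#include <vector>

int main() {
    // toy MIP: minimize x + 2y  s.t.  x + y >= 3,  0 <= x,y <= 4,  x,y integer
    HighsModel model;
    model.lp_.num_col_   = 2;
    model.lp_.num_row_   = 1;
    model.lp_.sense_     = ObjSense::kMinimize;
    model.lp_.col_cost_  = {1.0, 2.0};
    model.lp_.col_lower_ = {0.0, 0.0};
    model.lp_.col_upper_ = {4.0, 4.0};
    model.lp_.row_lower_ = {3.0};
    model.lp_.row_upper_ = {1.0e30};

    // constraint matrix in column-wise (CSC) form: column j occupies
    // entries start_[j] .. start_[j+1]-1 of index_/value_
    model.lp_.a_matrix_.format_ = MatrixFormat::kColwise;
    model.lp_.a_matrix_.start_  = {0, 1, 2};
    model.lp_.a_matrix_.index_  = {0, 0};
    model.lp_.a_matrix_.value_  = {1.0, 1.0};

    // mark both variables as integer (turns the LP into a MIP)
    model.lp_.integrality_ = std::vector<HighsVarType>(2, HighsVarType::kInteger);

    Highs highs;
    highs.setOptionValue("log_to_console", false);
    if (highs.passModel(model) != HighsStatus::kOk)              return 1;
    if (highs.run()            != HighsStatus::kOk)              return 1;
    if (highs.getModelStatus() != HighsModelStatus::kOptimal)    return 1;

    const HighsSolution & sol = highs.getSolution();
    printf("x = %.0f, y = %.0f, objective = %.1f\n",
           sol.col_value[0], sol.col_value[1],
           highs.getInfo().objective_function_value);
    return 0;
}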
@@ -9,6 +9,7 @@
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "Highs.h"
 
 #include <algorithm>
 #include <cinttypes>
@@ -28,8 +29,6 @@
 #include <vector>
 #include <thread>
 
-#define DEFAULT_N_LAYER_WINDOW 4
-
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
 #include <sys/sysctl.h>
@@ -72,6 +71,8 @@
 
 using json = nlohmann::ordered_json;
 
+constexpr int GIGABYTE = 1024 * 1024 * 1024;
+
 //
 // CPU utils
 //
@@ -364,11 +365,6 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
     return true;
 }
 
-template <size_t N>
-void copy_n_layer_window(const uint32_t (&source)[N], uint32_t * destination) {
-    std::copy(std::begin(source), std::end(source), destination);
-}
-
 void gpt_init() {
     llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
         if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
@@ -822,30 +818,531 @@ std::string fs_get_cache_file(const std::string & filename) {
     return cache_directory + filename;
 }
 
-//
-// Model utils
-//
-
-static void llama_assign_n_layer_window(
+static void assign_device(
     uint32_t n_world,
     uint32_t my_rank,
     const device_info * dev_info_set,
     uint32_t * n_layer_window,
-    struct llama_model * model) {
+    uint32_t * n_gpu_layers,
+    struct llama_model * model,
+    const struct llama_context_params cparams,
+    float min_disk_read_speed = 0.1f) { // minimum disk I/O speed: 100 MB/s
     GGML_ASSERT(dev_info_set != nullptr);
     GGML_ASSERT(n_layer_window != nullptr);
+    GGML_ASSERT(my_rank == 0);
 
-    uint32_t n_layer = llama_model_n_layers(model);
+    // if only 1 device, it is assigned all layers
+    const uint32_t n_layer = llama_model_n_layers(model);
     if (n_world == 1) {
         n_layer_window[0] = n_layer;
         return;
     }
 
-    (void)my_rank;
-    std::fill_n(n_layer_window, n_world, DEFAULT_N_LAYER_WINDOW);
+    const device_info &master = dev_info_set[0];
+
+    // model-specific constants
+    const int n_embd_k_gqa = llama_model_n_embd_k_gqa(model);
+    const int n_embd_v_gqa = llama_model_n_embd_v_gqa(model);
+    const int n_kv         = 16;
+
+    const int64_t b       = dev_info_set[0].model_bytes.nb_layer;
+    const int64_t bi      = dev_info_set[0].model_bytes.nb_input;
+    const int64_t bo      = dev_info_set[0].model_bytes.nb_output;
+    const int64_t b_prime = b + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;
+
+    // device-specific constants
+    std::vector<float> alpha(n_world, 0.0f);
+    std::vector<float> beta (n_world, 0.0f);
+    std::vector<float> xi   (n_world, 0.0f);
+    float kappa = 0.0f;
+    std::vector<int>   w(n_world, 0);
+    std::vector<int>   n(n_world, 0);
+    std::vector<float> mem_budget(n_world, 0.0f);
+
+    // -------- Compute alpha[m], beta[m], xi[m] --------
+    for (uint32_t m = 0; m < n_world; ++m) {
+        // alpha[m]
+        const device_info &dev = dev_info_set[m];
+        float t_calc_cpu = (
+            master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
+            master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
+            master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
+            master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
+            master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
+            master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+        float t_kv_cpy_cpu   = dev.memory.mem_cpy_delay; // in ms
+        float t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
+
+        alpha[m] = t_calc_cpu + t_kv_cpy_cpu + t_read_ram_cpu; // in ms
+
+        // beta[m]
+        float t_calc_gpu     = 0.0;
+        float t_kv_cpy_gpu   = 0.0;
+        float t_read_ram_gpu = 0.0;
+
+        if (dev.gpu_support.metal) {
+            t_calc_gpu = (
+                master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
+                master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+            t_kv_cpy_gpu   = dev.gpu_props.metal_mem_cpy_delay; // in ms
+            t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
+        } else if (dev.gpu_support.cuda) {
+            t_calc_gpu = (
+                master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
+                master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+            t_kv_cpy_gpu   = dev.gpu_props.cuda_mem_cpy_delay; // in ms
+            t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
+        }
+
+        beta[m] = t_calc_gpu - t_calc_cpu + t_kv_cpy_gpu - t_kv_cpy_cpu + t_read_ram_gpu - t_read_ram_cpu; // in ms
+
+        // xi[m]
+        // the ram-vram and vram-ram transfer time and the communication time are less than 1 ms
+        xi[m] = 0.0;
+    }
+
+    // we adopt an iterative optimization approach. Initially, $w_m$ is set proportionally
+    // based on the available memory budget
+    // - $d_m^{\text{avail}}$ for macOS without Metal and Linux
+    // - $d_m^{\text{total}}$ for macOS with Metal
+    // - $d_m^{\text{avail}}+d_m^{\text{swapout}}$ for Android
+    // and $n_m$ is initialized to 0.
+    for (uint32_t m = 0; m < n_world; ++m) {
+        const device_info &dev = dev_info_set[m];
+        GGML_ASSERT(dev.device_os != nullptr);
+
+        bool is_macos   = strcmp(dev.device_os, "macOS")   == 0;
+        bool is_linux   = strcmp(dev.device_os, "Linux")   == 0;
+        bool is_android = strcmp(dev.device_os, "Android") == 0;
+        bool is_windows = strcmp(dev.device_os, "Windows") == 0;
+        GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
+
+        if ((is_macos && !dev.gpu_support.metal) || is_linux) {
+            mem_budget[m] = dev.memory.available_physical;
+        } else if (is_macos && dev.gpu_support.metal) {
+            mem_budget[m] = dev.memory.total_physical;
+        } else if (is_android) {
+            mem_budget[m] = dev.memory.available_physical + dev.memory.used_can_swap;
+        } else {
+            // todo: add support for other OS such as Windows
+            GGML_ASSERT(false && "Unsupported OS\n");
+        }
+    }
+
+    // initialize w_m proportionally to memory budget and n_m to 0
+    float total_mem_budget = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
+    for (uint32_t m = 0; m < n_world; ++m) {
+        w[m] = std::round(mem_budget[m] / total_mem_budget * n_layer);
+        n[m] = 0;
+    }
+
+    // stores the actual read bandwidth (GB/s) for each device
+    std::vector<float> disk_speed(n_world, 0.0f);
+    for (uint32_t m = 0; m < n_world; ++m) {
+        const device_info &dev = dev_info_set[m];
+        GGML_ASSERT(dev.device_os != nullptr);
+        bool is_linux = strcmp(dev.device_os, "Linux") == 0;
+
+        if (is_linux) {
+            disk_speed[m] = dev.disk.read_seq_bw;
+        } else {
+            disk_speed[m] = dev.disk.read_rnd_bw;
+        }
+    }
+
+    // helper function to find valid factors for a given n_layers
+    auto find_factors = [&](int n_layers) {
+        std::vector<int> factors;
+        for (int k = 1; k <= n_layers / 2; ++k) {
+            if (n_layers % k == 0) {
+                factors.push_back(k);
+            }
+        }
+        return factors;
+    };
+
+    // get valid factors
+    std::vector<int> valid_k = find_factors(n_layer);
+
+    // assign devices to sets M1, M2, M3, and M4
+    // M1: devices running on macOS without Metal, and with insufficient memory
+    // M2: devices running on macOS with Metal and insufficient memory
+    // M3: devices running on Linux or Android and with insufficient memory
+    // M4: devices with sufficient memory or very slow disk I/O (slower than min_disk_io_speed)
+    std::vector<uint32_t> M1, M2, M3, M4, M1_prev, M2_prev, M3_prev, M4_prev;
+    std::vector<int64_t>  c_cpu(n_world, 0), c_gpu(n_world, 0);
+
+    // helper function to check if a device is in a specific set
+    auto in_set = [&](uint32_t m, const std::vector<uint32_t> & M) {
+        return (std::find(M.begin(), M.end(), m) != M.end());
+    };
+
+    auto assign_sets = [&](int k) -> bool {
+        M1.clear(), M2.clear(), M3.clear(), M4.clear();
+
+        for (uint32_t m = 0; m < n_world; ++m) {
+            const device_info &dev = dev_info_set[m];
+
+            GGML_ASSERT(dev.device_os != nullptr);
+            bool is_macos   = strcmp(dev.device_os, "macOS")   == 0;
+            bool is_linux   = strcmp(dev.device_os, "Linux")   == 0;
+            bool is_android = strcmp(dev.device_os, "Android") == 0;
+            bool is_windows = strcmp(dev.device_os, "Windows") == 0;
+            GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
+
+            llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, dev.gpu_support.metal, m == 0, w[m] * k, n[m] * k);
+
+            int l_m     = w[m] * k; // total number of layers assigned to device m
+            int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
+            bool condition1   = l_m * b + (bi + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] > mem_budget[m] * GIGABYTE;
+            bool condition2   = l_m * b + (bi + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] + c_gpu[m] > mem_budget[m] * GIGABYTE;
+            bool condition3   = (l_m - l_m_gpu) * b_prime + (bi + bo) * int(m == 0) + c_cpu[m] > mem_budget[m] * GIGABYTE;
+            bool is_slow_disk = disk_speed[m] < min_disk_read_speed;
+
+            if (is_macos && !dev.gpu_support.metal && condition1 && !is_slow_disk) {
+                // case 1: macOS without Metal, and with insufficient memory
+                M1.push_back(m);
+            } else if (is_macos && dev.gpu_support.metal && condition2 && !is_slow_disk) {
+                // case 2: macOS with Metal, and with insufficient memory
+                M2.push_back(m);
+            } else if ((is_linux || is_android) && condition3 && !is_slow_disk) {
+                // case 3: Linux with insufficient memory
+                M3.push_back(m);
+            } else {
+                // case 4: otherwise, assigned to M4
+                M4.push_back(m);
+            }
+        }
+
+        // check whether the sets are changed
+        bool sets_changed = (M1 != M1_prev || M2 != M2_prev || M3 != M3_prev || M4 != M4_prev);
+
+        // update the previous sets
+        M1_prev = M1, M2_prev = M2, M3_prev = M3, M4_prev = M4;
+
+        return sets_changed;
+    };
+
+    // helper function to print a matrix
+    auto print_matrix = [](const std::vector<std::vector<double>>& matrix) {
+        for (const auto& row : matrix) {
+            for (const auto& elem : row) {
+                printf("%.3f ", elem);
+            }
+            printf("\n");
+        }
+    };
+
+    double final_objective = 1.0e30;
+    std::vector<double> final_solution;
+    int final_k = -1;
+
+    // iterative optimization to find a valid set assignment (M1, M2, M3, M4)
+    while (true) {
+        int W = std::accumulate(w.begin(), w.end(), 0);
+        int cur_k = (int)n_layer / W;
+        GGML_ASSERT(W > 1 && (int)n_layer % W == 0 && "Constraint: L = k * W must hold\n");
+
+        if (!assign_sets(cur_k)) break;
+
+        // update kappa
+        for (uint32_t m = 0; m < n_world; ++m) {
+            const device_info &dev = dev_info_set[m];
+            GGML_ASSERT(dev.device_os != nullptr);
+            bool is_android = strcmp(dev.device_os, "Android") == 0;
+
+            if (m == 0) {
+                kappa = (bi + bo) / (disk_speed[m] * 1e9) * 1000; // in ms
+            }
+            if (in_set(m, M3)) {
+                kappa += (c_cpu[m] - dev.memory.available_physical * GIGABYTE - dev.memory.used_can_swap * GIGABYTE * int(is_android)) / (disk_speed[m] * 1e9) * 1000; // in ms
+            }
+        }
+
+        // -------------------------------------------------------------
+        // Construct vectors va, vb, vc
+        // -------------------------------------------------------------
+        // a[m], b[m], c[m] are computed based on divisions M1, M2, M3, and M4:
+        // - M1: a[m] = alpha[m] + b / s_m^{disk},  b[m] = 0,                        c[m] = xi[m]
+        // - M2: a[m] = alpha[m] + b / s_m^{disk},  b[m] = beta[m],                  c[m] = xi[m]
+        // - M3: a[m] = alpha[m] + b' / s_m^{disk}, b[m] = beta[m] - b'/ s_m^{disk}, c[m] = xi[m]
+        // - M4: a[m] = alpha[m],                   b[m] = beta[m],                  c[m] = xi[m]
+        std::vector<float> vec_a(n_world, 0.0f), vec_b(n_world, 0.0f), vec_c(n_world, 0.0f);
+
+        for (uint32_t m = 0; m < n_world; ++m) {
+            if (in_set(m, M1)) {
+                vec_a[m] = alpha[m] + b / (disk_speed[m] * 1e9) * 1000; // in ms
+                vec_b[m] = 0.0f;
+                vec_c[m] = xi[m];
+            } else if (in_set(m, M2)) {
+                vec_a[m] = alpha[m] + b / (disk_speed[m] * 1e9) * 1000; // in ms
+                vec_b[m] = beta[m];
+                vec_c[m] = xi[m];
+            } else if (in_set(m, M3)) {
+                vec_a[m] = alpha[m] + b_prime / (disk_speed[m] * 1e9) * 1000; // in ms
+                vec_b[m] = beta[m] - b_prime / (disk_speed[m] * 1e9) * 1000; // in ms
+                vec_c[m] = xi[m];
+            } else {
+                vec_a[m] = alpha[m];
+                vec_b[m] = beta[m];
+                vec_c[m] = xi[m];
+            }
+        }
+
+        // -------------------------------------------------------------
+        // Construct vectors vz, vz_cuda
+        // -------------------------------------------------------------
+        // z and z_cuda are used to express memory constraints:
+        // for z:
+        // - M1: (d_m^{avail} - b_cio) / (L*b')
+        // - M2: (d_m^{total} - b_cio - c_gpu) / (L*b')
+        // - M3: (d_m^{avail}+d_m^{swapout} - b_cio) / (L*b')
+        // - M4: - (d_m^{avail} - b_cio) / (L*b') on macOS without Metal,
+        //       or - (d_m^{total} - b_cio - c_gpu) / (L*b') on macOS with Metal,
+        //       or - (d_m^{avail}+d_m^{swapout} - b_cio) / (L*b') on Linux or Android
+        //
+        // for z_cuda:
+        // - M1: (d_{m,cuda}^{avail} - c_gpu) / (L*b'),
+        //   d_{m,cuda}^{avail} is non-zero only if the device supports CUDA
+        std::vector<float> vec_z(n_world, 0.0f), vec_z_cuda(n_world, 0.0f);
+        std::vector<int> dev_cuda(n_world, 0);
+
+        for (uint32_t m = 0; m < n_world; ++m) {
+            const device_info &dev = dev_info_set[m];
+
+            GGML_ASSERT(dev.device_os != nullptr);
+            bool is_macos   = strcmp(dev.device_os, "macOS")   == 0;
+            bool is_android = strcmp(dev.device_os, "Android") == 0;
+            bool is_windows = strcmp(dev.device_os, "Windows") == 0;
+            GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
+
+            int64_t b_cio = (bi + bo) * int(m == 0) + c_cpu[m];
+
+            if (in_set(m, M1)) {
+                vec_z[m] = (double)(dev.memory.available_physical * GIGABYTE - b_cio) / (double)(n_layer * b_prime);
+            } else if (in_set(m, M2)) {
+                vec_z[m] = (double)(dev.memory.total_physical * GIGABYTE - b_cio - c_gpu[m]) / (double)(n_layer * b_prime);
+            } else if (in_set(m, M3)) {
+                vec_z[m] = (double)(dev.memory.available_physical * GIGABYTE + dev.memory.used_can_swap * GIGABYTE * int(is_android) - b_cio) / (double)(n_layer * b_prime);
+            } else {
+                if (is_macos && !dev.gpu_support.metal) {
+                    vec_z[m] = - (double)(dev.memory.available_physical * GIGABYTE - b_cio) / (double)(n_layer * b_prime);
+                } else if (is_macos && dev.gpu_support.metal) {
+                    vec_z[m] = - (double)(dev.memory.total_physical * GIGABYTE - b_cio - c_gpu[m]) / (double)(n_layer * b_prime);
+                } else {
+                    vec_z[m] = - (double)(dev.memory.available_physical * GIGABYTE + dev.memory.used_can_swap * GIGABYTE * int(is_android) - b_cio) / (double)(n_layer * b_prime);
+                }
+            }
+
+            if (dev.gpu_support.cuda) {
+                vec_z_cuda[m] = (double)(dev.gpu_props.memory_free * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
+                dev_cuda[m] = 1;
+            } else {
+                vec_z_cuda[m] = -(double)c_gpu[m] / (double)(n_layer * b_prime);
+            }
+        }
+
+        // count the number of cuda devices
+        int num_dev_cuda = std::accumulate(dev_cuda.begin(), dev_cuda.end(), 0);
+
+        // -------------------------------------------------------------
+        // Build and solve the optimization model
+        // -------------------------------------------------------------
+        double best_objective = 1.0e30;
+        std::vector<double> best_solution;
+        int best_k = -1;
+
+        // iterate over all possible values of k to find the best solution
+        for (int k : valid_k) {
+            GGML_ASSERT(n_layer % k == 0 && "Constraint: L = k * W must hold\n");
+            int W = n_layer / k;
+
+            HighsModel model;
+
+            // define the number of decision variables and constraints
+            model.lp_.num_col_ = n_world * 2;                    // number of decision variables
+            model.lp_.num_row_ = 1 + 2 * n_world + num_dev_cuda; // number of constraints
+
+            // define the objective: k * sum(a[m] * w[m] + b[m] * n[m]) + kappa + k * sum(c[m])
+            model.lp_.sense_  = ObjSense::kMinimize;
+            model.lp_.offset_ = k * std::accumulate(vec_c.begin(), vec_c.end(), 0.0f) + kappa;
+            model.lp_.col_cost_.clear();
+            std::copy(vec_a.begin(), vec_a.end(), std::back_inserter(model.lp_.col_cost_));
+            std::copy(vec_b.begin(), vec_b.end(), std::back_inserter(model.lp_.col_cost_));
+            std::transform(
+                model.lp_.col_cost_.begin(),
+                model.lp_.col_cost_.end(),
+                model.lp_.col_cost_.begin(), [k](double cost) {
+                    return cost * k;
+                }
+            );
+
+            // define the variable bounds
+            model.lp_.col_lower_ = std::vector<double>(n_world * 2, 0.0);
+            std::fill(model.lp_.col_lower_.begin(), model.lp_.col_lower_.begin() + n_world, 1.0);
+            model.lp_.col_upper_ = std::vector<double>(n_world * 2, n_layer);
+
+            // define the constraint bounds
+            int constraint_idx = 0;
+            model.lp_.row_lower_ = std::vector<double>(model.lp_.num_row_, -1.0e30); // initialize to a large negative value
+            model.lp_.row_upper_ = std::vector<double>(model.lp_.num_row_,  1.0e30); // initialize to a large positive value
+
+            // constraint bound 1: sum(w[m]) = W
+            model.lp_.row_lower_[constraint_idx] = {(double)W};
+            model.lp_.row_upper_[constraint_idx] = {(double)W};
+            constraint_idx++;
+
+            // constraint bound 2: n[m] <= w[m], m = 1, 2, ..., n_world
+            std::fill_n(model.lp_.row_upper_.begin() + constraint_idx, n_world, 0.0); // constraint: -w[m] + n[m] <= 0.0
+            constraint_idx += n_world;
+
+            // constraint bound 3: RAM constraint for each device
+            for (uint32_t m = 0; m < n_world; ++m) {
+                model.lp_.row_upper_[constraint_idx + m] = -W * vec_z[m];
+            }
+            constraint_idx += n_world;
+
+            // constraint bound 4: CUDA memory constraint for CUDA devices
+            for (uint32_t m = 0; m < n_world; ++m) {
+                if (dev_cuda[m]) {
+                    model.lp_.row_upper_[constraint_idx] = W * vec_z_cuda[m];
+                    constraint_idx++;
+                }
+            }
+
+            // define the constraint matrix
+            const int n_rows = model.lp_.num_row_;
+            const int n_cols = model.lp_.num_col_;
+            std::vector<std::vector<double>> A(n_rows, std::vector<double>(n_cols, 0.0));
+            constraint_idx = 0;
+
+            // constraint coefficients 1: sum(w[m]) = W
+            std::fill_n(A[constraint_idx].begin(), n_world, 1.0);
+            constraint_idx++;
+
+            // constraint coefficients 2: n[m] <= w[m], m = 1, 2, ..., n_world
+            for (uint32_t m = 0; m < n_world; ++m) {
+                A[constraint_idx + m][m]           = -1.0; // coefficient for w[m]
+                A[constraint_idx + m][m + n_world] =  1.0; // coefficient for n[m]
+            }
+            constraint_idx += n_world;
+
+            // constraint coefficients 3: RAM constraint for each device
+            for (uint32_t m = 0; m < n_world; ++m) {
+                const device_info &dev = dev_info_set[m];
+                GGML_ASSERT(dev.device_os != nullptr);
+                bool is_macos = strcmp(dev.device_os, "macOS") == 0;
+                int cons_row = constraint_idx + m;
+
+                if (in_set(m, M1) || in_set(m, M2)) { // in sets M1 and M2
+                    A[cons_row][m]           = -1.0; // coefficient for w[m]
+                    A[cons_row][m + n_world] =  0.0; // coefficient for n[m]
+                } else if (in_set(m, M3)) { // in set M3
+                    A[cons_row][m]           = -1.0; // coefficient for w[m]
+                    A[cons_row][m + n_world] =  1.0; // coefficient for n[m]
+                } else { // in set M4
+                    A[cons_row][m] = 1.0; // coefficient for w[m]
+                    if (is_macos) {
+                        A[cons_row][m + n_world] =  0.0; // coefficient for n[m]
+                    } else {
+                        A[cons_row][m + n_world] = -1.0; // coefficient for n[m]
+                    }
+                }
+            }
+            constraint_idx += n_world;
+
+            // constraint coefficients 4: CUDA memory constraint for CUDA devices
+            for (uint32_t m = 0; m < n_world; ++m) {
+                if (dev_cuda[m]) {
+                    A[constraint_idx][m]           = 0.0; // coefficient for w[m]
+                    A[constraint_idx][m + n_world] = 1.0; // coefficient for n[m]
+                    constraint_idx++;
+                }
+            }
+
+            // translate the constraint matrix A into the LP model
+            model.lp_.a_matrix_.format_ = MatrixFormat::kColwise;
+            model.lp_.a_matrix_.start_.resize(n_cols + 1);
+            model.lp_.a_matrix_.index_.clear();
+            model.lp_.a_matrix_.value_.clear();
+
+            int nnz_count = 0; // number of non-zero elements
+            for (int j = 0; j < n_cols; ++j) {
+                model.lp_.a_matrix_.start_[j] = nnz_count;
+                for (int i = 0; i < n_rows; ++i) {
+                    if (A[i][j] != 0.0) {
+                        model.lp_.a_matrix_.index_.push_back(i);
+                        model.lp_.a_matrix_.value_.push_back(A[i][j]);
+                        nnz_count++;
+                    }
+                }
+            }
+            model.lp_.a_matrix_.start_[n_cols] = nnz_count;
+
+            // integer constraints
+            model.lp_.integrality_ = std::vector<HighsVarType>(n_world * 2, HighsVarType::kInteger);
+
+            // solve the optimization problem
+            Highs highs;
+            highs.setOptionValue("log_to_console", false); // disable logging
+
+            HighsStatus return_status = highs.passModel(model);
+            GGML_ASSERT(return_status == HighsStatus::kOk && "Failed to pass model\n");
+
+            // run the solver
+            return_status = highs.run();
+            GGML_ASSERT(return_status == HighsStatus::kOk && "Failed to run the solver\n");
+
+            // get the solution
+            const HighsModelStatus& model_status = highs.getModelStatus();
+            if (model_status != HighsModelStatus::kOptimal) continue;
+
+            // record the best solution
+            const HighsSolution& solution = highs.getSolution();
+            double objective_value = highs.getInfo().objective_function_value;
+            if (objective_value < best_objective) {
+                best_objective = objective_value;
+                best_k = k;
+                best_solution = solution.col_value;
+            }
+        }
+
+        // update w[m] and n[m]
+        GGML_ASSERT(best_solution.size() == n_world * 2 && "Invalid solution\n");
+        std::copy(best_solution.begin(), best_solution.begin() + n_world, w.begin());
+        std::copy(best_solution.begin() + n_world, best_solution.end(), n.begin());
+
+        // update the global best solution
+        final_k = best_k;
+        final_objective = best_objective;
+        final_solution = best_solution;
+    }
+
+    LOG_INF("Global best solution found for k = %d\n", final_k);
+    for (uint32_t m = 0; m < n_world; ++m) {
+        const char * device_name = dev_info_set[m].device_name;
+        GGML_ASSERT(final_solution[m] == w[m] && final_solution[m + n_world] == n[m]);
+        LOG_INF("Device %s (m = %d): w = %d, n = %d\n", device_name, m, w[m], n[m]);
+    }
+    LOG_INF("Objective value: %.3f\n", final_objective);
+
+    // copy value from w and n to n_layer_window and n_gpu_layers, respectively
+    std::copy(w.begin(), w.end(), n_layer_window);
+    std::copy(n.begin(), n.end(), n_gpu_layers);
 }
 
+//
+// Model utils
+//
+
 struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
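Taken together, the comments and constraint coefficients in assign_device() above correspond, for a fixed factor k with L = k*W, to the following mixed-integer program over the per-device window sizes w_m and GPU-layer counts n_m, where a_m, b_m, c_m, kappa, z_m, and z_m^cuda stand for the vec_a, vec_b, vec_c, kappa, vec_z, and vec_z_cuda values computed per device. This formulation is reconstructed from the code as a reading aid and is not text taken from the commit.

\begin{aligned}
\min_{w,\,n}\quad & k \sum_{m=1}^{M} \bigl( a_m w_m + b_m n_m \bigr) + \kappa + k \sum_{m=1}^{M} c_m \\
\text{s.t.}\quad & \sum_{m=1}^{M} w_m = W, \qquad L = k\,W, \\
& n_m \le w_m, \qquad 1 \le w_m \le L, \qquad 0 \le n_m \le L, \\
& w_m \ge W z_m \quad (m \in M_1 \cup M_2), \qquad w_m - n_m \ge W z_m \quad (m \in M_3), \\
& w_m \le -W z_m \quad (m \in M_4,\ \text{macOS}), \qquad w_m - n_m \le -W z_m \quad (m \in M_4,\ \text{other OS}), \\
& n_m \le W z^{\mathrm{cuda}}_m \quad (\text{CUDA devices}), \qquad w_m, n_m \in \mathbb{Z}.
\end{aligned}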
@@ -914,30 +1411,40 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
         dev_info_set[0] = dev_info;
         llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world, model, cparams);
     } else {
         llama_send_device_info(lctx, &dev_info);
     }
 
-    uint32_t n_layer_window[32] = {0};
+    uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};
     if (my_rank == 0) {
         if (n_world == 1 || params.n_layer_window[0] == 0) {
-            llama_assign_n_layer_window(n_world, my_rank, dev_info_set, n_layer_window, model);
+            // automatically determine n_layer_window and n_gpu_layers
+            assign_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams);
         } else {
-            copy_n_layer_window(params.n_layer_window, n_layer_window);
+            // use manually set n_layer_window
+            std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), n_layer_window);
        }
 
-        // synchronize the new n_layer_window to other nodes
-        llama_broadcast_n_layer_window(lctx, n_layer_window);
+        // synchronize the new n_layer_window and n_gpu_layers to other nodes
+        llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers);
     } else {
-        llama_recv_n_layer_window(lctx, n_layer_window);
+        llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
     }
 
-    // update n_layer_window
-    copy_n_layer_window(n_layer_window, params.n_layer_window);
-    copy_n_layer_window(n_layer_window, cparams.n_layer_window);
-    copy_n_layer_window(n_layer_window, mparams.n_layer_window);
-    copy_n_layer_window(n_layer_window, llama_context_n_layer_window(lctx));
+    // update n_layer_window and n_gpu_layers
+    std::copy(std::begin(n_layer_window), std::end(n_layer_window), params.n_layer_window);
+    std::copy(std::begin(n_layer_window), std::end(n_layer_window), cparams.n_layer_window);
+    std::copy(std::begin(n_layer_window), std::end(n_layer_window), mparams.n_layer_window);
+    std::copy(std::begin(n_layer_window), std::end(n_layer_window), llama_context_n_layer_window(lctx));
+
+    params.n_gpu_layers  = n_gpu_layers[my_rank];
+    cparams.n_gpu_layers = n_gpu_layers[my_rank];
+    mparams.n_gpu_layers = n_gpu_layers[my_rank];
+    llama_context_n_gpu_layers(lctx)[my_rank] = n_gpu_layers[my_rank];
+
+#ifdef LLAMA_DEBUG
+    device_print_props(dev_info_set, n_world, model, cparams);
+#endif
 
     if (!mparams.vocab_only && llm_load_tensors(ml, model, mparams) < 0) {
         LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
@@ -158,7 +158,7 @@ struct gpt_params {
     int32_t n_parallel         = 1;    // number of parallel sequences to decode
     int32_t n_sequences        = 1;    // number of sequences to decode
     float   p_split            = 0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers       = 0;    // number of layers to store in VRAM (0 - do not use by default)
     int32_t n_gpu_layers_draft = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
@@ -1312,8 +1312,8 @@ static float device_memory_access_delay(struct device_info & dev_info, struct ll
     auto n_bytes = dev_info.model_bytes;
     int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
 
-    uint64_t cpu_kv_size;
-    uint64_t gpu_kv_size;
+    int64_t cpu_kv_size;
+    int64_t gpu_kv_size;
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
@@ -1428,17 +1428,17 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 
     cpu_total_bytes += n_bytes.nb_output;
 
-    uint64_t cpu_kv_size;
-    uint64_t gpu_kv_size;
-    uint64_t cpu_compute_buf;
-    uint64_t gpu_compute_buf;
+    int64_t cpu_kv_size;
+    int64_t gpu_kv_size;
+    int64_t cpu_compute_buf;
+    int64_t gpu_compute_buf;
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_layers, n_gpu_layers);
 #else
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_layers, n_gpu_layers);
 #endif
 
     double cpu_kv_size_gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0; // convert to GiB
@@ -442,12 +442,12 @@ extern "C" {
 
     LLAMA_API void llama_free_model(struct llama_model * model);
 
     LLAMA_API void llama_init_sockets (struct llama_context * ctx, uint32_t n_world, uint32_t my_rank);
     LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg);
-    LLAMA_API int  llama_gather_device_info (struct llama_context * ctx, struct device_info * dev_info_set);
+    LLAMA_API int  llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
     LLAMA_API int  llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info);
-    LLAMA_API int  llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window);
-    LLAMA_API int  llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window);
+    LLAMA_API int  llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
+    LLAMA_API int  llama_recv_layer_setup  (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
 
     LLAMA_API int llm_load_tensors(
         struct llama_model_loader * ml,
@@ -465,6 +465,8 @@ extern "C" {
 
     LLAMA_API uint32_t * llama_context_n_layer_window(struct llama_context * ctx);
 
+    LLAMA_API uint32_t * llama_context_n_gpu_layers(struct llama_context * ctx);
+
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
@@ -536,11 +538,14 @@ extern "C" {
 
     // Return the size of compute buffer size, including input tensors and activations
     LLAMA_API void llama_model_compute_buf_size(
-            uint64_t * cpu_buf,
-            uint64_t * gpu_buf,
+            int64_t * cpu_buf,
+            int64_t * gpu_buf,
             const struct llama_model * model,
             const struct llama_context_params cparams,
-            bool use_gpu);
+            bool use_gpu,
+            bool is_master,
+            int n_layers,
+            int n_gpu_layers);
 
     // Return the size of KV cache in the model
     LLAMA_API void llama_total_kv_size(
@@ -551,8 +556,8 @@ extern "C" {
             bool use_gpu);
 
     LLAMA_API void llama_kv_size(
-            uint64_t * cpu_cache,
-            uint64_t * gpu_cache,
+            int64_t * cpu_cache,
+            int64_t * gpu_cache,
             const struct llama_model * model,
             const struct llama_context_params cparams,
             bool use_gpu);
@@ -19960,7 +19960,7 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
     return 0;
 }
 
-int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
+int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
     uint32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {
         return 0;
@@ -19973,6 +19973,9 @@ int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_laye
         send_msgs.emplace_back("n_layer_window", strlen("n_layer_window"));
         send_msgs.emplace_back(n_layer_window, sizeof(uint32_t) * 32);
 
+        send_msgs.emplace_back("n_gpu_layers", strlen("n_gpu_layers"));
+        send_msgs.emplace_back(n_gpu_layers, sizeof(uint32_t) * 32);
+
         zmq::send_multipart(*ctx->send_socket, send_msgs);
     } catch (const zmq::error_t& e) {
         LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
@@ -19982,7 +19985,7 @@ int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_laye
     return 0;
 }
 
-int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
+int llama_recv_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
     uint32_t n_world = ctx->cparams.n_world;
     uint32_t my_rank = ctx->cparams.rank;
 
@@ -19991,15 +19994,20 @@ int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_win
         return -1;
     }
 
-    std::string key = recv_msgs[0].to_string();
-    if (key != "n_layer_window") {
-        LLAMA_LOG_INFO("Unexpected message received: %s\n", key.c_str());
+    if (recv_msgs.size() != 4) { // expecting n_layer_windows and n_gpu_layers
+        LLAMA_LOG_INFO("Unexpected number of messages received: %zu\n", recv_msgs.size());
         return -1;
     }
 
-    zmq::message_t & data_msg = recv_msgs[1];
-    GGML_ASSERT(data_msg.size() == sizeof(uint32_t) * 32);
-    memcpy(n_layer_window, data_msg.data(), sizeof(uint32_t) * 32);
+    if (recv_msgs[0].to_string() != "n_layer_window" || recv_msgs[2].to_string() != "n_gpu_layers") {
+        LLAMA_LOG_INFO("Unexpected message received\n");
+        return -1;
+    }
+
+    GGML_ASSERT(recv_msgs[1].size() == sizeof(uint32_t) * 32);
+    GGML_ASSERT(recv_msgs[3].size() == sizeof(uint32_t) * 32);
+    memcpy(n_layer_window, recv_msgs[1].data(), sizeof(uint32_t) * 32);
+    memcpy(n_gpu_layers,   recv_msgs[3].data(), sizeof(uint32_t) * 32);
 
     if (my_rank != n_world - 1) {
         try {
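After this change, the layer-setup broadcast is a four-frame ZeroMQ multipart message: the literal key "n_layer_window", 32 uint32 values, the literal key "n_gpu_layers", and 32 more uint32 values. The self-contained program below is a sketch of that framing over an inproc PAIR socket, reusing the same cppzmq calls as the surrounding code; the socket setup and sample values are illustrative and not part of prima.cpp.

// build (illustrative): g++ -std=c++11 framing_demo.cpp -lzmq
#include <zmq.hpp>
#include <zmq_addon.hpp>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iterator>
#include <vector>

int main() {
    zmq::context_t ctx(1);
    zmq::socket_t sender(ctx, zmq::socket_type::pair);
    zmq::socket_t receiver(ctx, zmq::socket_type::pair);
    receiver.bind("inproc://layer-setup");
    sender.connect("inproc://layer-setup");

    // sample values for a 2-device world (hypothetical)
    uint32_t n_layer_window[32] = {16, 16};
    uint32_t n_gpu_layers[32]   = {8, 0};

    // four frames, mirroring llama_bcast_layer_setup
    std::vector<zmq::message_t> send_msgs;
    send_msgs.emplace_back("n_layer_window", strlen("n_layer_window"));
    send_msgs.emplace_back(n_layer_window, sizeof(uint32_t) * 32);
    send_msgs.emplace_back("n_gpu_layers", strlen("n_gpu_layers"));
    send_msgs.emplace_back(n_gpu_layers, sizeof(uint32_t) * 32);
    zmq::send_multipart(sender, send_msgs);

    // receive and validate, mirroring llama_recv_layer_setup
    std::vector<zmq::message_t> recv_msgs;
    zmq::recv_multipart(receiver, std::back_inserter(recv_msgs));

    uint32_t got_window[32], got_gpu[32];
    if (recv_msgs.size() == 4 &&
        recv_msgs[0].to_string() == "n_layer_window" &&
        recv_msgs[2].to_string() == "n_gpu_layers") {
        memcpy(got_window, recv_msgs[1].data(), sizeof(uint32_t) * 32);
        memcpy(got_gpu,    recv_msgs[3].data(), sizeof(uint32_t) * 32);
        printf("device 1: window = %u, gpu layers = %u\n", got_window[1], got_gpu[1]);
    }
    return 0;
}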
@@ -20511,6 +20519,10 @@ uint32_t * llama_context_n_layer_window(struct llama_context * ctx) {
     return ctx->cparams.n_layer_window;
 }
 
+uint32_t * llama_context_n_gpu_layers(struct llama_context * ctx) {
+    return ctx->cparams.n_gpu_layers;
+}
+
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
@@ -20909,47 +20921,51 @@ static void count_n_bytes(struct model_bytes * n_bytes, enum profiler_layer_type
 }
 
 void llama_model_compute_buf_size(
-        uint64_t * cpu_buf,
-        uint64_t * gpu_buf,
+        int64_t * cpu_buf,
+        int64_t * gpu_buf,
         const struct llama_model * model,
         const struct llama_context_params cparams,
-        bool use_gpu) {
+        bool use_gpu,
+        bool is_master,
+        int n_layers,
+        int n_gpu_layers) {
     const llama_hparams hparams = model->hparams;
 
     // input tensors
-    const uint64_t n_inp_toks = cparams.n_ubatch;
-    const uint64_t n_inp_embd = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_inp_toks = cparams.n_ubatch;
+    const int64_t n_inp_embd = hparams.n_embd * cparams.n_ubatch;
 
     // activations (see figures/memory-allocation-map-for-activations.png for detailed allocation)
-    const uint64_t n_bak_embd    = hparams.n_embd * cparams.n_ubatch;
-    const uint64_t n_inp_pos     = cparams.n_ubatch;
-    const uint64_t n_kq_mask     = cparams.n_ctx * cparams.n_ubatch;
-    const uint64_t n_inp_out_ids = cparams.n_ubatch;
-    const uint64_t n_norm        = hparams.n_embd * cparams.n_ubatch;
-    const uint64_t n_qcur        = hparams.n_embd * cparams.n_ubatch * 2;
-    const uint64_t n_kq          = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
+    const int64_t n_bak_embd    = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_inp_pos     = cparams.n_ubatch;
+    const int64_t n_kq_mask     = cparams.n_ctx * cparams.n_ubatch;
+    const int64_t n_inp_out_ids = cparams.n_ubatch;
+    const int64_t n_norm        = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_qcur        = hparams.n_embd * cparams.n_ubatch * 2;
+    const int64_t n_kq          = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
 
     // outputs
-    const uint64_t n_out_embd = hparams.n_embd * cparams.n_ubatch;
-    const uint64_t n_output   = hparams.n_vocab * cparams.n_ubatch;
+    const int64_t n_out_embd = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_output   = hparams.n_vocab * cparams.n_ubatch;
 
     // compute buffer size for input, each layer, and output
-    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
-    const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
+    const int64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
+    const int64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
                                 n_inp_out_ids + n_norm + n_qcur + n_kq
                                 ) * ggml_type_size(GGML_TYPE_F32);
-    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
+    const int64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
+
+    *cpu_buf = 0;
+    *gpu_buf = 0;
+    if (is_master) *cpu_buf = n_buf_inp + n_buf_out;
 
     if (use_gpu) {
-        *gpu_buf = n_buf_act;
-        if (llama_model_n_layers(model) > cparams.n_gpu_layers) {
-            *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
-        } else {
-            *cpu_buf = n_buf_inp + n_buf_out;
+        *gpu_buf += n_buf_act;
+        if (n_layers > n_gpu_layers) {
+            *cpu_buf += n_buf_act;
         }
     } else {
-        *gpu_buf = 0;
-        *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
+        *cpu_buf += n_buf_act;
     }
 }
 
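With the new is_master, n_layers, and n_gpu_layers parameters, the buffer accounting above can be summarized as follows; the notation is reconstructed from the code for readability, not taken from the commit, and [.] denotes the Iverson bracket.

\begin{aligned}
\text{cpu\_buf} &= [\text{is\_master}]\,(n_{\text{buf,inp}} + n_{\text{buf,out}}) + [\neg\text{use\_gpu} \lor n_{\text{layers}} > n_{\text{gpu\_layers}}]\; n_{\text{buf,act}} \\
\text{gpu\_buf} &= [\text{use\_gpu}]\; n_{\text{buf,act}}
\end{aligned}

Previously every node was charged the input and output buffers unconditionally; after this change only the master (rank 0) is, which is what lets assign_device() treat c_cpu and c_gpu per device.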
@@ -20973,8 +20989,8 @@ void llama_total_kv_size(
 }
 
 void llama_kv_size(
-        uint64_t * cpu_cache,
-        uint64_t * gpu_cache,
+        int64_t * cpu_cache,
+        int64_t * gpu_cache,
         const struct llama_model * model,
         const struct llama_context_params cparams,
         bool use_gpu) {