Merge branch 'dev' into lt_test

Merge dev branch updates into local branch lt_test.
leeetao 2025-02-23 08:35:45 +00:00
commit 7bf1b743fb
7 changed files with 147 additions and 65 deletions

View file

@@ -274,15 +274,10 @@ endif
ifeq ($(USE_HIGHS),1)
HIGHS_CPPFLAGS = -isystem /usr/local/include/highs
HIGHS_LDFLAGS = -L/usr/local/lib -lhighs
ifeq ($(UNAME_S),Darwin)
HIGHS_CPPFLAGS += -isystem /opt/homebrew/include/highs
HIGHS_LDFLAGS += -L/opt/homebrew/lib -lhighs
else ifneq ($(CONDA_PREFIX),)
HIGHS_CPPFLAGS += -isystem $(CONDA_PREFIX)/include -isystem $(CONDA_PREFIX)/include/highs
HIGHS_LDFLAGS += -L$(CONDA_PREFIX)/lib -Wl,-rpath,$(CONDA_PREFIX)/lib
endif
MK_CPPFLAGS += $(HIGHS_CPPFLAGS) -DUSE_HIGHS
MK_LDFLAGS += $(HIGHS_LDFLAGS)
endif
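Note (not part of the diff): a quick way to sanity-check that the HIGHS_CPPFLAGS / HIGHS_LDFLAGS above resolve correctly is to compile and link a tiny program against the standard HiGHS C++ API (Highs.h) using the same -isystem/-L/-lhighs flags. The LP below is made up purely for the check.

```cpp
// Minimal HiGHS build check: minimize x + y  subject to  x + y >= 1, x, y >= 0.
// Assumes only the standard HiGHS C++ API (Highs.h, HighsModel, HighsLp).
#include "Highs.h"
#include <cstdio>

int main() {
    HighsModel model;
    HighsLp & lp = model.lp_;
    lp.num_col_   = 2;
    lp.num_row_   = 1;
    lp.col_cost_  = {1.0, 1.0};
    lp.col_lower_ = {0.0, 0.0};
    lp.col_upper_ = {1e30, 1e30};          // effectively +inf
    lp.row_lower_ = {1.0};
    lp.row_upper_ = {1e30};
    // constraint matrix, column-wise: both columns have coefficient 1 in row 0
    lp.a_matrix_.format_ = MatrixFormat::kColwise;
    lp.a_matrix_.start_  = {0, 1, 2};
    lp.a_matrix_.index_  = {0, 0};
    lp.a_matrix_.value_  = {1.0, 1.0};

    Highs highs;
    highs.setOptionValue("output_flag", false);
    if (highs.passModel(model) != HighsStatus::kOk) return 1;
    if (highs.run() != HighsStatus::kOk) return 1;
    const HighsSolution & sol = highs.getSolution();
    printf("x = %.3f, y = %.3f\n", sol.col_value[0], sol.col_value[1]);
    return 0;
}
```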

View file

@@ -724,10 +724,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
}
).set_env("LLAMA_ARG_NEXT_NODE_IP"));
add_opt(llama_arg(
{"--unload", "--unload-weight"},
format("whether to unload layer weights after use (default: %s)", params.unload ? "true" : "false"),
{"--prefetch"},
format("whether to prefetch layer weights (default: %s)", params.prefetch ? "true" : "false"),
[](gpt_params & params) {
params.unload = true;
params.prefetch = true;
}
).set_env("LLAMA_ARG_UNLOAD"));
add_opt(llama_arg(

View file

@@ -866,27 +866,29 @@ static bool assign_layers_to_device(
return true;
}
const device_info &master = dev_info_set[0];
std::vector<int> w(n_world, 0);
std::vector<int> n(n_world, 0);
std::vector<float> mem_budget(n_world, 0.0f);
// model-specific constants
const int n_embd_k_gqa = llama_model_n_embd_k_gqa(model);
const int n_embd_v_gqa = llama_model_n_embd_v_gqa(model);
const int n_vocab = llama_n_vocab(model);
const int n_kv = cparams.n_ctx;
const int64_t b = dev_info_set[0].model_bytes.nb_layer;
const int64_t bi = dev_info_set[0].model_bytes.nb_input;
const int64_t bo = dev_info_set[0].model_bytes.nb_output;
const int64_t b_prime = b + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;
#if defined(USE_HIGHS)
const device_info &master = dev_info_set[0];
const int n_vocab = llama_n_vocab(model);
const int64_t bi = dev_info_set[0].model_bytes.nb_input;
// device-specific constants
std::vector<float> alpha(n_world, 0.0f);
std::vector<float> beta(n_world, 0.0f);
std::vector<float> xi(n_world, 0.0f);
float kappa = 0.0f;
std::vector<int> w(n_world, 0);
std::vector<int> n(n_world, 0);
std::vector<float> mem_budget(n_world, 0.0f);
// -------- Compute alpha[m], beta[m], xi[m] --------
for (uint32_t m = 0; m < n_world; ++m) {
@@ -977,7 +979,6 @@ static bool assign_layers_to_device(
: std::min_element(mem_budget.begin(), mem_budget.end());
w[std::distance(mem_budget.begin(), device)] += diff;
#if defined(USE_HIGHS)
// stores the actual read bandwidth (GB/s) for each device
std::vector<float> disk_speed(n_world, 0.0f);
for (uint32_t m = 0; m < n_world; ++m) {
@@ -1012,6 +1013,7 @@ static bool assign_layers_to_device(
// M3: devices running on Linux or Android and with insufficient memory
// M4: devices with sufficient memory or very slow disk I/O (slower than min_disk_io_speed)
std::vector<uint32_t> M1, M2, M3, M4, M1_prev, M2_prev, M3_prev, M4_prev;
std::vector<bool> M4_force(n_world, false);
std::vector<int64_t> c_cpu(n_world, 0), c_gpu(n_world, 0);
// helper function to check if a device is in a specific set
@@ -1032,7 +1034,8 @@ static bool assign_layers_to_device(
bool is_windows = strcmp(dev.device_os, "Windows") == 0;
GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, dev.gpu_support.metal, m == 0, w[m] * k, n[m] * k);
bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);
int l_m = w[m] * k; // total number of layers assigned to device m
int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
@@ -1041,18 +1044,16 @@ static bool assign_layers_to_device(
bool condition3 = (l_m - l_m_gpu) * b_prime + (bi / n_vocab + bo) * int(m == 0) + c_cpu[m] > mem_budget[m] * GIGABYTE;
bool is_slow_disk = disk_speed[m] < min_disk_read_speed;
if (is_macos && !dev.gpu_support.metal && condition1 && !is_slow_disk) {
// case 1: macOS without Metal, and with insufficient memory
M1.push_back(m);
} else if (is_macos && dev.gpu_support.metal && condition2 && !is_slow_disk) {
// case 2: macOS with Metal, and with insufficient memory
M2.push_back(m);
} else if ((is_linux || is_android) && condition3 && !is_slow_disk) {
// case 3: Linux with insufficient memory
M3.push_back(m);
if (M4_force[m] || is_slow_disk) {
M4.push_back(m); // case 4: devices with very slow disk or force to be in M4
} else if (is_macos && !dev.gpu_support.metal && condition1) {
M1.push_back(m); // case 1: macOS without Metal, and with insufficient memory
} else if (is_macos && dev.gpu_support.metal && condition2) {
M2.push_back(m); // case 2: macOS with Metal, and with insufficient memory
} else if ((is_linux || is_android) && condition3) {
M3.push_back(m); // case 3: Linux with insufficient memory
} else {
// case 4: otherwise, assigned to M4
M4.push_back(m);
M4.push_back(m); // case 4: devices with sufficient memory
}
}
@@ -1253,7 +1254,8 @@ static bool assign_layers_to_device(
// constraint bound 4: CUDA/shared memory constraint for CUDA/Metal devices
for (uint32_t m = 0; m < n_world; ++m) {
model.lp_.row_upper_[constraint_idx] = W * vec_z_gpu[m];
double upper_bound = W * vec_z_gpu[m];
model.lp_.row_upper_[constraint_idx] = (upper_bound > 0) ? std::max(upper_bound, 1.0) : upper_bound;
constraint_idx++;
}
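Note (illustration only, not from the commit): the clamp above appears intended to keep the per-device GPU-layer bound satisfiable when a device has a small but nonzero GPU budget. A raw bound between 0 and 1 can only be met by an integer n_m of 0 GPU layers, so a positive bound is lifted to at least 1.0, while an exact 0 still forbids GPU layers. A sketch with made-up numbers:

```cpp
// Made-up values; shows only the effect of the clamp on row_upper_.
#include <algorithm>
#include <cstdio>

int main() {
    const double W = 32.0;                      // hypothetical total layer count
    const double z_gpu[3] = {0.0, 0.02, 0.5};   // hypothetical per-device GPU fractions
    for (double z : z_gpu) {
        double raw   = W * z;
        double bound = (raw > 0) ? std::max(raw, 1.0) : raw;
        printf("raw = %5.2f  ->  row_upper_ = %5.2f\n", raw, bound);
    }
    return 0;
}
```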
@@ -1359,6 +1361,44 @@ static bool assign_layers_to_device(
k, objective_value, vec_to_str(solution.col_value).c_str(), best_k, best_objective, vec_to_str(best_solution).c_str());
}
// check the solution
bool has_free_gpu_memory = false, has_overload = false;
for (uint32_t m = 0; m < n_world; ++m) {
uint32_t w_m = best_solution[m], n_m = best_solution[m + n_world];
// if there is still free GPU memory
if (n_m < static_cast<uint32_t>(std::round(W * vec_z_gpu[m]))) {
has_free_gpu_memory = true;
}
// if there is device overloaded
if (w_m > n_m) {
has_overload = true;
}
}
if (has_free_gpu_memory && has_overload) {
int worst_device = -1;
float worst_speed = std::numeric_limits<float>::max();
// find the device with slowest disk speed but was not in M4 yet
for (uint32_t m = 0; m < n_world; ++m) {
if (!in_set(m, M4) && disk_speed[m] < worst_speed) {
worst_speed = disk_speed[m];
worst_device = m;
}
}
if (worst_device != -1) {
M4_force[worst_device] = true;
LOG_INF("Forcing device %d (disk speed %.2f GB/s) into M4\n", worst_device, worst_speed);
} else {
LOG_INF("Infeasible solution detected but no device can be forced into M4\n");
}
continue;
}
// update w[m] and n[m]
GGML_ASSERT(best_solution.size() == n_world * 2 && "Invalid solution\n");
std::copy(best_solution.begin(), best_solution.begin() + n_world, w.begin());
@@ -1387,27 +1427,59 @@ static bool assign_layers_to_device(
LOG_INF(" - N Layer Window : %d\n", w[m]);
LOG_INF(" - N GPU Layers : %d\n", n[m]);
}
LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
LOG_INF("------------------------------------------");
// LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
// LOG_INF("------------------------------------------");
// copy value from w and n to n_layer_window and n_gpu_layers, respectively
std::copy(w.begin(), w.end(), n_layer_window);
std::copy(n.begin(), n.end(), n_gpu_layers);
#else
(void)bi;
(void)bo;
(void)kappa;
(void)cparams;
(void)min_disk_read_speed;
(void)n_vocab;
(void)GIGABYTE;
std::copy(w.begin(), w.end(), n_layer_window);
// assign layers according to RAM/VRAM
for (uint32_t m = 0; m < n_world; ++m) {
const device_info & dev = dev_info_set[m];
if (dev.gpu_support.metal || dev.gpu_support.cuda) {
mem_budget[m] = dev.gpu_props.memory_free;
} else {
mem_budget[m] = dev.memory.available_physical;
}
}
// initialize w_m proportionally to memory budget and n_m to 0
float total_mem_budget = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
for (uint32_t m = 0; m < n_world; ++m) {
w[m] = std::round(mem_budget[m] / total_mem_budget * n_layer);
n[m] = 0;
}
// adjust w[m] to ensure L mod W = 0
int diff = n_layer - std::accumulate(w.begin(), w.end(), 0);
auto device = (diff > 0) ? std::max_element(mem_budget.begin(), mem_budget.end())
: std::min_element(mem_budget.begin(), mem_budget.end());
w[std::distance(mem_budget.begin(), device)] += diff;
std::copy(w.begin(), w.end(), n_layer_window);
std::vector<float> vec_z_gpu(n_world, 0.0f);
std::vector<int64_t> c_cpu(n_world, 0), c_gpu(n_world, 0);
for (uint32_t m = 0; m < n_world; ++m) {
const device_info & dev = dev_info_set[m];
bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m], n[m]);
if (dev.gpu_support.cuda || dev.gpu_support.metal) {
n_gpu_layers[m] = w[m];
int64_t required_mem = w[m] * b_prime;
int64_t available_mem = dev.gpu_props.memory_free * GIGABYTE - c_gpu[m];
if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
available_mem -= bo;
}
if (required_mem <= available_mem) {
n_gpu_layers[m] = w[m];
} else {
n_gpu_layers[m] = available_mem / b_prime;
}
}
}
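Note (standalone sketch, not the repo's code): the non-HiGHS fallback above splits n_layer proportionally to each device's memory budget and then pushes the rounding remainder onto the device with the largest (or smallest) budget so the total matches exactly. The helper below reproduces just that step with illustrative names and values:

```cpp
// Proportional layer split with rounding correction (illustrative only).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <iterator>
#include <numeric>
#include <vector>

static std::vector<int> split_layers(int n_layer, const std::vector<float> & mem_budget) {
    const float total = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
    std::vector<int> w(mem_budget.size(), 0);
    for (size_t m = 0; m < mem_budget.size(); ++m) {
        w[m] = (int) std::round(mem_budget[m] / total * n_layer);
    }
    // fix the rounding remainder so the windows sum to exactly n_layer
    int diff = n_layer - std::accumulate(w.begin(), w.end(), 0);
    auto it  = (diff > 0) ? std::max_element(mem_budget.begin(), mem_budget.end())
                          : std::min_element(mem_budget.begin(), mem_budget.end());
    w[std::distance(mem_budget.begin(), it)] += diff;
    return w;
}

int main() {
    // e.g. 32 layers over devices with 24 GiB, 8 GiB and 16 GiB budgets
    std::vector<float> budget = {24.0f, 8.0f, 16.0f};
    std::vector<int> w = split_layers(32, budget);
    for (size_t m = 0; m < w.size(); ++m) printf("device %zu: %d layers\n", m, w[m]);
    return 0;
}
```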
@@ -1473,8 +1545,10 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
// get device profile
LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
dev_info.rank = params.rank;
llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
if (n_world > 1) {
llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
}
// create llama context
struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
llama_context * lctx = llama_new_context_with_model(model, cparams);
@@ -1714,7 +1788,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_world = params.n_world;
cparams.rank = params.rank;
cparams.unload = params.unload;
cparams.prefetch = params.prefetch;
cparams.keep_out_in_metal = params.keep_out_in_metal;
cparams.n_gpu_layers = params.n_gpu_layers;
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);

View file

@@ -147,7 +147,7 @@ struct gpt_params {
uint32_t n_layer_window[32] = {0}; // layer window size on each node
std::string master_ip = "localhost"; // ip address of the master node
std::string next_node_ip = "localhost"; // ip address of my next node
bool unload = false; // unload layer weights after use or not
bool prefetch = false; // prefetch layer weights
bool keep_out_in_metal = true; // whether to keep output weights in metal memory, true by default
int32_t gpu_mem = 999.0; // gpu memory to use, in GiB
int32_t n_predict = -1; // new tokens to predict

View file

@@ -521,9 +521,9 @@ static uint64_t device_host_physical_memory(bool available) {
// active pages compression has higher priority than releasing the clean mmap-ed pages
// some of the active pages can be compressed to save memory for our mmap-ed model weights
if (is_uma_arch()) {
// assume 30% of active pages can be compressed on macOS UMA (an empirical value)
// assume 10% of active pages can be compressed on macOS UMA (an empirical value)
// because GPU is more likely to use the inactive memory
memory += vm_stats.active_count * 0.3 * page_size;
memory += vm_stats.active_count * 0.1 * page_size;
} else {
// assume 50% of active pages can be compressed on macOS NUMA (an empirical value)
memory += vm_stats.active_count * 0.5 * page_size;
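Note (sketch, macOS only, not the commit's full device_host_physical_memory()): the compressible-active-pages term tuned above is derived from Mach VM statistics. Something like the following computes that term in isolation; the is_uma flag here is an assumption (the repo detects unified memory itself), and 0.1 / 0.5 are the empirical factors used in the diff.

```cpp
// Estimate memory reclaimable by compressing active pages (macOS, Mach host statistics).
#include <mach/mach.h>
#include <cstdint>
#include <cstdio>

int main() {
    mach_port_t host = mach_host_self();
    vm_size_t page_size = 0;
    host_page_size(host, &page_size);

    vm_statistics64_data_t vm_stats;
    mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
    if (host_statistics64(host, HOST_VM_INFO64, (host_info64_t)&vm_stats, &count) != KERN_SUCCESS) {
        return 1;
    }
    bool is_uma = true;                  // assumption: Apple Silicon unified memory
    double factor = is_uma ? 0.1 : 0.5;  // empirical compression factors from the diff
    uint64_t reclaimable = (uint64_t)(vm_stats.active_count * factor) * page_size;
    printf("estimated compressible active memory: %.2f GiB\n",
           reclaimable / (1024.0 * 1024.0 * 1024.0));
    return 0;
}
```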

View file

@@ -324,7 +324,7 @@ extern "C" {
uint32_t rank; // my rank
uint32_t n_layer_window[32];// number of layers to process in each compute
uint32_t n_gpu_layers; // number of layers to process on GPU
bool unload; // whether to unload layer weights after use
bool prefetch; // whether to prefetch layer weights
bool keep_out_in_metal; // whether to keep output weights in metal memory
char * master_ip; // ip address of the master node
char * next_node_ip; // ip address of the next node

View file

@@ -106,7 +106,7 @@
struct Timer {
const char * name;
int64_t start_time;
bool enable_timer = true;
bool enable_timer = false;
Timer(const char * name) : name(name), start_time(ggml_time_us()) {}
~Timer() {
if (enable_timer) {
@@ -2571,7 +2571,7 @@ struct llama_cparams {
uint32_t n_world;
uint32_t rank;
uint32_t n_layer_window[32];
bool unload;
bool prefetch;
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
uint32_t n_ubatch;
@@ -17829,14 +17829,12 @@ static bool is_tensor_loaded(struct ggml_tensor * tensor) {
// align addr
llama_mmap::align_range(&first, &last, page_size);
size_t len = std::max(last - first, static_cast<size_t>(page_size));
// calculate the number of pages to check
size_t page_count = (len + page_size - 1) / page_size;
size_t page_count = len / page_size;
#ifdef __APPLE__
char * mincore_res = new char[page_count];
#else
unsigned char *mincore_res = new unsigned char[page_count]; // use 'unsigned char' for Linux
unsigned char * mincore_res = new unsigned char[page_count]; // use 'unsigned char' for Linux
#endif
// call mincore to check if pages are resident in memory
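Note (self-contained sketch, not the repo's is_tensor_loaded()): the residency check above boils down to page-aligning a tensor's address range, calling mincore(), and counting how many pages are resident. The helper below shows that pattern, including the char vs unsigned char platform quirk the diff also touches:

```cpp
// Fraction of a buffer's pages currently resident in memory, via mincore().
#include <sys/mman.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <vector>

static double resident_fraction(const void * addr, size_t size) {
    const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
    size_t first = (size_t) addr & ~(page_size - 1);                              // round start down
    size_t last  = ((size_t) addr + size + page_size - 1) & ~(page_size - 1);     // round end up
    size_t len   = last - first;
    size_t pages = len / page_size;

#ifdef __APPLE__
    std::vector<char> vec(pages);           // macOS: mincore takes 'char *'
#else
    std::vector<unsigned char> vec(pages);  // Linux: mincore takes 'unsigned char *'
#endif
    if (mincore((void *) first, len, vec.data()) != 0) return -1.0;

    size_t resident = 0;
    for (size_t i = 0; i < pages; ++i) {
        if (vec[i] & 1) resident++;         // bit 0: page is resident
    }
    return (double) resident / (double) pages;
}

int main() {
    std::vector<uint8_t> buf(1 << 20, 42);  // 1 MiB we just touched, so it should be resident
    printf("resident: %.1f%%\n", resident_fraction(buf.data(), buf.size()) * 100.0);
    return 0;
}
```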
@@ -17865,13 +17863,20 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
if (strstr(cur->name, "weight") == nullptr || cur->data == nullptr) {
continue;
}
const char * backend_name = ggml_backend_buffer_name(cur->buffer);
if (backend_name) {
std::string lower_name(backend_name);
std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
[](unsigned char c) { return std::tolower(c); });
if (lower_name.find("cuda") != std::string::npos) continue;
}
if (is_tensor_loaded(cur)) n_loaded++;
n_total++;
}
return float(n_loaded) / float(n_total) * 100.0f;
}
static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool force = false) {
static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice) {
long page_size = sysconf(_SC_PAGESIZE);
struct Segment {
@@ -17882,10 +17887,19 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
for (int i = 0; i < ggml_graph_n_leafs(cgraph); i++) {
struct ggml_tensor * cur = ggml_graph_leaf(cgraph, i);
if (strstr(cur->name, "weight") == nullptr || cur->data == nullptr) {
continue;
}
const char * backend_name = ggml_backend_buffer_name(cur->buffer);
if (backend_name) {
std::string lower_name(backend_name);
std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
[](unsigned char c) { return std::tolower(c); });
if (lower_name.find("cuda") != std::string::npos) continue;
}
size_t size = ggml_nbytes(cur);
size_t first = reinterpret_cast<size_t>(cur->data);
size_t last = first + size;
@@ -17915,13 +17929,16 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
}
for (const auto & segment : merged_segments) {
size_t prefetch_dense = 4;
size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
// force to prefetch data
if (force && advice == POSIX_MADV_WILLNEED) {
// force to prefetch data, disabled by default
if (advice == POSIX_MADV_WILLNEED && false) {
volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
for (size_t off = 0; off < len; off += page_size) {
(void)ptr[off];
for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
for (size_t i = 0; i < prefetch_dense; i++) {
if (off + i * page_size < len) (void)ptr[off + i * page_size];
}
}
}
}
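Note (illustrative sketch, not the repo's manage_graph_tensors()): the prefetch path above is posix_madvise(POSIX_MADV_WILLNEED) on a page-aligned range, plus an optional loop that touches one byte per page in groups of prefetch_dense pages to force the read-ahead; the diff keeps that forcing loop disabled by default. A minimal standalone version:

```cpp
// WILLNEED hint plus optional page touching on a page-aligned buffer.
#include <sys/mman.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>

static void prefetch_range(void * start, size_t len, bool force_touch) {
    const size_t page_size      = (size_t) sysconf(_SC_PAGESIZE);
    const size_t prefetch_dense = 4;  // touch pages in groups of 4, as in the diff

    posix_madvise(start, len, POSIX_MADV_WILLNEED);  // hint: load into memory

    if (force_touch) {
        volatile char * ptr = (volatile char *) start;
        for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
            for (size_t i = 0; i < prefetch_dense; i++) {
                if (off + i * page_size < len) (void) ptr[off + i * page_size];
            }
        }
    }
}

int main() {
    const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
    const size_t len = 64 * page_size;
    void * buf = nullptr;
    if (posix_memalign(&buf, page_size, len) != 0) return 1;  // page-aligned range for madvise
    prefetch_range(buf, len, /*force_touch=*/true);
    printf("prefetched %zu bytes\n", len);
    free(buf);
    return 0;
}
```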
@@ -18193,17 +18210,13 @@ static int llama_decode_internal(
}
// overlap memory scheduling with other nodes' communication and computing
{
if (cparams.prefetch && n_world > 1) {
timer(manage_graph_tensors);
int next_gf_id = (i + 1) % gf.size();
manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, true);
manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED);
if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, true);
}
if (cparams.unload && n_world > 1) {
manage_graph_tensors(sub_gf, POSIX_MADV_DONTNEED);
manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED);
}
}
}
@@ -19926,7 +19939,7 @@ struct llama_context_params llama_context_default_params() {
/*.rank =*/ 0,
/*.n_layer_window =*/ {32},
/*.n_gpu_layers =*/ 0,
/*.unload =*/ false,
/*.prefetch =*/ false,
/*.keep_out_in_metal =*/ true,
/*.master_ip =*/ nullptr,
/*.next_node_ip =*/ nullptr,
@@ -20354,7 +20367,7 @@ void * llama_context_setup_backend(
auto & cparams = ctx->cparams;
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
cparams.unload = params.unload;
cparams.prefetch = params.prefetch;
cparams.n_seq_max = std::max(1u, params.n_seq_max);
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch;