Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-06 22:59:02 +00:00
Merge branch 'dev' into lt_test
Merge dev branch updates into local branch lt_test.
This commit is contained in: commit 7bf1b743fb
7 changed files with 147 additions and 65 deletions
Makefile (5)
@@ -274,15 +274,10 @@ endif
ifeq ($(USE_HIGHS),1)
HIGHS_CPPFLAGS = -isystem /usr/local/include/highs
HIGHS_LDFLAGS = -L/usr/local/lib -lhighs

ifeq ($(UNAME_S),Darwin)
HIGHS_CPPFLAGS += -isystem /opt/homebrew/include/highs
HIGHS_LDFLAGS += -L/opt/homebrew/lib -lhighs
else ifneq ($(CONDA_PREFIX),)
HIGHS_CPPFLAGS += -isystem $(CONDA_PREFIX)/include -isystem $(CONDA_PREFIX)/include/highs
HIGHS_LDFLAGS += -L$(CONDA_PREFIX)/lib -Wl,-rpath,$(CONDA_PREFIX)/lib
endif

MK_CPPFLAGS += $(HIGHS_CPPFLAGS) -DUSE_HIGHS
MK_LDFLAGS += $(HIGHS_LDFLAGS)
endif
@@ -724,10 +724,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
}
).set_env("LLAMA_ARG_NEXT_NODE_IP"));
add_opt(llama_arg(
{"--unload", "--unload-weight"},
format("whether to unload layer weights after use (default: %s)", params.unload ? "true" : "false"),
{"--prefetch"},
format("whether to prefetch layer weights (default: %s)", params.prefetch ? "true" : "false"),
[](gpt_params & params) {
params.unload = true;
params.prefetch = true;
}
).set_env("LLAMA_ARG_UNLOAD"));
add_opt(llama_arg(
@@ -866,27 +866,29 @@ static bool assign_layers_to_device(
return true;
}

const device_info &master = dev_info_set[0];
std::vector<int> w(n_world, 0);
std::vector<int> n(n_world, 0);
std::vector<float> mem_budget(n_world, 0.0f);

// model-specific constants
const int n_embd_k_gqa = llama_model_n_embd_k_gqa(model);
const int n_embd_v_gqa = llama_model_n_embd_v_gqa(model);
const int n_vocab = llama_n_vocab(model);
const int n_kv = cparams.n_ctx;

const int64_t b = dev_info_set[0].model_bytes.nb_layer;
const int64_t bi = dev_info_set[0].model_bytes.nb_input;
const int64_t bo = dev_info_set[0].model_bytes.nb_output;
const int64_t b_prime = b + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;

#if defined(USE_HIGHS)
const device_info &master = dev_info_set[0];
const int n_vocab = llama_n_vocab(model);
const int64_t bi = dev_info_set[0].model_bytes.nb_input;

// device-specific constants
std::vector<float> alpha(n_world, 0.0f);
std::vector<float> beta(n_world, 0.0f);
std::vector<float> xi(n_world, 0.0f);
float kappa = 0.0f;
std::vector<int> w(n_world, 0);
std::vector<int> n(n_world, 0);
std::vector<float> mem_budget(n_world, 0.0f);

// -------- Compute alpha[m], beta[m], xi[m] --------
for (uint32_t m = 0; m < n_world; ++m) {
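A note on the b_prime term above: it is the per-layer weight bytes plus that layer's share of the KV cache. The following minimal sketch reproduces the arithmetic with illustrative, assumed values (the real numbers come from the loaded model and cparams.n_ctx; the factor 2 is read here as 2-byte fp16 KV-cache elements):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // All values below are assumptions for illustration only.
        const int64_t b            = 400LL << 20; // ~400 MiB of weights per layer
        const int     n_embd_k_gqa = 1024;        // per-layer K width (GQA)
        const int     n_embd_v_gqa = 1024;        // per-layer V width (GQA)
        const int     n_kv         = 4096;        // KV cache length (cparams.n_ctx)

        // same formula as the diff: weights plus 2-byte K and V caches for n_kv positions
        const int64_t b_prime = b + 2LL * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;

        printf("b_prime = %lld bytes (%.1f MiB per layer)\n",
               (long long)b_prime, b_prime / (1024.0 * 1024.0));
        return 0;
    }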
@@ -977,7 +979,6 @@ static bool assign_layers_to_device(
: std::min_element(mem_budget.begin(), mem_budget.end());
w[std::distance(mem_budget.begin(), device)] += diff;

#if defined(USE_HIGHS)
// stores the actual read bandwidth (GB/s) for each device
std::vector<float> disk_speed(n_world, 0.0f);
for (uint32_t m = 0; m < n_world; ++m) {
@@ -1012,6 +1013,7 @@ static bool assign_layers_to_device(
// M3: devices running on Linux or Android and with insufficient memory
// M4: devices with sufficient memory or very slow disk I/O (slower than min_disk_io_speed)
std::vector<uint32_t> M1, M2, M3, M4, M1_prev, M2_prev, M3_prev, M4_prev;
std::vector<bool> M4_force(n_world, false);
std::vector<int64_t> c_cpu(n_world, 0), c_gpu(n_world, 0);

// helper function to check if a device is in a specific set
@@ -1032,7 +1034,8 @@ static bool assign_layers_to_device(
bool is_windows = strcmp(dev.device_os, "Windows") == 0;
GGML_ASSERT(!is_windows && "Windows is not tested yet\n");

llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, dev.gpu_support.metal, m == 0, w[m] * k, n[m] * k);
bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);

int l_m = w[m] * k; // total number of layers assigned to device m
int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
@@ -1041,18 +1044,16 @@ static bool assign_layers_to_device(
bool condition3 = (l_m - l_m_gpu) * b_prime + (bi / n_vocab + bo) * int(m == 0) + c_cpu[m] > mem_budget[m] * GIGABYTE;
bool is_slow_disk = disk_speed[m] < min_disk_read_speed;

if (is_macos && !dev.gpu_support.metal && condition1 && !is_slow_disk) {
// case 1: macOS without Metal, and with insufficient memory
M1.push_back(m);
} else if (is_macos && dev.gpu_support.metal && condition2 && !is_slow_disk) {
// case 2: macOS with Metal, and with insufficient memory
M2.push_back(m);
} else if ((is_linux || is_android) && condition3 && !is_slow_disk) {
// case 3: Linux with insufficient memory
M3.push_back(m);
if (M4_force[m] || is_slow_disk) {
M4.push_back(m); // case 4: devices with very slow disk or force to be in M4
} else if (is_macos && !dev.gpu_support.metal && condition1) {
M1.push_back(m); // case 1: macOS without Metal, and with insufficient memory
} else if (is_macos && dev.gpu_support.metal && condition2) {
M2.push_back(m); // case 2: macOS with Metal, and with insufficient memory
} else if ((is_linux || is_android) && condition3) {
M3.push_back(m); // case 3: Linux with insufficient memory
} else {
// case 4: otherwise, assigned to M4
M4.push_back(m);
M4.push_back(m); // case 4: devices with sufficient memory
}
}
@@ -1253,7 +1254,8 @@ static bool assign_layers_to_device(

// constraint bound 4: CUDA/shared memory constraint for CUDA/Metal devices
for (uint32_t m = 0; m < n_world; ++m) {
model.lp_.row_upper_[constraint_idx] = W * vec_z_gpu[m];
double upper_bound = W * vec_z_gpu[m];
model.lp_.row_upper_[constraint_idx] = (upper_bound > 0) ? std::max(upper_bound, 1.0) : upper_bound;
constraint_idx++;
}
@@ -1359,6 +1361,44 @@ static bool assign_layers_to_device(
k, objective_value, vec_to_str(solution.col_value).c_str(), best_k, best_objective, vec_to_str(best_solution).c_str());
}

// check the solution
bool has_free_gpu_memory = false, has_overload = false;
for (uint32_t m = 0; m < n_world; ++m) {
uint32_t w_m = best_solution[m], n_m = best_solution[m + n_world];

// if there is still free GPU memory
if (n_m < static_cast<uint32_t>(std::round(W * vec_z_gpu[m]))) {
has_free_gpu_memory = true;
}

// if there is device overloaded
if (w_m > n_m) {
has_overload = true;
}
}

if (has_free_gpu_memory && has_overload) {
int worst_device = -1;
float worst_speed = std::numeric_limits<float>::max();

// find the device with slowest disk speed but was not in M4 yet
for (uint32_t m = 0; m < n_world; ++m) {
if (!in_set(m, M4) && disk_speed[m] < worst_speed) {
worst_speed = disk_speed[m];
worst_device = m;
}
}

if (worst_device != -1) {
M4_force[worst_device] = true;
LOG_INF("Forcing device %d (disk speed %.2f GB/s) into M4\n", worst_device, worst_speed);
} else {
LOG_INF("Infeasible solution detected but no device can be forced into M4\n");
}

continue;
}

// update w[m] and n[m]
GGML_ASSERT(best_solution.size() == n_world * 2 && "Invalid solution\n");
std::copy(best_solution.begin(), best_solution.begin() + n_world, w.begin());
@@ -1387,27 +1427,59 @@ static bool assign_layers_to_device(
LOG_INF(" - N Layer Window : %d\n", w[m]);
LOG_INF(" - N GPU Layers : %d\n", n[m]);
}
LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
LOG_INF("------------------------------------------");
// LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
// LOG_INF("------------------------------------------");

// copy value from w and n to n_layer_window and n_gpu_layers, respectively
std::copy(w.begin(), w.end(), n_layer_window);
std::copy(n.begin(), n.end(), n_gpu_layers);

#else
(void)bi;
(void)bo;
(void)kappa;
(void)cparams;
(void)min_disk_read_speed;
(void)n_vocab;
(void)GIGABYTE;

std::copy(w.begin(), w.end(), n_layer_window);
// assign layers according to RAM/VRAM
for (uint32_t m = 0; m < n_world; ++m) {
const device_info & dev = dev_info_set[m];
if (dev.gpu_support.metal || dev.gpu_support.cuda) {
mem_budget[m] = dev.gpu_props.memory_free;
} else {
mem_budget[m] = dev.memory.available_physical;
}
}

// initialize w_m proportionally to memory budget and n_m to 0
float total_mem_budget = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
for (uint32_t m = 0; m < n_world; ++m) {
w[m] = std::round(mem_budget[m] / total_mem_budget * n_layer);
n[m] = 0;
}
// adjust w[m] to ensure L mod W = 0
int diff = n_layer - std::accumulate(w.begin(), w.end(), 0);
auto device = (diff > 0) ? std::max_element(mem_budget.begin(), mem_budget.end())
: std::min_element(mem_budget.begin(), mem_budget.end());
w[std::distance(mem_budget.begin(), device)] += diff;
std::copy(w.begin(), w.end(), n_layer_window);

std::vector<float> vec_z_gpu(n_world, 0.0f);
std::vector<int64_t> c_cpu(n_world, 0), c_gpu(n_world, 0);

for (uint32_t m = 0; m < n_world; ++m) {
const device_info & dev = dev_info_set[m];
bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m], n[m]);

if (dev.gpu_support.cuda || dev.gpu_support.metal) {
n_gpu_layers[m] = w[m];
int64_t required_mem = w[m] * b_prime;
int64_t available_mem = dev.gpu_props.memory_free * GIGABYTE - c_gpu[m];
if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
available_mem -= bo;
}

if (required_mem <= available_mem) {
n_gpu_layers[m] = w[m];
} else {
n_gpu_layers[m] = available_mem / b_prime;
}
}
}
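The non-HiGHS fallback above splits the n_layer layers in proportion to each device's memory budget, then pushes the rounding remainder onto the device with the largest budget (or the smallest one when the remainder is negative). A self-contained sketch of just that allocation rule, using made-up budgets, is:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <iterator>
    #include <numeric>
    #include <vector>

    int main() {
        const int n_layer = 32;                            // total transformer layers (assumed)
        std::vector<float> mem_budget = {24.f, 8.f, 4.f};  // assumed free GiB per device
        std::vector<int>   w(mem_budget.size(), 0);        // layer-window sizes to fill in

        // proportional split, rounded per device (mirrors the fallback in the diff)
        const float total = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
        for (size_t m = 0; m < w.size(); ++m) {
            w[m] = (int)std::round(mem_budget[m] / total * n_layer);
        }

        // push the rounding remainder onto the largest (or smallest) budget
        const int diff = n_layer - std::accumulate(w.begin(), w.end(), 0);
        auto device = (diff > 0) ? std::max_element(mem_budget.begin(), mem_budget.end())
                                 : std::min_element(mem_budget.begin(), mem_budget.end());
        w[std::distance(mem_budget.begin(), device)] += diff;

        for (size_t m = 0; m < w.size(); ++m) {
            printf("device %zu: %d layers\n", m, w[m]);
        }
        return 0;
    }

With budgets of 24, 8 and 4 GiB and 32 layers this yields 21, 7 and 4 layers.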
@@ -1473,8 +1545,10 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
// get device profile
LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
dev_info.rank = params.rank;
llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);

if (n_world > 1) {
llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
}

// create llama context
struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
llama_context * lctx = llama_new_context_with_model(model, cparams);
@@ -1714,7 +1788,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param

cparams.n_world = params.n_world;
cparams.rank = params.rank;
cparams.unload = params.unload;
cparams.prefetch = params.prefetch;
cparams.keep_out_in_metal = params.keep_out_in_metal;
cparams.n_gpu_layers = params.n_gpu_layers;
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
@@ -147,7 +147,7 @@ struct gpt_params {
uint32_t n_layer_window[32] = {0}; // layer window size on each node
std::string master_ip = "localhost"; // ip address of the master node
std::string next_node_ip = "localhost"; // ip address of my next node
bool unload = false; // unload layer weights after use or not
bool prefetch = false; // prefetch layer weights
bool keep_out_in_metal = true; // whether to keep output weights in metal memory, true by default
int32_t gpu_mem = 999.0; // gpu memory to use, in GiB
int32_t n_predict = -1; // new tokens to predict
@@ -521,9 +521,9 @@ static uint64_t device_host_physical_memory(bool available) {
// active pages compression has higher priority than releasing the clean mmap-ed pages
// some of the active pages can be compressed to save memory for our mmap-ed model weights
if (is_uma_arch()) {
// assume 30% of active pages can be compressed on macOS UMA (an empirical value)
// assume 10% of active pages can be compressed on macOS UMA (an empirical value)
// because GPU is more likely to use the inactive memory
memory += vm_stats.active_count * 0.3 * page_size;
memory += vm_stats.active_count * 0.1 * page_size;
} else {
// assume 50% of active pages can be compressed on macOS NUMA (an empirical value)
memory += vm_stats.active_count * 0.5 * page_size;
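The hunk above only adjusts the compressible-active-pages term of the availability estimate (10% of active pages on UMA Macs instead of 30%, with 50% kept for the non-UMA path). The toy calculation below shows how that term scales; the page counts and the base term standing in for the parts of the estimate not shown in this hunk are assumptions:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Illustrative, assumed numbers; only the reclaim ratios come from the diff.
        const uint64_t page_size    = 16 * 1024; // 16 KiB pages, typical for Apple silicon
        const uint64_t free_count   = 100000;    // placeholder for the terms not in this hunk
        const uint64_t active_count = 400000;    // active pages, partly compressible
        const bool     is_uma       = true;      // unified-memory (Apple silicon) machine

        // 10% of active pages are treated as reclaimable on UMA, 50% otherwise
        const double reclaim_ratio = is_uma ? 0.1 : 0.5;

        uint64_t memory = free_count * page_size;
        memory += (uint64_t)(active_count * reclaim_ratio * page_size);

        printf("estimated available memory: %.2f GiB\n",
               memory / (1024.0 * 1024.0 * 1024.0));
        return 0;
    }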
@@ -324,7 +324,7 @@ extern "C" {
uint32_t rank; // my rank
uint32_t n_layer_window[32];// number of layers to process in each compute
uint32_t n_gpu_layers; // number of layers to process on GPU
bool unload; // whether to unload layer weights after use
bool prefetch; // whether to prefetch layer weights
bool keep_out_in_metal; // whether to keep output weights in metal memory
char * master_ip; // ip address of the master node
char * next_node_ip; // ip address of the next node
@@ -106,7 +106,7 @@
struct Timer {
const char * name;
int64_t start_time;
bool enable_timer = true;
bool enable_timer = false;
Timer(const char * name) : name(name), start_time(ggml_time_us()) {}
~Timer() {
if (enable_timer) {
@@ -2571,7 +2571,7 @@ struct llama_cparams {
uint32_t n_world;
uint32_t rank;
uint32_t n_layer_window[32];
bool unload;
bool prefetch;
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
uint32_t n_ubatch;
@@ -17829,14 +17829,12 @@ static bool is_tensor_loaded(struct ggml_tensor * tensor) {
// align addr
llama_mmap::align_range(&first, &last, page_size);
size_t len = std::max(last - first, static_cast<size_t>(page_size));

// calculate the number of pages to check
size_t page_count = (len + page_size - 1) / page_size;
size_t page_count = len / page_size;

#ifdef __APPLE__
char * mincore_res = new char[page_count];
#else
unsigned char *mincore_res = new unsigned char[page_count]; // use 'unsigned char' for Linux
unsigned char * mincore_res = new unsigned char[page_count]; // use 'unsigned char' for Linux
#endif

// call mincore to check if pages are resident in memory
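For context, is_tensor_loaded relies on mincore() to test whether the pages backing a tensor are resident. A standalone sketch of such a residency check, written for Linux where the result vector is unsigned char as the diff's comment notes (macOS declares it as char), could look like this:

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <vector>

    // Fraction of [addr, addr+len) currently resident in physical memory.
    static double resident_fraction(const void * addr, size_t len) {
        const size_t page_size = (size_t)sysconf(_SC_PAGESIZE);

        // align the start down to a page boundary and pad the length to cover it
        const uintptr_t first       = (uintptr_t)addr & ~(uintptr_t)(page_size - 1);
        const size_t    aligned_len = len + ((uintptr_t)addr - first);
        const size_t    page_count  = (aligned_len + page_size - 1) / page_size;

        std::vector<unsigned char> vec(page_count); // 'unsigned char' on Linux
        if (mincore((void *)first, aligned_len, vec.data()) != 0) {
            return 0.0; // treat failure as "nothing resident"
        }

        size_t n_resident = 0;
        for (unsigned char v : vec) n_resident += (v & 1); // LSB set means resident
        return (double)n_resident / (double)page_count;
    }

    int main() {
        const size_t len = 1 << 20;  // 1 MiB scratch buffer
        void * buf = malloc(len);
        memset(buf, 1, len);         // touch the pages so they become resident
        printf("resident: %.0f%%\n", resident_fraction(buf, len) * 100.0);
        free(buf);
        return 0;
    }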
@@ -17865,13 +17863,20 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
if (strstr(cur->name, "weight") == nullptr || cur->data == nullptr) {
continue;
}
const char * backend_name = ggml_backend_buffer_name(cur->buffer);
if (backend_name) {
std::string lower_name(backend_name);
std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
[](unsigned char c) { return std::tolower(c); });
if (lower_name.find("cuda") != std::string::npos) continue;
}
if (is_tensor_loaded(cur)) n_loaded++;
n_total++;
}
return float(n_loaded) / float(n_total) * 100.0f;
}

static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool force = false) {
static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice) {
long page_size = sysconf(_SC_PAGESIZE);

struct Segment {
@@ -17882,10 +17887,19 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f

for (int i = 0; i < ggml_graph_n_leafs(cgraph); i++) {
struct ggml_tensor * cur = ggml_graph_leaf(cgraph, i);

if (strstr(cur->name, "weight") == nullptr || cur->data == nullptr) {
continue;
}

const char * backend_name = ggml_backend_buffer_name(cur->buffer);
if (backend_name) {
std::string lower_name(backend_name);
std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
[](unsigned char c) { return std::tolower(c); });
if (lower_name.find("cuda") != std::string::npos) continue;
}

size_t size = ggml_nbytes(cur);
size_t first = reinterpret_cast<size_t>(cur->data);
size_t last = first + size;
@@ -17915,13 +17929,16 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
}

for (const auto & segment : merged_segments) {
size_t prefetch_dense = 4;
size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
// force to prefetch data
if (force && advice == POSIX_MADV_WILLNEED) {
// force to prefetch data, disabled by default
if (advice == POSIX_MADV_WILLNEED && false) {
volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
for (size_t off = 0; off < len; off += page_size) {
(void)ptr[off];
for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
for (size_t i = 0; i < prefetch_dense; i++) {
if (off + i * page_size < len) (void)ptr[off + i * page_size];
}
}
}
}
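The change above keeps posix_madvise(POSIX_MADV_WILLNEED) as the readahead hint and reworks the forced page-touch loop to walk the range in groups of prefetch_dense pages, while the '&& false' guard disables the forced touch by default. A stripped-down sketch of that pattern over a single mapped region, with the force switch kept as an explicit parameter, is:

    #include <cstddef>
    #include <sys/mman.h>
    #include <unistd.h>

    // Hint the kernel to page in [start, start+len); optionally touch the pages
    // in groups of prefetch_dense to force the read immediately.
    static void prefetch_region(void * start, size_t len, bool force) {
        const size_t page_size      = (size_t)sysconf(_SC_PAGESIZE);
        const size_t prefetch_dense = 4;

        posix_madvise(start, len, POSIX_MADV_WILLNEED); // asynchronous readahead hint

        if (force) {
            volatile char * ptr = (volatile char *)start;
            for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
                for (size_t i = 0; i < prefetch_dense; i++) {
                    if (off + i * page_size < len) (void)ptr[off + i * page_size];
                }
            }
        }
    }

    int main() {
        const size_t len = 8u << 20; // 8 MiB anonymous mapping, stand-in for a weight segment
        void * buf = mmap(nullptr, len, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf != MAP_FAILED) {
            prefetch_region(buf, len, /*force=*/true);
            munmap(buf, len);
        }
        return 0;
    }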
@@ -18193,17 +18210,13 @@ static int llama_decode_internal(
}

// overlap memory scheduling with other nodes' communication and computing
{
if (cparams.prefetch && n_world > 1) {
timer(manage_graph_tensors);

int next_gf_id = (i + 1) % gf.size();
manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, true);
manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED);
if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, true);
}

if (cparams.unload && n_world > 1) {
manage_graph_tensors(sub_gf, POSIX_MADV_DONTNEED);
manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED);
}
}
}
@@ -19926,7 +19939,7 @@ struct llama_context_params llama_context_default_params() {
/*.rank =*/ 0,
/*.n_layer_window =*/ {32},
/*.n_gpu_layers =*/ 0,
/*.unload =*/ false,
/*.prefetch =*/ false,
/*.keep_out_in_metal =*/ true,
/*.master_ip =*/ nullptr,
/*.next_node_ip =*/ nullptr,
@@ -20354,7 +20367,7 @@ void * llama_context_setup_backend(
auto & cparams = ctx->cparams;

std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
cparams.unload = params.unload;
cparams.prefetch = params.prefetch;
cparams.n_seq_max = std::max(1u, params.n_seq_max);
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch;