mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-05 23:49:04 +00:00
synchronize device info
This commit is contained in:
parent
ef7fdf70cc
commit
53cb3a6069
5 changed files with 408 additions and 73 deletions
|
@ -833,10 +833,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
}
|
||||
|
||||
// profile devices and determine the best setup
|
||||
device_info dev_info;
|
||||
llama_profile_device(&dev_info, model, params.model.c_str());
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||
return iparams;
|
||||
|
@ -866,17 +862,35 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|||
}
|
||||
}
|
||||
|
||||
auto cparams = llama_context_params_from_gpt_params(params);
|
||||
// get device profile
|
||||
device_info dev_info;
|
||||
dev_info.rank = params.rank;
|
||||
llama_profile_device(&dev_info, model, params.model.c_str());
|
||||
|
||||
// create llama context
|
||||
struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
||||
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
|
||||
// initialize sockets
|
||||
llama_init_sockets(lctx, cparams.n_world, cparams.rank);
|
||||
|
||||
// synchronize device profile to the master node
|
||||
struct device_info * dev_info_set = nullptr;
|
||||
if (params.rank == 0) {
|
||||
dev_info_set = (struct device_info *)malloc(cparams.n_world * sizeof(struct device_info));
|
||||
dev_info_set[0] = dev_info;
|
||||
llama_collect_device_info(dev_info_set, lctx);
|
||||
device_print_props(dev_info_set, cparams.n_world);
|
||||
} else {
|
||||
llama_send_device_info(&dev_info, lctx);
|
||||
}
|
||||
|
||||
if (llama_context_setup_backend(lctx) == nullptr) {
|
||||
LOG_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str());
|
||||
llama_free_model(model);
|
||||
return iparams;
|
||||
}
|
||||
|
||||
llama_init_sockets(lctx, cparams.n_world, cparams.rank);
|
||||
|
||||
if (!params.control_vectors.empty()) {
|
||||
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
|
||||
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
|
||||
|
|
|
@ -300,8 +300,6 @@ int device_has_sycl(void) {
|
|||
return ggml_cpu_has_sycl();
|
||||
}
|
||||
|
||||
// ggml_backend_buffer_type_t llama_dev_buffer_type(const llama_model * model, int device)
|
||||
|
||||
void device_get_props(struct llama_model * model, int device, struct ggml_backend_dev_props * props) {
|
||||
ggml_backend_buffer_type_t buft_type;
|
||||
if (device == -1) { // type cpu
|
||||
|
@ -312,3 +310,282 @@ void device_get_props(struct llama_model * model, int device, struct ggml_backen
|
|||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft_type);
|
||||
ggml_backend_dev_get_props(dev, props);
|
||||
}
|
||||
|
||||
void device_print_props(struct device_info * dev_info_set, int n) {
|
||||
LOG_INF("\n-------------------------------------------------------------------------------------------\n");
|
||||
LOG_INF("| Property ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| Rank %-8d", i);
|
||||
GGML_ASSERT(dev_info_set[i].rank == i);
|
||||
}
|
||||
LOG_INF("\n-------------------------------------------------------------------------------------------\n");
|
||||
|
||||
LOG_INF("| Device Name ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.10s ", dev_info_set[i].device_name);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU Name ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.name);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU Description ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.description);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Number of CPU cores ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10u ", dev_info_set[i].cpu_props.cores);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Physical Mem Total (GB) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Physical Mem Available (GB) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_physical);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Swap Mem Total (GB) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_swap);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Swap Mem Available (GB) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_swap);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Mem Bandwidth (GB/s) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].memory.bandwidth);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Disk Read Bandwidth (GB/s) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].disk_read_bandwidth);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| GPU Metal ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10d ", dev_info_set[i].gpu_support.metal);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| GPU CUDA ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10d ", dev_info_set[i].gpu_support.cuda);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| GPU Vulkan ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10d ", dev_info_set[i].gpu_support.vulkan);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| GPU Kompute ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10d ", dev_info_set[i].gpu_support.kompute);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| GPU BLAS ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10d ", dev_info_set[i].gpu_support.gpublas);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| BLAS ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10d ", dev_info_set[i].gpu_support.blas);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| SYCL ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10d ", dev_info_set[i].gpu_support.sycl);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| GPU Name ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.name);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| GPU Description ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.description);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| GPU Mem Free (GB) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_free);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| GPU Mem Total (GB) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_total);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("-------------------------------------------------------------------------------------------\n\n");
|
||||
}
|
||||
|
||||
|
||||
size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
||||
// calculate total size for serialized buffer
|
||||
size_t device_name_len = strlen(dev_info->device_name) + 1;
|
||||
size_t cpu_name_len = strlen(dev_info->cpu_props.name) + 1;
|
||||
size_t cpu_description_len = strlen(dev_info->cpu_props.description) + 1;
|
||||
size_t gpu_name_len = strlen(dev_info->gpu_props.name) + 1;
|
||||
size_t gpu_description_len = strlen(dev_info->gpu_props.description) + 1;
|
||||
|
||||
size_t total_size = sizeof(uint32_t)
|
||||
+ sizeof(size_t) * 5 // for lengths of strings
|
||||
+ device_name_len
|
||||
+ cpu_name_len
|
||||
+ cpu_description_len
|
||||
+ gpu_name_len
|
||||
+ gpu_description_len
|
||||
+ sizeof(float) // disk_read_bandwidth
|
||||
+ sizeof(uint32_t) // cpu_props.cores
|
||||
+ sizeof(struct memory_info)
|
||||
+ sizeof(struct gpu_support)
|
||||
+ sizeof(float) * 2; // gpu_props.memory_free and gpu_props.memory_total
|
||||
|
||||
*buffer = (char *)malloc(total_size);
|
||||
char * ptr = *buffer;
|
||||
|
||||
// rank
|
||||
memcpy(ptr, &dev_info->rank, sizeof(uint32_t));
|
||||
ptr += sizeof(uint32_t);
|
||||
|
||||
// copy string lengths and string data
|
||||
memcpy(ptr, &device_name_len, sizeof(size_t));
|
||||
ptr += sizeof(size_t);
|
||||
memcpy(ptr, dev_info->device_name, device_name_len);
|
||||
ptr += device_name_len;
|
||||
|
||||
memcpy(ptr, &cpu_name_len, sizeof(size_t));
|
||||
ptr += sizeof(size_t);
|
||||
memcpy(ptr, dev_info->cpu_props.name, cpu_name_len);
|
||||
ptr += cpu_name_len;
|
||||
|
||||
memcpy(ptr, &cpu_description_len, sizeof(size_t));
|
||||
ptr += sizeof(size_t);
|
||||
memcpy(ptr, dev_info->cpu_props.description, cpu_description_len);
|
||||
ptr += cpu_description_len;
|
||||
|
||||
memcpy(ptr, &gpu_name_len, sizeof(size_t));
|
||||
ptr += sizeof(size_t);
|
||||
memcpy(ptr, dev_info->gpu_props.name, gpu_name_len);
|
||||
ptr += gpu_name_len;
|
||||
|
||||
memcpy(ptr, &gpu_description_len, sizeof(size_t));
|
||||
ptr += sizeof(size_t);
|
||||
memcpy(ptr, dev_info->gpu_props.description, gpu_description_len);
|
||||
ptr += gpu_description_len;
|
||||
|
||||
// copy the non-string members
|
||||
memcpy(ptr, &dev_info->disk_read_bandwidth, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t));
|
||||
ptr += sizeof(uint32_t);
|
||||
|
||||
memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
|
||||
ptr += sizeof(struct memory_info);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_support, sizeof(struct gpu_support));
|
||||
ptr += sizeof(struct gpu_support);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.memory_free, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float));
|
||||
|
||||
return total_size;
|
||||
}
|
||||
|
||||
// Reconstruct a device_info from a buffer produced by serialize().
// The read order must mirror serialize() exactly. All five string fields
// are heap-allocated here; ownership passes to the caller, who must free()
// them (via the const-cast, since the struct declares them const char *).
//
// NOTE(review): string lengths read from the wire are trusted without any
// bounds validation, and malloc results are unchecked — this assumes a
// well-formed message from a trusted peer with identical sizeof(size_t),
// float format and struct layout. Confirm all peers are same-arch builds.
void deserialize(const char * buffer, struct device_info * dev_info) {
    const char * ptr = buffer;

    // rank
    memcpy(&dev_info->rank, ptr, sizeof(uint32_t));
    ptr += sizeof(uint32_t);

    // device_name: size_t length (includes NUL), then the bytes
    size_t device_name_len;
    memcpy(&device_name_len, ptr, sizeof(size_t));
    ptr += sizeof(size_t);
    dev_info->device_name = (char *)malloc(device_name_len);
    memcpy((void *)dev_info->device_name, ptr, device_name_len);
    ptr += device_name_len;

    // cpu_props.name
    size_t cpu_name_len;
    memcpy(&cpu_name_len, ptr, sizeof(size_t));
    ptr += sizeof(size_t);
    dev_info->cpu_props.name = (char *)malloc(cpu_name_len);
    memcpy((void *)dev_info->cpu_props.name, ptr, cpu_name_len);
    ptr += cpu_name_len;

    // cpu_props.description
    size_t cpu_description_len;
    memcpy(&cpu_description_len, ptr, sizeof(size_t));
    ptr += sizeof(size_t);
    dev_info->cpu_props.description = (char *)malloc(cpu_description_len);
    memcpy((void *)dev_info->cpu_props.description, ptr, cpu_description_len);
    ptr += cpu_description_len;

    // gpu_props.name
    size_t gpu_name_len;
    memcpy(&gpu_name_len, ptr, sizeof(size_t));
    ptr += sizeof(size_t);
    dev_info->gpu_props.name = (char *)malloc(gpu_name_len);
    memcpy((void *)dev_info->gpu_props.name, ptr, gpu_name_len);
    ptr += gpu_name_len;

    // gpu_props.description
    size_t gpu_description_len;
    memcpy(&gpu_description_len, ptr, sizeof(size_t));
    ptr += sizeof(size_t);
    dev_info->gpu_props.description = (char *)malloc(gpu_description_len);
    memcpy((void *)dev_info->gpu_props.description, ptr, gpu_description_len);
    ptr += gpu_description_len;

    // fixed-size members, in the same order serialize() wrote them
    memcpy(&dev_info->disk_read_bandwidth, ptr, sizeof(float));
    ptr += sizeof(float);

    memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t));
    ptr += sizeof(uint32_t);

    memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
    ptr += sizeof(struct memory_info);

    memcpy(&dev_info->gpu_support, ptr, sizeof(struct gpu_support));
    ptr += sizeof(struct gpu_support);

    memcpy(&dev_info->gpu_props.memory_free, ptr, sizeof(float));
    ptr += sizeof(float);
    memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float));
}
|
||||
|
|
|
@ -35,6 +35,7 @@ struct gpu_props {
|
|||
};
|
||||
|
||||
struct device_info {
|
||||
uint32_t rank;
|
||||
const char * device_name;
|
||||
float disk_read_bandwidth; // in GB/s
|
||||
struct cpu_props cpu_props;
|
||||
|
@ -51,6 +52,7 @@ uint64_t device_swap_memory (bool available);
|
|||
uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb);
|
||||
uint64_t device_memory_bw (size_t buffer_size_mb);
|
||||
void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
|
||||
void device_print_props (struct device_info * dev_info_set, int n);
|
||||
|
||||
int device_has_metal (void);
|
||||
int device_has_cuda (void);
|
||||
|
@ -60,4 +62,7 @@ int device_has_gpublas(void);
|
|||
int device_has_blas (void);
|
||||
int device_has_sycl (void);
|
||||
|
||||
size_t serialize (const struct device_info * dev_info, char ** buffer);
|
||||
void deserialize(const char * buffer, struct device_info * dev_info);
|
||||
|
||||
#endif // PROFILER_H
|
||||
|
|
|
@ -434,11 +434,14 @@ extern "C" {
|
|||
|
||||
LLAMA_API void llama_init_sockets (struct llama_context * ctx, uint32_t n_world, uint32_t my_rank);
|
||||
LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg);
|
||||
LLAMA_API int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx);
|
||||
LLAMA_API int llama_send_device_info (struct device_info * dev_info, struct llama_context * ctx);
|
||||
|
||||
// TODO: rename to llama_init_from_model
|
||||
LLAMA_API struct llama_context * llama_new_context_with_model(
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params);
|
||||
LLAMA_API void * llama_context_setup_backend(struct llama_context * ctx);
|
||||
|
||||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
|
116
src/llama.cpp
116
src/llama.cpp
|
@ -2572,6 +2572,8 @@ struct llama_cparams {
|
|||
uint32_t n_layer_window[32];
|
||||
bool unload;
|
||||
uint32_t n_ctx; // context size used during inference
|
||||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
uint32_t n_batch;
|
||||
uint32_t n_ubatch;
|
||||
uint32_t n_seq_max;
|
||||
|
@ -3579,39 +3581,6 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, co
|
|||
dev_info->gpu_props.description = gpu_props.description;
|
||||
dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
|
||||
dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
|
||||
|
||||
LLAMA_LOG_INFO("\n");
|
||||
LLAMA_LOG_INFO("Device Info:\n");
|
||||
LLAMA_LOG_INFO(" Device Name : %s\n", dev_info->device_name);
|
||||
LLAMA_LOG_INFO(" CPU Name : %s\n", dev_info->cpu_props.name);
|
||||
LLAMA_LOG_INFO(" CPU Description : %s\n", dev_info->cpu_props.description);
|
||||
LLAMA_LOG_INFO(" Number of CPU cores : %u\n", dev_info->cpu_props.cores);
|
||||
LLAMA_LOG_INFO(" Disk Read Bandwidth : %.2f GB/s\n", dev_info->disk_read_bandwidth);
|
||||
LLAMA_LOG_INFO("\n");
|
||||
|
||||
LLAMA_LOG_INFO("Memory Information:\n");
|
||||
LLAMA_LOG_INFO(" Physical Mem Total : %.2f GB\n", dev_info->memory.total_physical);
|
||||
LLAMA_LOG_INFO(" Physical Mem Available : %.2f GB\n", dev_info->memory.available_physical);
|
||||
LLAMA_LOG_INFO(" Swap Memory Total : %.2f GB\n", dev_info->memory.total_swap);
|
||||
LLAMA_LOG_INFO(" Swap Memory Available : %.2f GB\n", dev_info->memory.available_swap);
|
||||
LLAMA_LOG_INFO(" Mem Bandwidth : %.2f GB/s\n", dev_info->memory.bandwidth);
|
||||
LLAMA_LOG_INFO("\n");
|
||||
|
||||
LLAMA_LOG_INFO("GPU Support:\n");
|
||||
LLAMA_LOG_INFO(" Metal : %i\n", dev_info->gpu_support.metal);
|
||||
LLAMA_LOG_INFO(" CUDA : %i\n", dev_info->gpu_support.cuda);
|
||||
LLAMA_LOG_INFO(" Vulkan : %i\n", dev_info->gpu_support.vulkan);
|
||||
LLAMA_LOG_INFO(" Kompute : %i\n", dev_info->gpu_support.kompute);
|
||||
LLAMA_LOG_INFO(" GPU BLAS : %i\n", dev_info->gpu_support.gpublas);
|
||||
LLAMA_LOG_INFO(" BLAS : %i\n", dev_info->gpu_support.blas);
|
||||
LLAMA_LOG_INFO(" SYCL : %i\n", dev_info->gpu_support.sycl);
|
||||
LLAMA_LOG_INFO("\n");
|
||||
|
||||
LLAMA_LOG_INFO("GPU Properties:\n");
|
||||
LLAMA_LOG_INFO(" GPU Name : %s\n", dev_info->gpu_props.name);
|
||||
LLAMA_LOG_INFO(" Description : %s\n", dev_info->gpu_props.description);
|
||||
LLAMA_LOG_INFO(" Memory Free : %.2f GB\n", dev_info->gpu_props.memory_free);
|
||||
LLAMA_LOG_INFO(" Memory Total : %.2f GB\n", dev_info->gpu_props.memory_total);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
|
||||
|
@ -19815,6 +19784,63 @@ void llama_init_sockets(struct llama_context * ctx, uint32_t n_world, uint32_t m
|
|||
}
|
||||
}
|
||||
|
||||
int llama_collect_device_info(struct device_info * dev_info_set, struct llama_context * ctx) {
|
||||
uint32_t n_world = ctx->cparams.n_world;
|
||||
if (n_world == 1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
GGML_ASSERT(dev_info_set != nullptr);
|
||||
GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
|
||||
try {
|
||||
char * buffer = nullptr;
|
||||
size_t buffer_size = serialize(&dev_info_set[0], &buffer);
|
||||
|
||||
std::vector<zmq::message_t> send_msgs;
|
||||
send_msgs.emplace_back(buffer, buffer_size);
|
||||
zmq::send_multipart(*ctx->send_socket, send_msgs);
|
||||
|
||||
free(buffer);
|
||||
} catch (const zmq::error_t& e) {
|
||||
LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::vector<zmq::message_t> recv_msgs;
|
||||
if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
|
||||
return -1;
|
||||
}
|
||||
GGML_ASSERT(recv_msgs.size() == n_world);
|
||||
|
||||
for (size_t i = 0; i < recv_msgs.size(); i++) {
|
||||
deserialize((const char *)recv_msgs[i].data(), &dev_info_set[i]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int llama_send_device_info(struct device_info * dev_info, struct llama_context * ctx) {
|
||||
std::vector<zmq::message_t> recv_msgs;
|
||||
if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
GGML_ASSERT(dev_info != nullptr);
|
||||
GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
|
||||
|
||||
try {
|
||||
char * buffer = nullptr;
|
||||
size_t buffer_size = serialize(dev_info, &buffer);
|
||||
|
||||
recv_msgs.emplace_back(buffer, buffer_size);
|
||||
zmq::send_multipart(*ctx->send_socket, recv_msgs);
|
||||
|
||||
free(buffer);
|
||||
} catch (const zmq::error_t& e) {
|
||||
LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_free_sockets(struct llama_context * ctx, char ** msg) {
|
||||
const uint32_t n_world = ctx->cparams.n_world;
|
||||
const uint32_t my_rank = ctx->cparams.rank;
|
||||
|
@ -19902,6 +19928,8 @@ struct llama_context * llama_new_context_with_model(
|
|||
cparams.pooling_type = params.pooling_type;
|
||||
|
||||
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
||||
cparams.type_k = params.type_k;
|
||||
cparams.type_v = params.type_v;
|
||||
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
||||
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
||||
|
||||
|
@ -19981,19 +20009,27 @@ struct llama_context * llama_new_context_with_model(
|
|||
// build worst-case graph for encoder if a model contains encoder
|
||||
ctx->is_encoding = llama_model_has_encoder(model);
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
void * llama_context_setup_backend(struct llama_context * ctx) {
|
||||
GGML_ASSERT(ctx != nullptr);
|
||||
const auto * model = &ctx->model;
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
const auto & cparams = ctx->cparams;
|
||||
|
||||
uint32_t kv_size = cparams.n_ctx;
|
||||
ggml_type type_k = params.type_k;
|
||||
ggml_type type_v = params.type_v;
|
||||
ggml_type type_k = cparams.type_k;
|
||||
ggml_type type_v = cparams.type_v;
|
||||
|
||||
// Mamba only needs a constant number of KV cache cells per sequence
|
||||
if (llama_model_is_recurrent(model)) {
|
||||
// Mamba needs at least as many KV cells as there are sequences kept at any time
|
||||
kv_size = std::max((uint32_t) 1, params.n_seq_max);
|
||||
kv_size = std::max((uint32_t) 1, cparams.n_seq_max);
|
||||
// it's probably best to keep as much precision as possible for the states
|
||||
type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
|
||||
type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
|
||||
}
|
||||
|
||||
GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
|
||||
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
|
||||
|
||||
|
@ -20189,9 +20225,9 @@ struct llama_context * llama_new_context_with_model(
|
|||
}
|
||||
|
||||
// graph outputs buffer, reserve for rank 0 only
|
||||
if (params.rank == 0) {
|
||||
if (cparams.rank == 0) {
|
||||
// resized during inference when a batch uses more outputs
|
||||
if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
|
||||
if (llama_output_reserve(*ctx, cparams.n_seq_max) < cparams.n_seq_max) {
|
||||
LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
|
@ -20226,7 +20262,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
llama_get_device_count(*model) > 1 &&
|
||||
model->n_gpu_layers > (int)model->hparams.n_layer &&
|
||||
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
|
||||
params.offload_kqv;
|
||||
cparams.offload_kqv;
|
||||
|
||||
// pipeline parallelism requires support for async compute and events in all devices
|
||||
if (pipeline_parallel) {
|
||||
|
|
Loading…
Add table
Reference in a new issue