Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	README.md
#	examples/llama-bench/llama-bench.cpp
#	examples/llama.android/llama/src/main/cpp/llama-android.cpp
#	examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
#	src/llama-vocab.cpp
#	tests/test-backend-ops.cpp
Concedo 2025-01-17 23:13:50 +08:00
commit 96407502cd
43 changed files with 15434 additions and 435 deletions


@@ -49,7 +49,7 @@ static bool old_mixtral_warning_showed = false;
 #endif
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = 0;
@@ -58,7 +58,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     model.t_start_us = tm.t_start_us;

     try {
-        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);

         ml.print_info();
@@ -9414,14 +9414,9 @@ int64_t llama_time_us(void) {
     return ggml_time_us();
 }

-struct llama_model * llama_load_model_from_file(
-        const char * path_model,
-        struct llama_model_params params) {
-    return llama_model_load_from_file(path_model, params);
-}
-
-struct llama_model * llama_model_load_from_file(
-        const char * path_model,
+static struct llama_model * llama_model_load_from_file_impl(
+        const std::string & path_model,
+        std::vector<std::string> & splits,
         struct llama_model_params params) {
     ggml_time_init();
@@ -9444,47 +9439,6 @@ struct llama_model * llama_model_load_from_file(
         };
     }

-    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
-        // split the servers set them into model->rpc_servers
-        std::string servers(params.rpc_servers);
-        size_t pos = 0;
-        while ((pos = servers.find(',')) != std::string::npos) {
-            std::string server = servers.substr(0, pos);
-            model->rpc_servers.push_back(server);
-            servers.erase(0, pos + 1);
-        }
-        model->rpc_servers.push_back(servers);
-    }
-
-    // add RPC devices
-    if (!model->rpc_servers.empty()) {
-        ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
-        if (!rpc_reg) {
-            LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
-            llama_model_free(model);
-            return nullptr;
-        }
-
-        typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
-        ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
-        if (!ggml_backend_rpc_add_device_fn) {
-            LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
-            llama_model_free(model);
-            return nullptr;
-        }
-
-        for (const std::string & server : model->rpc_servers) {
-            ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
-            if (dev) {
-                model->devices.push_back(dev);
-            } else {
-                LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
-                llama_model_free(model);
-                return nullptr;
-            }
-        }
-    }
-
     // create list of devices to use with this model
     if (params.devices) {
         for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
@@ -9525,7 +9479,7 @@ struct llama_model * llama_model_load_from_file(
         LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
     }

-    const int status = llama_model_load(path_model, *model, params);
+    const int status = llama_model_load(path_model, splits, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
         if (status == -1) {
@@ -9541,6 +9495,35 @@ struct llama_model * llama_model_load_from_file(
     return model;
 }

+// deprecated
+struct llama_model * llama_load_model_from_file(
+        const char * path_model,
+        struct llama_model_params params) {
+    return llama_model_load_from_file(path_model, params);
+}
+
+struct llama_model * llama_model_load_from_file(
+        const char * path_model,
+        struct llama_model_params params) {
+    std::vector<std::string> splits = {};
+    return llama_model_load_from_file_impl(path_model, splits, params);
+}
+
+struct llama_model * llama_model_load_from_splits(
+        const char ** paths,
+        size_t n_paths,
+        struct llama_model_params params) {
+    std::vector<std::string> splits;
+    if (n_paths == 0) {
+        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
+        return nullptr;
+    }
+    for (size_t i = 0; i < n_paths; ++i) {
+        splits.push_back(paths[i]);
+    }
+    return llama_model_load_from_file_impl(splits.front(), splits, params);
+}
+
 struct llama_context * llama_init_from_model(
         struct llama_model * model,
         struct llama_context_params params) {
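
For reference, below is a minimal caller-side sketch (not part of this commit) showing how the new llama_model_load_from_splits entry point from the diff above would be used to load a model shipped as multiple GGUF split files. The file paths are placeholders; the other calls (llama_backend_init, llama_model_default_params, llama_model_free) are the existing public llama.h API.

// sketch: loading a split GGUF model via the new C API (paths are illustrative)
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    // the first entry is what the loader treats as the primary file:
    // llama_model_load_from_splits passes splits.front() to the shared _impl, as shown above
    const char * paths[] = {
        "models/my-model-00001-of-00002.gguf",   // placeholder split 1
        "models/my-model-00002-of-00002.gguf",   // placeholder split 2
    };

    llama_model_params mparams = llama_model_default_params();

    llama_model * model = llama_model_load_from_splits(paths, sizeof(paths) / sizeof(paths[0]), mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load split model\n");
        llama_backend_free();
        return 1;
    }

    // single-file loading is unchanged: llama_model_load_from_file() now simply
    // forwards an empty splits vector to the same _impl function

    llama_model_free(model);
    llama_backend_free();
    return 0;
}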