Merge remote-tracking branch 'occam/opencl-dev' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	README.md
#	ggml-opencl.cpp
#	llama.cpp
#	otherarch/ggml_v2-opencl-legacy.c
Concedo 2023-05-22 16:16:48 +08:00
commit 981d5ba866
14 changed files with 941 additions and 321 deletions

llama.cpp (243 changed lines)

@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -428,26 +429,30 @@ struct llama_file_loader {
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
-        uint32_t version = 0;
-        if (magic != 'ggml') {
-            version = file.read_u32();
-        }
-        if (magic == 'ggml' && version == 0) {
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
             file_version = LLAMA_FILE_VERSION_GGML;
-        } else if (magic == 'ggmf' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGMF_V1;
-        } else if (magic == 'ggjt' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V1;
-        } else if (magic == 'ggjt' && version == 2) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V2;
-        } else if (magic == 'ggjt' && version == 3) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V3;
-        } else {
-            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                         magic, version);
+            return;
         }
+        uint32_t version = file.read_u32();
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
+        }
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version);
     }
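Note: the LLAMA_FILE_MAGIC_* names used above replace multi-character literals such as 'ggml', whose value is implementation-defined in C++. The constants themselves live in llama.h and are not part of this diff; a minimal sketch of what they presumably expand to (FOURCC-style packings of the ASCII tags, values assumed):

    // assumed definitions, mirroring llama.h at the time of this commit
    #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
    #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
    #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
    #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'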
void read_hparams() {
hparams.n_vocab = file.read_u32();
@@ -647,7 +652,7 @@ struct llama_model_loader {
         }
     }
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -658,10 +663,10 @@ struct llama_model_loader {
             name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }
-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -671,6 +676,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
@@ -684,12 +690,16 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
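Note: llama_mmap's second argument changes here from a bool to a byte count, so only the CPU-resident prefix of the file is prefetched; tensors destined for the GPU are loaded separately further down. The constructor lives in llama-util.h and is not shown in this diff; a rough sketch of the POSIX path, under that assumption:

    // sketch only; the real llama_mmap takes a llama_file * and also covers Windows and error handling
    #include <algorithm>
    #include <cstddef>
    #include <sys/mman.h>
    struct mmap_sketch {
        void * addr;
        mmap_sketch(int fd, size_t file_size, size_t prefetch) {
            addr = mmap(NULL, file_size, PROT_READ, MAP_SHARED, fd, 0);
            if (prefetch > 0) {
                // hint the kernel to page in just the bytes that will be read sequentially
                madvise(addr, std::min(file_size, prefetch), MADV_WILLNEED);
            }
        }
    };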
@@ -702,6 +712,9 @@ struct llama_model_loader {
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -714,9 +727,6 @@ struct llama_model_loader {
                 lmlock->grow_to(done_size);
             }
         }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }
     void load_data_for(llama_load_tensor & lt) {
@@ -971,27 +981,7 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
     // create the ggml context
     {
@@ -1013,7 +1003,14 @@ static void llama_model_load_internal(
         }
     }
+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1021,33 +1018,87 @@ static void llama_model_load_internal(
         ml->ggml_ctx = ctx;
-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm           = ml->get_tensor("norm.weight",           {n_embd});
-        model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab});
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+        const int i_gpu_start = n_layer - n_gpu_layers;
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
             auto & layer = model.layers[i];
             std::string layers_i = "layers." + std::to_string(i);
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd});
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff}, backend);
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                    ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }
     ml->done_getting_tensors();
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#elif !defined(GGML_USE_CLBLAST)
+        (void) n_gpu_layers;
+#endif
+    }
     // populate `tensors_by_name`
     for (llama_load_tensor & lt : ml->tensors_map.tensors) {
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
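Note: the offload rule above is: with i_gpu_start = n_layer - n_gpu_layers, layers [i_gpu_start, n_layer) get LLAMA_BACKEND_OFFLOAD and everything below stays on the CPU; requesting more than n_layer layers additionally offloads the output tensor. A self-contained sketch of that arithmetic (values hypothetical):

    // sketch: backend assignment for n_layer = 32, n_gpu_layers = 20
    #include <cstdio>
    int main() {
        const int n_layer = 32, n_gpu_layers = 20;
        const int i_gpu_start = n_layer - n_gpu_layers; // 12
        for (int i = 0; i < n_layer; ++i) {
            std::printf("layer %2d -> %s\n", i, i < i_gpu_start ? "CPU" : "GPU");
        }
        // layers 0..11 stay on the CPU, layers 12..31 are offloaded;
        // n_gpu_layers > n_layer would offload the output tensor as well
        return 0;
    }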
@@ -1055,38 +1106,32 @@ static void llama_model_load_internal(
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-    model.mapping = std::move(ml->mapping);
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
-        size_t vram_total = 0;
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
         }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
     }
 #elif defined(GGML_USE_CLBLAST)
     {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
         size_t vram_total = 0;
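Note: the cuBLAS branch above no longer converts tensors that were already loaded into host memory; ggml_cuda_load_data (defined in ggml-cuda.cu, not shown in this diff) streams each tensor from its file offset straight into VRAM. A minimal sketch of that pattern, with hypothetical names and simplified error handling:

    // sketch: read one tensor's bytes at file_off and copy them to a device buffer
    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>
    void load_tensor_to_vram(const char * fname, long file_off, size_t nbytes, void * dev_ptr) {
        std::vector<unsigned char> buf(nbytes);
        FILE * f = std::fopen(fname, "rb");
        std::fseek(f, file_off, SEEK_SET);
        std::fread(buf.data(), 1, nbytes, f);
        std::fclose(f);
        cudaMemcpy(dev_ptr, buf.data(), nbytes, cudaMemcpyHostToDevice);
    }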
@@ -1102,16 +1147,22 @@ static void llama_model_load_internal(
             ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
         }
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [opencl] offloading output layer to GPU\n", __func__);
+            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
             ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
         }
-        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
     }
 #else
     (void) n_gpu_layers;
 #endif
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+    model.mapping = std::move(ml->mapping);
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
@@ -1209,10 +1260,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                        cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }
         // self-attention
@@ -1319,10 +1368,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpFF);
-            // cur = ffn_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                        cur);
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
         }
         struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1359,10 +1406,8 @@ static bool llama_eval_internal(
             inpL = ggml_rms_norm(ctx0, inpL);
-            // inpL = norm*inpL
-            inpL = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.norm, inpL),
-                        inpL);
+            // inpL = inpL*norm(broadcasted)
+            inpL = ggml_mul(ctx0, inpL, model.norm);
             embeddings = inpL;
         }
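Note: the three hunks above depend on ggml_mul gaining row-wise broadcasting in the same change set, which is why the explicit ggml_repeat of the norm weights can be dropped. A shape-level sketch of the semantics being assumed (plain C++, not the ggml implementation):

    // multiply every row of x (n_rows x n_cols) by a weight vector w of length n_cols,
    // i.e. the broadcast that ggml_mul(ctx0, cur, norm) is assumed to perform here
    #include <cstddef>
    void mul_broadcast_rows(float * x, const float * w, size_t n_rows, size_t n_cols) {
        for (size_t r = 0; r < n_rows; ++r) {
            for (size_t c = 0; c < n_cols; ++c) {
                x[r * n_cols + c] *= w[c]; // w is reused for every row
            }
        }
    }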
@@ -2186,7 +2231,7 @@ struct llama_context * llama_init_from_file(
         unsigned * cur_percentage_p = (unsigned *) ctx;
         unsigned percentage = (unsigned) (100 * progress);
         while (percentage > *cur_percentage_p) {
-            ++*cur_percentage_p;
+            *cur_percentage_p = percentage;
             fprintf(stderr, ".");
             fflush(stderr);
             if (percentage >= 100) {
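Note: with CPU tensors and GPU tensors now reported in separate passes, progress can jump several percent between callbacks; the old ++*cur_percentage_p printed one dot for every skipped percent, while the assignment prints one dot per callback. A standalone sketch of the difference:

    // sketch: dots emitted for a single jump from 50% to 60%
    #include <cstdio>
    int main() {
        unsigned cur = 50;
        const unsigned percentage = 60;
        while (percentage > cur) {
            cur = percentage; // old code: ++cur; would loop ten times and print ten dots
            std::fputc('.', stderr);
        }
        return 0;
    }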
@@ -2279,7 +2324,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
             fprintf(stderr, "%s: bad file magic\n", __func__);
             return 1;
         }
@@ -2343,7 +2388,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     // maybe this should be in llama_model_loader
     if (model_loader->use_mmap) {
-        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
     }
 }
@@ -2436,7 +2481,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }
         size_t idx = model_loader->tensors_map.name_to_idx[base_name];
         llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
         lt.data = (uint8_t *) lt.ggml_tensor->data;
         model_loader->load_data_for(lt);
         lt.ggml_tensor->data = lt.data;