Fixed compile errors, made mmap automatic when LoRA is selected, and added updated quantizers and quantization handling for GPT-NeoX, GPT-2, and GPT-J

This commit is contained in:
Concedo 2023-04-24 23:20:06 +08:00
parent 3962eb39c7
commit 59fb174678
11 changed files with 297 additions and 590 deletions

@@ -91,6 +91,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
case 1: wtype = GGML_TYPE_F16; break;
case 2: wtype = GGML_TYPE_Q4_0; break;
case 3: wtype = GGML_TYPE_Q4_1; break;
+case 5: wtype = GGML_TYPE_Q4_2; break;
+case 6: wtype = GGML_TYPE_Q4_3; break;
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
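
For context on the quantization handling added here: these case labels map the per-file ftype code stored in the model header onto ggml weight types, with values 5 and 6 covering the new Q4_2/Q4_3 formats. A minimal sketch of the mapping, using stand-in enum values rather than the real GGML_TYPE_* constants (case 0 = f32 is assumed from the usual ggml convention, as it falls outside this hunk):

```cpp
// Stand-in sketch of the ftype -> weight-type mapping; the enum values
// here are illustrative, not the actual ggml definitions.
#include <cstdint>

enum class wtype_t { F32, F16, Q4_0, Q4_1, Q4_2, Q4_3, UNKNOWN };

wtype_t wtype_from_ftype(int32_t ftype) {
    switch (ftype) {
        case 0: return wtype_t::F32;   // assumed, not shown in this hunk
        case 1: return wtype_t::F16;
        case 2: return wtype_t::Q4_0;
        case 3: return wtype_t::Q4_1;
        case 5: return wtype_t::Q4_2;  // newly handled in this commit
        case 6: return wtype_t::Q4_3;  // newly handled in this commit
        default: return wtype_t::UNKNOWN;
    }
}
```
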
@@ -143,7 +145,6 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
ctx_size += (6 + 12*n_layer)*256; // object overhead
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
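
The object-overhead term above reserves fixed space for ggml's tensor bookkeeping on top of the raw weight data: roughly 6 global tensors plus 12 per layer, at an assumed 256 bytes apiece. A small sketch of that accounting (the constants come from the hunk; the example layer count is illustrative):

```cpp
// Sketch of the ctx-size overhead term. The per-object cost (256 bytes)
// and tensor counts (6 global + 12 per layer) are taken from the diff.
#include <cstdio>
#include <cstddef>

size_t object_overhead(int n_layer) {
    return (size_t)(6 + 12 * n_layer) * 256;
}

int main() {
    const int n_layer = 12;  // e.g. GPT-2 small
    printf("object overhead: %.2f KB\n", object_overhead(n_layer) / 1024.0);
}
```
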
@@ -258,22 +259,20 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
while (true) {
int32_t n_dims;
int32_t length;
-int32_t ftype;
+int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
-int64_t ne[2] = { 1, 1 };
+int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
-int32_t ne_cur;
-fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
-ne[i] = ne_cur;
+fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
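
The net effect of this hunk: each tensor record now carries a per-tensor ggml type id (ttype) in place of the old ftype code, and the shape is read directly into int32_t slots instead of going through an int64_t array and a temporary. A hedged sketch of the resulting header parse, with the stream layout inferred from the diff rather than a format spec:

```cpp
// Sketch of the per-tensor header read after this change. Field order
// (n_dims, name length, ttype, then the dims) follows the diff.
#include <fstream>
#include <cstdint>

struct tensor_header {
    int32_t n_dims;
    int32_t name_len;
    int32_t ttype;           // ggml type id stored per tensor
    int32_t ne[2] = {1, 1};  // assumes n_dims <= 2, as the loader does
    int32_t nelements = 1;
};

bool read_tensor_header(std::ifstream & fin, tensor_header & h) {
    fin.read(reinterpret_cast<char *>(&h.n_dims),   sizeof(h.n_dims));
    fin.read(reinterpret_cast<char *>(&h.name_len), sizeof(h.name_len));
    fin.read(reinterpret_cast<char *>(&h.ttype),    sizeof(h.ttype));
    if (fin.eof()) {
        return false;  // clean end of the tensor list
    }
    for (int i = 0; i < h.n_dims; ++i) {
        fin.read(reinterpret_cast<char *>(&h.ne[i]), sizeof(h.ne[i]));
        h.nelements *= h.ne[i];
    }
    return true;
}
```
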
@@ -297,24 +296,12 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
return ModelLoadResult::FAIL;
}
// for debugging
if (0) {
-static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
-size_t bpe = 0;
-switch (ftype) {
-case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
-case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
-case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
-case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
-default:
-{
-fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-return ModelLoadResult::FAIL;
-}
-};
+const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
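
The one-line bpe computation works because ggml stores quantized data in fixed-size blocks, so the expected byte count is nelements × type_size / block_size, which the unchanged check above verifies against ggml_nbytes. A worked sketch with stand-in constants (the real code queries ggml_type_size and ggml_blck_size per type):

```cpp
// Sketch of the size check for a quantized tensor. The block constants
// below are illustrative Q4_0-style stand-ins, not queried from ggml.
#include <cstdio>
#include <cstdint>

constexpr int     blck_size = 32;  // elements per block (assumed)
constexpr int64_t type_size = 20;  // bytes per block (assumed)

int64_t expected_bytes(int64_t nelements) {
    return nelements * type_size / blck_size;
}

int main() {
    // a 4096 x 4096 tensor under these stand-in numbers occupies 10 MiB
    printf("%lld bytes\n", (long long)expected_bytes(4096LL * 4096));
}
```
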
@@ -370,8 +357,7 @@ bool gpt2_eval(
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
-//todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
-static size_t buf_size = 256u*1024*1024;
+static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size);
if (mem_per_token > 0 && mem_per_token*N*1.9 > buf_size) {
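
The surviving condition regrows the scratch buffer once a per-token memory estimate exists, with the 1.9 factor providing headroom against the OOM the removed comment described. A sketch of the growth path that presumably follows (the 1.1 regrow margin is an assumption modeled on the upstream gpt-2 example, not something this diff shows):

```cpp
// Sketch of the eval-buffer growth heuristic. The 1.9 headroom factor is
// taken from the diff; the 1.1 regrow margin is an assumed detail.
#include <cstdlib>
#include <cstddef>

void ensure_eval_buffer(void *& buf, size_t & buf_size,
                        size_t mem_per_token, int N) {
    if (mem_per_token > 0 && mem_per_token * N * 1.9 > buf_size) {
        buf_size = (size_t)(1.1 * mem_per_token * N * 1.9);
        buf = realloc(buf, buf_size);  // real code should check for nullptr
    }
}
```
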