Fixed compile errors, made mmap automatic when LoRA is selected, and added updated quantizers and quantization handling for GPT-NeoX, GPT-2, and GPT-J

This commit is contained in:
Concedo 2023-04-24 23:20:06 +08:00
parent 3962eb39c7
commit 59fb174678
11 changed files with 297 additions and 590 deletions

@@ -91,6 +91,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
case 1: wtype = GGML_TYPE_F16; break;
case 2: wtype = GGML_TYPE_Q4_0; break;
case 3: wtype = GGML_TYPE_Q4_1; break;
+case 5: wtype = GGML_TYPE_Q4_2; break;
+case 6: wtype = GGML_TYPE_Q4_3; break;
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
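
For context on the quantization handling added here: these case labels map the per-file ftype code stored in the model header onto ggml weight types, with values 5 and 6 covering the new Q4_2/Q4_3 formats. A minimal sketch of the mapping, using stand-in enum values rather than the real GGML_TYPE_* constants (case 0 = f32 is assumed from the usual ggml convention, as it falls outside this hunk):

```cpp
// Stand-in sketch of the ftype -> weight-type mapping; the enum values
// here are illustrative, not the actual ggml definitions.
#include <cstdint>

enum class wtype_t { F32, F16, Q4_0, Q4_1, Q4_2, Q4_3, UNKNOWN };

wtype_t wtype_from_ftype(int32_t ftype) {
    switch (ftype) {
        case 0: return wtype_t::F32;   // assumed, not shown in this hunk
        case 1: return wtype_t::F16;
        case 2: return wtype_t::Q4_0;
        case 3: return wtype_t::Q4_1;
        case 5: return wtype_t::Q4_2;  // newly handled in this commit
        case 6: return wtype_t::Q4_3;  // newly handled in this commit
        default: return wtype_t::UNKNOWN;
    }
}
```
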
@@ -143,7 +145,6 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
ctx_size += (6 + 12*n_layer)*256; // object overhead
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
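
The object-overhead term above reserves fixed space for ggml's tensor bookkeeping on top of the raw weight data: roughly 6 global tensors plus 12 per layer, at an assumed 256 bytes apiece. A small sketch of that accounting (the constants come from the hunk; the example layer count is illustrative):

```cpp
// Sketch of the ctx-size overhead term. The per-object cost (256 bytes)
// and tensor counts (6 global + 12 per layer) are taken from the diff.
#include <cstdio>
#include <cstddef>

size_t object_overhead(int n_layer) {
    return (size_t)(6 + 12 * n_layer) * 256;
}

int main() {
    const int n_layer = 12;  // e.g. GPT-2 small
    printf("object overhead: %.2f KB\n", object_overhead(n_layer) / 1024.0);
}
```
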
@@ -258,22 +259,20 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
while (true) {
int32_t n_dims;
int32_t length;
-int32_t ftype;
+int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
-int64_t ne[2] = { 1, 1 };
+int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
-int32_t ne_cur;
-fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
-ne[i] = ne_cur;
+fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
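
The net effect of this hunk: each tensor record now carries a per-tensor ggml type id (ttype) in place of the old ftype code, and the shape is read directly into int32_t slots instead of going through an int64_t array and a temporary. A hedged sketch of the resulting header parse, with the stream layout inferred from the diff rather than a format spec:

```cpp
// Sketch of the per-tensor header read after this change. Field order
// (n_dims, name length, ttype, then the dims) follows the diff.
#include <fstream>
#include <cstdint>

struct tensor_header {
    int32_t n_dims;
    int32_t name_len;
    int32_t ttype;           // ggml type id stored per tensor
    int32_t ne[2] = {1, 1};  // assumes n_dims <= 2, as the loader does
    int32_t nelements = 1;
};

bool read_tensor_header(std::ifstream & fin, tensor_header & h) {
    fin.read(reinterpret_cast<char *>(&h.n_dims),   sizeof(h.n_dims));
    fin.read(reinterpret_cast<char *>(&h.name_len), sizeof(h.name_len));
    fin.read(reinterpret_cast<char *>(&h.ttype),    sizeof(h.ttype));
    if (fin.eof()) {
        return false;  // clean end of the tensor list
    }
    for (int i = 0; i < h.n_dims; ++i) {
        fin.read(reinterpret_cast<char *>(&h.ne[i]), sizeof(h.ne[i]));
        h.nelements *= h.ne[i];
    }
    return true;
}
```
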
@@ -297,24 +296,12 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
return ModelLoadResult::FAIL;
}
// for debugging
if (0) {
-static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
-size_t bpe = 0;
-switch (ftype) {
-case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
-case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
-case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
-case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
-default:
-{
-fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-return ModelLoadResult::FAIL;
-}
-};
+const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
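
The one-line bpe computation works because ggml stores quantized data in fixed-size blocks, so the expected byte count is nelements × type_size / block_size, which the unchanged check above verifies against ggml_nbytes. A worked sketch with stand-in constants (the real code queries ggml_type_size and ggml_blck_size per type):

```cpp
// Sketch of the size check for a quantized tensor. The block constants
// below are illustrative Q4_0-style stand-ins, not queried from ggml.
#include <cstdio>
#include <cstdint>

constexpr int     blck_size = 32;  // elements per block (assumed)
constexpr int64_t type_size = 20;  // bytes per block (assumed)

int64_t expected_bytes(int64_t nelements) {
    return nelements * type_size / blck_size;
}

int main() {
    // a 4096 x 4096 tensor under these stand-in numbers occupies 10 MiB
    printf("%lld bytes\n", (long long)expected_bytes(4096LL * 4096));
}
```
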
@@ -370,8 +357,7 @@ bool gpt2_eval(
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
-//todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
-static size_t buf_size = 256u*1024*1024;
+static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size);
if (mem_per_token > 0 && mem_per_token*N*1.9 > buf_size) {
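
The surviving condition regrows the scratch buffer once a per-token memory estimate exists, with the 1.9 factor providing headroom against the OOM the removed comment described. A sketch of the growth path that presumably follows (the 1.1 regrow margin is an assumption modeled on the upstream gpt-2 example, not something this diff shows):

```cpp
// Sketch of the eval-buffer growth heuristic. The 1.9 headroom factor is
// taken from the diff; the 1.1 regrow margin is an assumed detail.
#include <cstdlib>
#include <cstddef>

void ensure_eval_buffer(void *& buf, size_t & buf_size,
                        size_t mem_per_token, int N) {
    if (mem_per_token > 0 && mem_per_token * N * 1.9 > buf_size) {
        buf_size = (size_t)(1.1 * mem_per_token * N * 1.9);
        buf = realloc(buf, buf_size);  // real code should check for nullptr
    }
}
```
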