From 05cf5f7d6e2e32baf8bcba6824de7bfd1743b804 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 13 May 2023 11:35:38 +0800
Subject: [PATCH] partially working, but the blas matmul is broken

---
 ggml.c                | 32 +++++++++++++++++++++++---------
 ggml.h                |  2 ++
 ggml_v2.c             |  5 +++++
 gpttype_adapter.cpp   | 21 ++++++++++++---------
 llama.cpp             |  3 ++-
 model_adapter.cpp     |  8 +++++++-
 model_adapter.h       |  1 +
 otherarch/gpt2_v2.cpp |  2 +-
 8 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/ggml.c b/ggml.c
index addf61303..4db085f64 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1429,6 +1429,20 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
     return quantize_fns[i];
 }
 
+bool quants_unshuffled = false; //new GGJT_2 is unshuffled, all old ones are shuffled
+static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT]; //forward decl
+static inline quantize_fns_t get_quantize_fn(size_t i)
+{
+    if(quants_unshuffled)
+    {
+        return quantize_fns[i];
+    }
+    else
+    {
+        return quantize_fns_v2[i];
+    }
+}
+
 
 //
 // simd mappings
@@ -5637,7 +5651,7 @@ static void ggml_compute_forward_dup_f16(
             }
         }
     } else if (ggml_is_quantized(dst->type)) {
-        quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
+        quantize_row_q_t const quantize_row_q = get_quantize_fn(dst->type).quantize_row_q;
         float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
         size_t id = 0;
@@ -5936,7 +5950,7 @@ static void ggml_compute_forward_dup_f32(
             }
         }
     } else if (ggml_is_quantized(dst->type)) {
-        quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
+        quantize_row_q_t const quantize_row_q = get_quantize_fn(dst->type).quantize_row_q;
 
         size_t id = 0;
         size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
@@ -6346,8 +6360,8 @@ static void ggml_compute_forward_add_q_f32(
     GGML_ASSERT(ne3 == ne13);
 
     const enum ggml_type type = src0->type;
-    dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
-    quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
+    dequantize_row_q_t const dequantize_row_q = get_quantize_fn(type).dequantize_row_q;
+    quantize_row_q_t const quantize_row_q = get_quantize_fn(type).quantize_row_q;
 
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
@@ -7809,9 +7823,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
     GGML_ASSERT(ne3 == ne13);
 
     const enum ggml_type type = src0->type;
-    quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot;
-    vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q;
-    enum ggml_type const vec_dot_type = quantize_fns[type].vec_dot_type;
+    quantize_row_q_t const quantize_row_q_dot = get_quantize_fn(type).quantize_row_q_dot;
+    vec_dot_q_t const vec_dot_q = get_quantize_fn(type).vec_dot_q;
+    enum ggml_type const vec_dot_type = get_quantize_fn(type).vec_dot_type;
 
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
@@ -8138,7 +8152,7 @@ static void ggml_compute_forward_get_rows_q(
     const int nc = src0->ne[0];
     const int nr = ggml_nelements(src1);
     const enum ggml_type type = src0->type;
-    dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
+    dequantize_row_q_t const dequantize_row_q = get_quantize_fn(type).dequantize_row_q;
 
     assert( dst->ne[0] == nc);
     assert( dst->ne[1] == nr);
@@ -10923,7 +10937,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     } else
 #endif
                     {
-                        const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type;
+                        const enum ggml_type type_q = get_quantize_fn(node->src0->type).vec_dot_type;
                         cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q];
                     }
                 } else {
diff --git a/ggml.h b/ggml.h
index a045bfc3e..35bd45e8a 100644
--- a/ggml.h
+++ b/ggml.h
@@ -895,6 +895,8 @@ extern "C" {
     // system info
     //
 
+    void SetQuantsUnshuffled(bool unshuffled);
+
     GGML_API int ggml_cpu_has_avx        (void);
     GGML_API int ggml_cpu_has_avx2       (void);
     GGML_API int ggml_cpu_has_avx512     (void);
diff --git a/ggml_v2.c b/ggml_v2.c
index 7a4281691..38bf9108d 100644
--- a/ggml_v2.c
+++ b/ggml_v2.c
@@ -1571,6 +1571,11 @@ static void ggml_vec_dot_q5_0_q8_0_v2(const int n, float * restrict s, const voi
 static void ggml_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q8_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
+void SetQuantsUnshuffled(bool unshuffle)
+{
+    quants_unshuffled = unshuffle;
+}
+
 //TODO: integrate backwards compat
 static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 775e1e44a..92a42a2fa 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -225,8 +225,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
     printf("System Info: %s\n", llama_print_system_info());
 
-    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT)
+    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
     {
+        //newer format has bit unshuffling
+        SetQuantsUnshuffled(file_format== FileFormat::GGJT_2);
+
         llama_ctx_params = llama_context_default_params();
         llama_ctx_params.n_ctx = inputs.max_context_length;
         llama_ctx_params.n_parts = -1;
@@ -243,7 +246,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
             return ModelLoadResult::FAIL;
         }
-        if (file_format < FileFormat::GGJT)
+        if (file_format < FileFormat::GGJT_2)
         {
             printf("\n---\nWarning: Your model has an INVALID or OUTDATED format (ver %d). Please reconvert it for better results!\n---\n", file_format);
         }
@@ -484,7 +487,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
     // tokenize the prompt
     std::vector embd_inp;
-    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT)
+    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
     {
         params.prompt.insert(0, 1, ' ');
         if (file_format == FileFormat::GGML)
@@ -543,7 +546,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         {
             //for non llama, limit to 256
             int bbs = blasbatchsize;
-            if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT)
+            if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2)
             {
                 bbs = (blasbatchsize > 256 ? 256 : blasbatchsize);
             }
@@ -573,7 +576,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     double time1 = 0, time2 = 0;
     int32_t n_vocab = 0;
 
-    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT)
+    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
     {
         n_vocab = llama_n_vocab(llama_ctx_v1);
     }
@@ -624,7 +627,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     if(debugmode)
     {
         printf("\n[Debug: Dump Input Tokens]\n");
-        if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT)
+        if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
         {
             for (auto id : embd_inp)
             {
@@ -661,7 +664,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
             bool evalres = false;
 
-            if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT)
+            if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
             {
                 evalres = (llama_eval(llama_ctx_v1, embd.data(), embdsize, n_past, params.n_threads)==0);
             }
@@ -722,7 +725,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                 printf("\n");
             }
 
-            if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT)
+            if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
             {
                 auto logits = llama_get_logits(llama_ctx_v1);
 
@@ -772,7 +775,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             // decrement remaining sampling budget
             --remaining_tokens;
 
-            if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT)
+            if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
             {
                 concat_output += llama_token_to_str(llama_ctx_v1, id);
                 if(unbanTokens && id==llama_token_eos())
diff --git a/llama.cpp b/llama.cpp
index 5bcbe2978..37b4ef800 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -937,7 +937,8 @@ static void llama_model_load_internal(
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+            printf("\nLegacy LLAMA GGJT compatibility changes triggered.\n");
+            //throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
         }
     }
 
diff --git a/model_adapter.cpp b/model_adapter.cpp
index 6c54f3041..3c0543724 100644
--- a/model_adapter.cpp
+++ b/model_adapter.cpp
@@ -145,7 +145,13 @@ void print_tok_vec(std::vector &embd)
     }
     else if(magic == 0x67676a74) //v3 format ggjt
     {
-        fileformat = FileFormat::GGJT; //ggjt by default
+        fileformat = FileFormat::GGJT_2; //ggjt by default
+        uint32_t temp;
+        fin.read((char *)&temp, sizeof(temp)); //file version
+        if(temp==1)
+        {
+            fileformat = FileFormat::GGJT;
+        }
     }
 
     fin.close();
diff --git a/model_adapter.h b/model_adapter.h
index 3e3f8f30c..d151831e7 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -19,6 +19,7 @@ enum FileFormat
     GGML=1,  // 1=(original llama ggml, alpaca, GPT4ALL, GPTJ header)
     GGHF=2,  // 2=(llama ggmf)
     GGJT=3,  // 3=(llama ggjt)
+    GGJT_2=4, //newer llama format
 
     GPTJ_1=100, //the very first super old GPTJ format
     GPTJ_2=101, //pygmalion, uses old ggml lib
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index c1be1f4ca..bb2310206 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -352,7 +352,7 @@ bool gpt2_eval(
     if (mem_per_token > 0 && (mem_per_token*N*2 + 48u*1024*1024) > buf_size)
     {
         const size_t buf_size_new = 320u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead
-        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
 
         // reallocate
         if (buf_size_new > buf_size)
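
Note (not part of the patch): the heart of the change is the version probe added in
model_adapter.cpp, which reads the 'ggjt' magic (0x67676a74) plus the version word that
follows it, and then calls SetQuantsUnshuffled() so ggml dispatches either through the
current unshuffled quantize_fns table or the legacy shuffled quantize_fns_v2 table.
Below is a minimal, self-contained C sketch of that detection logic under those
assumptions; the sketch_* names and the standalone main() are illustrative only and
do not exist in the repository.

    /* sketch of the GGJT v1 vs v2 detection mirroring model_adapter.cpp */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    enum SketchFormat { SKETCH_UNKNOWN = 0, SKETCH_GGJT = 3, SKETCH_GGJT_2 = 4 };

    static enum SketchFormat sketch_detect_format(const char * path)
    {
        FILE * f = fopen(path, "rb");
        if (!f) return SKETCH_UNKNOWN;

        uint32_t magic = 0, version = 0;
        enum SketchFormat fmt = SKETCH_UNKNOWN;
        if (fread(&magic, sizeof(magic), 1, f) == 1 && magic == 0x67676a74) { // 'ggjt'
            // default to the newer unshuffled layout, fall back to v1 when the
            // version word right after the magic reads 1 (as the patch does)
            fmt = SKETCH_GGJT_2;
            if (fread(&version, sizeof(version), 1, f) == 1 && version == 1) {
                fmt = SKETCH_GGJT;
            }
        }
        fclose(f);
        return fmt;
    }

    int main(int argc, char ** argv)
    {
        if (argc < 2) return 1;
        enum SketchFormat fmt = sketch_detect_format(argv[1]);
        // mirrors SetQuantsUnshuffled(file_format == FileFormat::GGJT_2) in gpttype_adapter.cpp
        bool unshuffled = (fmt == SKETCH_GGJT_2);
        printf("format=%d quants_unshuffled=%d\n", (int)fmt, (int)unshuffled);
        return 0;
    }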