Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 17:44:38 +00:00)
fixed quant tools not compiling, updated docs
Commit 0061299cce (parent 273d48ad96)
8 changed files with 228 additions and 132 deletions
Makefile (10 lines changed)

@@ -606,15 +606,15 @@ endif
 # tools
 quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gptj: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gptj: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gpt2: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gpt2: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_neox: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_neox: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_mpt: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_mpt: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_clip: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
+quantize_clip: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

 #window simple clinfo

Embedded API documentation (OpenAPI JSON)

@@ -151,6 +151,11 @@
 "exclusiveMinimum": 0,
 "type": "number"
 },
+"dynatemp_exponent": {
+"default": 1,
+"description": "Exponent used in dynatemp.",
+"type": "number"
+},
 "mirostat": {
 "description": "KoboldCpp ONLY. Sets the mirostat mode, 0=disabled, 1=mirostat_v1, 2=mirostat_v2",
 "minimum": 0,
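For reference, the new dynatemp_exponent field sits alongside the other sampler settings in the generation request schema. A minimal sketch of passing it to a running KoboldCpp instance; the URL, port, and other payload values here are illustrative assumptions, not part of this commit:

import requests

# Assumed local KoboldCpp instance; adjust host/port as needed.
GENERATE_URL = "http://localhost:5001/api/v1/generate"

payload = {
    "prompt": "Once upon a time,",
    "max_length": 64,
    # Field documented in this commit: exponent used in dynatemp (default 1).
    "dynatemp_exponent": 1.5,
}

resp = requests.post(GENERATE_URL, json=payload)
print(resp.json())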
@@ -1004,6 +1009,98 @@
 ]
 }
 },
+"/sdapi/v1/img2img": {
+"post": {
+"description": "Transforms an existing image into a new image, guided by a text prompt, and returns a base64 encoded png.",
+"requestBody": {
+"content": {
+"application/json": {
+"example": {
+"prompt": "picture of a kobold, high quality HD render",
+"negative_prompt": "ugly, deformed, censored",
+"cfg_scale": 5,
+"steps": 20,
+"width": 512,
+"height": 512,
+"seed": -1,
+"sampler_name": "Euler a",
+"denoising_strength": 0.6,
+"init_images":["base64_image_data"],
+},
+"schema": {
+"properties": {
+"prompt": {
+"type": "string"
+},
+"negative_prompt": {
+"type": "string"
+},
+"cfg_scale": {
+"type": "number"
+},
+"steps": {
+"type": "number"
+},
+"width": {
+"type": "number"
+},
+"height": {
+"type": "number"
+},
+"seed": {
+"type": "number"
+},
+"sampler_name": {
+"type": "string"
+},
+"denoising_strength": {
+"type": "number"
+},
+"init_images": {
+"type": "array"
+},
+},
+"type": "object"
+}
+}
+},
+"required": false
+},
+"responses": {
+"200": {
+"content": {
+"application/json": {
+"example":
+{
+"images":["base64_image_data"],"parameters":{},"info":""
+},
+"schema": {
+"properties": {
+"images": {
+"type": "string",
+"description": "A base64 string containing the encoded PNG of the generated image."
+},
+"parameters": {
+"type": "object",
+"description": "Not used. Will be empty."
+},
+"info": {
+"type": "string",
+"description": "Not used. Will be empty."
+}
+}
+}
+}
+},
+"description": "Successful request"
+}
+},
+"summary": "Transforms an existing image into a new image",
+"tags": [
+"sdapi/v1"
+]
+}
+},
 "/sdapi/v1/interrogate": {
 "post": {
 "description": "Generates a short text caption describing an image.",
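The schema above documents the full img2img request and response. A minimal client sketch, assuming a KoboldCpp instance with image generation enabled on the default local port; the URL and file names are illustrative:

import base64
import requests

# Assumed local KoboldCpp instance; adjust host/port as needed.
IMG2IMG_URL = "http://localhost:5001/sdapi/v1/img2img"

# The source image goes into init_images as a base64 string.
with open("input.png", "rb") as f:
    init_image = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "prompt": "picture of a kobold, high quality HD render",
    "negative_prompt": "ugly, deformed, censored",
    "cfg_scale": 5,
    "steps": 20,
    "width": 512,
    "height": 512,
    "seed": -1,
    "sampler_name": "Euler a",
    "denoising_strength": 0.6,
    "init_images": [init_image],
}

resp = requests.post(IMG2IMG_URL, json=payload)
resp.raise_for_status()

# Per the response schema, "images" holds base64-encoded PNG data.
result = resp.json()["images"][0]
with open("output.png", "wb") as f:
    f.write(base64.b64decode(result))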
@@ -1112,7 +1209,6 @@
 };
 </script>

-
 <script>
 //self destruct into json if requested
 const urlParams = new URLSearchParams(window.location.search);

otherarch/tools/common-ggml.cpp

@@ -3,63 +3,63 @@
 #include <regex>
 #include <map>

-static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
-{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
-{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
-{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
-{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
-{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+static const std::map<std::string, enum ggml_v3_ftype> GGML_V3_FTYPE_MAP = {
+{"q4_0", GGML_V3_FTYPE_MOSTLY_Q4_0},
+{"q4_1", GGML_V3_FTYPE_MOSTLY_Q4_1},
+{"q5_0", GGML_V3_FTYPE_MOSTLY_Q5_0},
+{"q5_1", GGML_V3_FTYPE_MOSTLY_Q5_1},
+{"q8_0", GGML_V3_FTYPE_MOSTLY_Q8_0},
 };

-void ggml_print_ftypes(FILE * fp) {
-for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
+void ggml_v3_print_ftypes(FILE * fp) {
+for (auto it = GGML_V3_FTYPE_MAP.begin(); it != GGML_V3_FTYPE_MAP.end(); it++) {
 fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
 }
 }

-enum ggml_ftype ggml_parse_ftype(const char * str) {
-enum ggml_ftype ftype;
+enum ggml_v3_ftype ggml_v3_parse_ftype(const char * str) {
+enum ggml_v3_ftype ftype;
 if (str[0] == 'q') {
-const auto it = GGML_FTYPE_MAP.find(str);
-if (it == GGML_FTYPE_MAP.end()) {
+const auto it = GGML_V3_FTYPE_MAP.find(str);
+if (it == GGML_V3_FTYPE_MAP.end()) {
 fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
-return GGML_FTYPE_UNKNOWN;
+return GGML_V3_FTYPE_UNKNOWN;
 }
 ftype = it->second;
 } else {
-ftype = (enum ggml_ftype) atoi(str);
+ftype = (enum ggml_v3_ftype) atoi(str);
 }

 return ftype;
 }

-bool ggml_common_quantize_0(
+bool ggml_v3_common_quantize_0(
 std::ifstream & finp,
 std::ofstream & fout,
-const ggml_ftype ftype,
+const ggml_v3_ftype ftype,
 const std::vector<std::string> & to_quant,
 const std::vector<std::string> & to_skip) {

-ggml_type qtype = GGML_TYPE_F32;
+ggml_v3_type qtype = GGML_V3_TYPE_F32;

 switch (ftype) {
-case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
-case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
-case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
-case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
-case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
-case GGML_FTYPE_UNKNOWN:
-case GGML_FTYPE_ALL_F32:
-case GGML_FTYPE_MOSTLY_F16:
-case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+case GGML_V3_FTYPE_MOSTLY_Q4_0: qtype = GGML_V3_TYPE_Q4_0; break;
+case GGML_V3_FTYPE_MOSTLY_Q4_1: qtype = GGML_V3_TYPE_Q4_1; break;
+case GGML_V3_FTYPE_MOSTLY_Q5_0: qtype = GGML_V3_TYPE_Q5_0; break;
+case GGML_V3_FTYPE_MOSTLY_Q5_1: qtype = GGML_V3_TYPE_Q5_1; break;
+case GGML_V3_FTYPE_MOSTLY_Q8_0: qtype = GGML_V3_TYPE_Q8_0; break;
+case GGML_V3_FTYPE_UNKNOWN:
+case GGML_V3_FTYPE_ALL_F32:
+case GGML_V3_FTYPE_MOSTLY_F16:
+case GGML_V3_FTYPE_MOSTLY_Q4_1_SOME_F16:
 {
 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
 return false;
 }
 };

-if (!ggml_is_quantized(qtype)) {
-fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
+if (!ggml_v3_is_quantized(qtype)) {
+fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_v3_type_name(qtype));
 return false;
 }

@@ -69,7 +69,7 @@ bool ggml_common_quantize_0(
 std::vector<float> work;

 std::vector<uint8_t> data_u8;
-std::vector<ggml_fp16_t> data_f16;
+std::vector<ggml_v3_fp16_t> data_f16;
 std::vector<float> data_f32;

 std::vector<int64_t> hist_all(1 << 4, 0);

@@ -97,7 +97,7 @@ bool ggml_common_quantize_0(
 std::string name(length, 0);
 finp.read (&name[0], length);

-printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
+printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_v3_type_name((ggml_v3_type) ttype));

 bool quantize = false;

@@ -121,17 +121,17 @@ bool ggml_common_quantize_0(
 quantize &= (n_dims == 2);

 if (quantize) {
-if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
-fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+if (ttype != GGML_V3_TYPE_F32 && ttype != GGML_V3_TYPE_F16) {
+fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_v3_type_name((ggml_v3_type) ttype));
 return false;
 }

-if (ttype == GGML_TYPE_F16) {
+if (ttype == GGML_V3_TYPE_F16) {
 data_f16.resize(nelements);
-finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_v3_fp16_t));
 data_f32.resize(nelements);
 for (int i = 0; i < nelements; ++i) {
-data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+data_f32[i] = ggml_v3_fp16_to_fp32(data_f16[i]);
 }
 } else {
 data_f32.resize(nelements);

@@ -160,36 +160,36 @@ bool ggml_common_quantize_0(
 size_t cur_size = 0;
 std::vector<int64_t> hist_cur(1 << 4, 0);

-switch ((ggml_type) ttype) {
-case GGML_TYPE_Q4_0:
+switch ((ggml_v3_type) ttype) {
+case GGML_V3_TYPE_Q4_0:
 {
-cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+cur_size = ggml_v3_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-case GGML_TYPE_Q4_1:
+case GGML_V3_TYPE_Q4_1:
 {
-cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+cur_size = ggml_v3_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-case GGML_TYPE_Q5_0:
+case GGML_V3_TYPE_Q5_0:
 {
-cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+cur_size = ggml_v3_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-case GGML_TYPE_Q5_1:
+case GGML_V3_TYPE_Q5_1:
 {
-cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+cur_size = ggml_v3_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-case GGML_TYPE_Q8_0:
+case GGML_V3_TYPE_Q8_0:
 {
-cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+cur_size = ggml_v3_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-case GGML_TYPE_F32:
-case GGML_TYPE_F16:
-case GGML_TYPE_I8:
-case GGML_TYPE_I16:
-case GGML_TYPE_I32:
-case GGML_TYPE_Q8_1:
-case GGML_TYPE_COUNT:
+case GGML_V3_TYPE_F32:
+case GGML_V3_TYPE_F16:
+case GGML_V3_TYPE_I8:
+case GGML_V3_TYPE_I16:
+case GGML_V3_TYPE_I32:
+case GGML_V3_TYPE_Q8_1:
+case GGML_V3_TYPE_COUNT:
 {
-fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_v3_type_name((ggml_v3_type) ttype));
 return false;
 }
 }

@@ -216,7 +216,7 @@ bool ggml_common_quantize_0(
 }

 printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
+printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_v3_type_name(qtype));

 {
 int64_t sum_all = 0;

otherarch/tools/common-ggml.h

@@ -1,18 +1,18 @@
 #pragma once

-#include "ggml.h"
+#include "ggml_v3.h"

 #include <fstream>
 #include <vector>
 #include <string>

-enum ggml_ftype ggml_parse_ftype(const char * str);
+enum ggml_v3_ftype ggml_v3_parse_ftype(const char * str);

-void ggml_print_ftypes(FILE * fp = stderr);
+void ggml_v3_print_ftypes(FILE * fp = stderr);

-bool ggml_common_quantize_0(
+bool ggml_v3_common_quantize_0(
 std::ifstream & finp,
 std::ofstream & fout,
-const ggml_ftype ftype,
+const ggml_v3_ftype ftype,
 const std::vector<std::string> & to_quant,
 const std::vector<std::string> & to_skip);

otherarch/tools/gpt2_quantize.cpp

@@ -1,4 +1,4 @@
-#include "utils.h"
+#include "otherarch/utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -22,7 +22,7 @@ struct gpt2_hparams {
 };

 // quantize a model
-bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -62,8 +62,8 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

-const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
+const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);

@@ -73,7 +73,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
 printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
 printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);

 fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));

@@ -120,7 +120,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 "model/h.*/mlp/c_proj/w",
 };

-if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
+if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -137,41 +137,41 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 int main(int argc, char ** argv) {
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-ggml_print_ftypes(stderr);
+ggml_v3_print_ftypes(stderr);
 return 1;
 }

 // needed to initialize f16 tables
 {
-struct ggml_init_params params = { 0, NULL, false };
-struct ggml_context * ctx = ggml_init(params);
-ggml_free(ctx);
+struct ggml_v3_init_params params = { 0, NULL, false };
+struct ggml_v3_context * ctx = ggml_v3_init(params);
+ggml_v3_free(ctx);
 }

 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
+const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);

-const int64_t t_main_start_us = ggml_time_us();
+const int64_t t_main_start_us = ggml_v3_time_us();

 int64_t t_quantize_us = 0;

 // load the model
 {
-const int64_t t_start_us = ggml_time_us();
+const int64_t t_start_us = ggml_v3_time_us();

-if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+if (!gpt2_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }

-t_quantize_us = ggml_time_us() - t_start_us;
+t_quantize_us = ggml_v3_time_us() - t_start_us;
 }

 // report timing
 {
-const int64_t t_main_end_us = ggml_time_us();
+const int64_t t_main_end_us = ggml_v3_time_us();

 printf("\n");
 printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);

otherarch/tools/gptj_quantize.cpp

@@ -1,6 +1,6 @@
 #include "ggml.h"

-#include "utils.h"
+#include "otherarch/utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -25,7 +25,7 @@ struct gptj_hparams {
 };

 // quantize a model
-bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -66,8 +66,8 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
 finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

-const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
+const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);

@@ -77,7 +77,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
 printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
 printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);

 fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));

@@ -120,7 +120,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 ".*weight",
 };

-if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
+if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -135,44 +135,44 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 // ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
-ggml_time_init();
+ggml_v3_time_init();
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-ggml_print_ftypes(stderr);
+ggml_v3_print_ftypes(stderr);
 return 1;
 }

 // needed to initialize f16 tables
 {
-struct ggml_init_params params = { 0, NULL, false };
-struct ggml_context * ctx = ggml_init(params);
-ggml_free(ctx);
+struct ggml_v3_init_params params = { 0, NULL, false };
+struct ggml_v3_context * ctx = ggml_v3_init(params);
+ggml_v3_free(ctx);
 }

 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
+const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);

-const int64_t t_main_start_us = ggml_time_us();
+const int64_t t_main_start_us = ggml_v3_time_us();

 int64_t t_quantize_us = 0;

 // load the model
 {
-const int64_t t_start_us = ggml_time_us();
+const int64_t t_start_us = ggml_v3_time_us();

-if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+if (!gptj_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }

-t_quantize_us = ggml_time_us() - t_start_us;
+t_quantize_us = ggml_v3_time_us() - t_start_us;
 }

 // report timing
 {
-const int64_t t_main_end_us = ggml_time_us();
+const int64_t t_main_end_us = ggml_v3_time_us();

 printf("\n");
 printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);

otherarch/tools/mpt_quantize.cpp

@@ -1,4 +1,4 @@
-#include "utils.h"
+#include "otherarch/utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -24,7 +24,7 @@ struct mpt_hparams {

 // quantize a model
 bool mpt_model_quantize(const std::string & fname_inp,
-const std::string & fname_out, ggml_ftype ftype) {
+const std::string & fname_out, ggml_v3_ftype ftype) {

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -68,8 +68,8 @@ bool mpt_model_quantize(const std::string & fname_inp,
 finp.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
 finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

-const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
+const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;

 printf("%s: d_model = %d\n", __func__, hparams.d_model);
 printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);

@@ -81,7 +81,7 @@ bool mpt_model_quantize(const std::string & fname_inp,
 printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
 printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
 printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);

 fout.write((char *) &hparams.d_model, sizeof(hparams.d_model));
 fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));

@@ -116,7 +116,7 @@ bool mpt_model_quantize(const std::string & fname_inp,
 ".*weight",
 };

-if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
+if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__,
 fname_inp.c_str());
 return false;

@@ -136,42 +136,42 @@ int main(int argc, char ** argv) {
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n",
 argv[0]);
-ggml_print_ftypes(stderr);
+ggml_v3_print_ftypes(stderr);
 return 1;
 }

 // needed to initialize f16 tables
 {
-struct ggml_init_params params = {0, NULL, false};
-struct ggml_context * ctx = ggml_init(params);
-ggml_free(ctx);
+struct ggml_v3_init_params params = {0, NULL, false};
+struct ggml_v3_context * ctx = ggml_v3_init(params);
+ggml_v3_free(ctx);
 }

 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
+const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);

-const int64_t t_main_start_us = ggml_time_us();
+const int64_t t_main_start_us = ggml_v3_time_us();

 int64_t t_quantize_us = 0;

 // load the model
 {
-const int64_t t_start_us = ggml_time_us();
+const int64_t t_start_us = ggml_v3_time_us();

-if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+if (!mpt_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n",
 __func__, fname_inp.c_str());
 return 1;
 }

-t_quantize_us = ggml_time_us() - t_start_us;
+t_quantize_us = ggml_v3_time_us() - t_start_us;
 }

 // report timing
 {
-const int64_t t_main_end_us = ggml_time_us();
+const int64_t t_main_end_us = ggml_v3_time_us();

 printf("\n");
 printf("%s: quantize time = %8.2f ms\n", __func__,

otherarch/tools/neox_quantize.cpp

@@ -1,6 +1,6 @@
 #include "ggml.h"

-#include "utils.h"
+#include "otherarch/utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -26,7 +26,7 @@ struct gpt_neox_hparams {
 };

 // quantize a model
-bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -68,8 +68,8 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
 finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
 finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

-const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
+const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);

@@ -80,7 +80,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
 printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
 printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
 printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);

 fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));

@@ -116,7 +116,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
 ".*weight",
 };

-if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
+if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -131,44 +131,44 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
 // ./gpt-neox-quantize models/stalellm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
-ggml_time_init();
+ggml_v3_time_init();
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-ggml_print_ftypes(stderr);
+ggml_v3_print_ftypes(stderr);
 return 1;
 }

 // needed to initialize f16 tables
 {
-struct ggml_init_params params = { 0, NULL, false };
-struct ggml_context * ctx = ggml_init(params);
-ggml_free(ctx);
+struct ggml_v3_init_params params = { 0, NULL, false };
+struct ggml_v3_context * ctx = ggml_v3_init(params);
+ggml_v3_free(ctx);
 }

 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
+const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);

-const int64_t t_main_start_us = ggml_time_us();
+const int64_t t_main_start_us = ggml_v3_time_us();

 int64_t t_quantize_us = 0;

 // load the model
 {
-const int64_t t_start_us = ggml_time_us();
+const int64_t t_start_us = ggml_v3_time_us();

-if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }

-t_quantize_us = ggml_time_us() - t_start_us;
+t_quantize_us = ggml_v3_time_us() - t_start_us;
 }

 // report timing
 {
-const int64_t t_main_end_us = ggml_time_us();
+const int64_t t_main_end_us = ggml_v3_time_us();

 printf("\n");
 printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);