fixed quant tools not compiling, updated docs

Concedo 2024-04-06 23:11:05 +08:00
parent 273d48ad96
commit 0061299cce
8 changed files with 228 additions and 132 deletions

Makefile

@@ -606,15 +606,15 @@ endif
# tools
quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_gptj: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
quantize_gptj: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_gpt2: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
quantize_gpt2: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_neox: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
quantize_neox: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_mpt: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
quantize_mpt: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_clip: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
quantize_clip: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
#windows simple clinfo

kcpp_docs.embd

@@ -151,6 +151,11 @@
"exclusiveMinimum": 0,
"type": "number"
},
"dynatemp_exponent": {
"default": 1,
"description": "Exponent used in dynatemp; the normalized entropy is raised to this power when scaling the temperature.",
"type": "number"
},
"mirostat": {
"description": "KoboldCpp ONLY. Sets the mirostat mode, 0=disabled, 1=mirostat_v1, 2=mirostat_v2",
"minimum": 0,
@@ -1004,6 +1009,98 @@
]
}
},
"/sdapi/v1/img2img": {
"post": {
"description": "Transforms an existing image into a new image, guided by a text prompt, and returns a base64-encoded PNG.",
"requestBody": {
"content": {
"application/json": {
"example": {
"prompt": "picture of a kobold, high quality HD render",
"negative_prompt": "ugly, deformed, censored",
"cfg_scale": 5,
"steps": 20,
"width": 512,
"height": 512,
"seed": -1,
"sampler_name": "Euler a",
"denoising_strength": 0.6,
"init_images": ["base64_image_data"]
},
"schema": {
"properties": {
"prompt": {
"type": "string"
},
"negative_prompt": {
"type": "string"
},
"cfg_scale": {
"type": "number"
},
"steps": {
"type": "number"
},
"width": {
"type": "number"
},
"height": {
"type": "number"
},
"seed": {
"type": "number"
},
"sampler_name": {
"type": "string"
},
"denoising_strength": {
"type": "number"
},
"init_images": {
"type": "array"
}
},
"type": "object"
}
}
},
"required": false
},
"responses": {
"200": {
"content": {
"application/json": {
"example":
{
"images":["base64_image_data"],"parameters":{},"info":""
},
"schema": {
"properties": {
"images": {
"type": "array",
"description": "An array of base64 strings, each containing an encoded PNG of a generated image."
},
"parameters": {
"type": "object",
"description": "Not used. Will be empty."
},
"info": {
"type": "string",
"description": "Not used. Will be empty."
}
}
}
}
},
"description": "Successful request"
}
},
"summary": "Transforms an existing image into a new image",
"tags": [
"sdapi/v1"
]
}
},
"/sdapi/v1/interrogate": {
"post": {
"description": "Generates a short text caption describing an image.",
@@ -1112,7 +1209,6 @@
};
</script>
<script>
//self destruct into json if requested
const urlParams = new URLSearchParams(window.location.search);
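
The new dynatemp_exponent field shapes how the dynamic-temperature range is traversed. A hedged sketch of the usual entropy-based formula (the formula is assumed from the common dynatemp implementation; this commit only documents the field):

#include <cmath>
#include <cstdio>

// Sketch: the normalized token entropy is raised to dynatemp_exponent before
// interpolating the temperature range. Exact formula assumed, not shown here.
int main() {
    const float temp_low = 0.5f, temp_high = 1.5f; // dynamic temperature range
    const float exponent = 1.0f;                   // dynatemp_exponent (default 1)
    const float norm_entropy = 0.25f;              // entropy / max entropy, in [0, 1]

    // higher exponents push mid-entropy cases towards temp_low
    const float temp = temp_low + (temp_high - temp_low) * powf(norm_entropy, exponent);
    printf("effective temperature = %.3f\n", temp); // 0.75 with these numbers
    return 0;
}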

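And a minimal client sketch for the new /sdapi/v1/img2img endpoint, assuming a local KoboldCpp instance on the default port 5001 and libcurl; "base64_image_data" stays a placeholder for a real base64-encoded source image:

#include <curl/curl.h>

int main() {
    // request body mirroring the example above; placeholder image data
    const char * body =
        "{\"prompt\": \"picture of a kobold, high quality HD render\","
        "\"negative_prompt\": \"ugly, deformed, censored\","
        "\"cfg_scale\": 5, \"steps\": 20, \"width\": 512, \"height\": 512,"
        "\"seed\": -1, \"sampler_name\": \"Euler a\","
        "\"denoising_strength\": 0.6,"
        "\"init_images\": [\"base64_image_data\"]}";

    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) return 1;

    struct curl_slist * hdrs = curl_slist_append(NULL, "Content-Type: application/json");
    curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:5001/sdapi/v1/img2img");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, hdrs);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body);

    // libcurl's default write callback dumps the JSON response
    // ({"images": [...], "parameters": {}, "info": ""}) to stdout
    CURLcode res = curl_easy_perform(curl);

    curl_slist_free_all(hdrs);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return res == CURLE_OK ? 0 : 1;
}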
otherarch/tools/common-ggml.cpp

@@ -3,63 +3,63 @@
#include <regex>
#include <map>
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
static const std::map<std::string, enum ggml_v3_ftype> GGML_V3_FTYPE_MAP = {
{"q4_0", GGML_V3_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_V3_FTYPE_MOSTLY_Q4_1},
{"q5_0", GGML_V3_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_V3_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_V3_FTYPE_MOSTLY_Q8_0},
};
void ggml_print_ftypes(FILE * fp) {
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
void ggml_v3_print_ftypes(FILE * fp) {
for (auto it = GGML_V3_FTYPE_MAP.begin(); it != GGML_V3_FTYPE_MAP.end(); it++) {
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
}
}
enum ggml_ftype ggml_parse_ftype(const char * str) {
enum ggml_ftype ftype;
enum ggml_v3_ftype ggml_v3_parse_ftype(const char * str) {
enum ggml_v3_ftype ftype;
if (str[0] == 'q') {
const auto it = GGML_FTYPE_MAP.find(str);
if (it == GGML_FTYPE_MAP.end()) {
const auto it = GGML_V3_FTYPE_MAP.find(str);
if (it == GGML_V3_FTYPE_MAP.end()) {
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
return GGML_FTYPE_UNKNOWN;
return GGML_V3_FTYPE_UNKNOWN;
}
ftype = it->second;
} else {
ftype = (enum ggml_ftype) atoi(str);
ftype = (enum ggml_v3_ftype) atoi(str);
}
return ftype;
}
bool ggml_common_quantize_0(
bool ggml_v3_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,
const ggml_ftype ftype,
const ggml_v3_ftype ftype,
const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip) {
ggml_type qtype = GGML_TYPE_F32;
ggml_v3_type qtype = GGML_V3_TYPE_F32;
switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_UNKNOWN:
case GGML_FTYPE_ALL_F32:
case GGML_FTYPE_MOSTLY_F16:
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
case GGML_V3_FTYPE_MOSTLY_Q4_0: qtype = GGML_V3_TYPE_Q4_0; break;
case GGML_V3_FTYPE_MOSTLY_Q4_1: qtype = GGML_V3_TYPE_Q4_1; break;
case GGML_V3_FTYPE_MOSTLY_Q5_0: qtype = GGML_V3_TYPE_Q5_0; break;
case GGML_V3_FTYPE_MOSTLY_Q5_1: qtype = GGML_V3_TYPE_Q5_1; break;
case GGML_V3_FTYPE_MOSTLY_Q8_0: qtype = GGML_V3_TYPE_Q8_0; break;
case GGML_V3_FTYPE_UNKNOWN:
case GGML_V3_FTYPE_ALL_F32:
case GGML_V3_FTYPE_MOSTLY_F16:
case GGML_V3_FTYPE_MOSTLY_Q4_1_SOME_F16:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
}
};
if (!ggml_is_quantized(qtype)) {
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
if (!ggml_v3_is_quantized(qtype)) {
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_v3_type_name(qtype));
return false;
}
@@ -69,7 +69,7 @@ bool ggml_common_quantize_0(
std::vector<float> work;
std::vector<uint8_t> data_u8;
std::vector<ggml_fp16_t> data_f16;
std::vector<ggml_v3_fp16_t> data_f16;
std::vector<float> data_f32;
std::vector<int64_t> hist_all(1 << 4, 0);
@@ -97,7 +97,7 @@ bool ggml_common_quantize_0(
std::string name(length, 0);
finp.read (&name[0], length);
printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_v3_type_name((ggml_v3_type) ttype));
bool quantize = false;
@@ -121,17 +121,17 @@ bool ggml_common_quantize_0(
quantize &= (n_dims == 2);
if (quantize) {
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
if (ttype != GGML_V3_TYPE_F32 && ttype != GGML_V3_TYPE_F16) {
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_v3_type_name((ggml_v3_type) ttype));
return false;
}
if (ttype == GGML_TYPE_F16) {
if (ttype == GGML_V3_TYPE_F16) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_v3_fp16_t));
data_f32.resize(nelements);
for (int i = 0; i < nelements; ++i) {
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
data_f32[i] = ggml_v3_fp16_to_fp32(data_f16[i]);
}
} else {
data_f32.resize(nelements);
@@ -160,36 +160,36 @@ bool ggml_common_quantize_0(
size_t cur_size = 0;
std::vector<int64_t> hist_cur(1 << 4, 0);
switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0:
switch ((ggml_v3_type) ttype) {
case GGML_V3_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
cur_size = ggml_v3_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_1:
case GGML_V3_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
cur_size = ggml_v3_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_0:
case GGML_V3_TYPE_Q5_0:
{
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
cur_size = ggml_v3_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_1:
case GGML_V3_TYPE_Q5_1:
{
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
cur_size = ggml_v3_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q8_0:
case GGML_V3_TYPE_Q8_0:
{
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
cur_size = ggml_v3_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_Q8_1:
case GGML_TYPE_COUNT:
case GGML_V3_TYPE_F32:
case GGML_V3_TYPE_F16:
case GGML_V3_TYPE_I8:
case GGML_V3_TYPE_I16:
case GGML_V3_TYPE_I32:
case GGML_V3_TYPE_Q8_1:
case GGML_V3_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_v3_type_name((ggml_v3_type) ttype));
return false;
}
}
@@ -216,7 +216,7 @@ bool ggml_common_quantize_0(
}
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_v3_type_name(qtype));
{
int64_t sum_all = 0;
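
The per-tensor entry points keep the old ggml signatures under the v3 prefix. A standalone sketch of one call, assuming ggml_v3.h (the header included above) declares ggml_v3_quantize_q8_0 as it is used in this file:

#include "ggml_v3.h"   // renamed v3 API, as included by common-ggml.h above
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int ne0 = 64, ne1 = 8;            // row length should be a multiple of the 32-wide Q8_0 block
    const int nelements = ne0 * ne1;
    std::vector<float>   data_f32(nelements, 0.5f); // source f32 tensor
    std::vector<float>   work(nelements);           // destination scratch, as in the loop above
    std::vector<int64_t> hist(1 << 4, 0);           // 16-bucket histogram of quantized values

    const size_t cur_size = ggml_v3_quantize_q8_0(data_f32.data(), work.data(),
                                                  nelements, ne0, hist.data());
    printf("%d floats -> %zu bytes (q8_0)\n", nelements, cur_size);
    return 0;
}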

otherarch/tools/common-ggml.h

@@ -1,18 +1,18 @@
#pragma once
#include "ggml.h"
#include "ggml_v3.h"
#include <fstream>
#include <vector>
#include <string>
enum ggml_ftype ggml_parse_ftype(const char * str);
enum ggml_v3_ftype ggml_v3_parse_ftype(const char * str);
void ggml_print_ftypes(FILE * fp = stderr);
void ggml_v3_print_ftypes(FILE * fp = stderr);
bool ggml_common_quantize_0(
bool ggml_v3_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,
const ggml_ftype ftype,
const ggml_v3_ftype ftype,
const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip);
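
A quick usage sketch for the renamed parsers; the numeric ids are assumed to keep the classic ggml_ftype numbering (e.g. MOSTLY_Q5_1 == 9):

#include "common-ggml.h"
#include <cstdio>

int main() {
    enum ggml_v3_ftype a = ggml_v3_parse_ftype("q5_1"); // looked up in GGML_V3_FTYPE_MAP
    enum ggml_v3_ftype b = ggml_v3_parse_ftype("9");    // non-'q' strings fall through to atoi()
    printf("a = %d, b = %d\n", a, b);                   // both 9 if the numbering assumption holds
    ggml_v3_print_ftypes(stderr);                       // lists the accepted "qX_Y" names
    return 0;
}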

otherarch/tools/gpt2_quantize.cpp

@@ -1,4 +1,4 @@
#include "utils.h"
#include "otherarch/utils.h"
#include "common-ggml.h"
#include <cassert>
@@ -22,7 +22,7 @@ struct gpt2_hparams {
};
// quantize a model
bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -62,8 +62,8 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -73,7 +73,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@@ -120,7 +120,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
"model/h.*/mlp/c_proj/w",
};
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false;
}
@@ -137,41 +137,41 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
int main(int argc, char ** argv) {
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
ggml_print_ftypes(stderr);
ggml_v3_print_ftypes(stderr);
return 1;
}
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
struct ggml_v3_init_params params = { 0, NULL, false };
struct ggml_v3_context * ctx = ggml_v3_init(params);
ggml_v3_free(ctx);
}
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
const int64_t t_main_start_us = ggml_v3_time_us();
int64_t t_quantize_us = 0;
// load the model
{
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_us = ggml_v3_time_us();
if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
if (!gpt2_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
t_quantize_us = ggml_time_us() - t_start_us;
t_quantize_us = ggml_v3_time_us() - t_start_us;
}
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
const int64_t t_main_end_us = ggml_v3_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
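
The ftype field written to the output header packs a quantization version next to the base ftype. A worked sketch, assuming the stock macro values GGML_V3_QNT_VERSION == 2 and GGML_V3_QNT_VERSION_FACTOR == 1000:

#include <cstdint>
#include <cstdio>

int main() {
    const int32_t qnt_version = 2, factor = 1000;  // stand-ins for the two macros
    const int32_t ftype       = 9;                 // e.g. MOSTLY_Q5_1

    const int32_t ftype_dst = qnt_version * factor + ftype; // 2009, stored in the file
    const int32_t qntvr     = ftype_dst / factor;           // 2, recovered on load
    const int32_t base      = ftype_dst % factor;           // 9, the underlying ftype

    printf("stored = %d, qntvr = %d, ftype = %d\n", ftype_dst, qntvr, base);
    return 0;
}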

otherarch/tools/gptj_quantize.cpp

@@ -1,6 +1,6 @@
#include "ggml.h"
#include "utils.h"
#include "otherarch/utils.h"
#include "common-ggml.h"
#include <cassert>
@@ -25,7 +25,7 @@ struct gptj_hparams {
};
// quantize a model
bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -66,8 +66,8 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -77,7 +77,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@@ -120,7 +120,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
".*weight",
};
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false;
}
@@ -135,44 +135,44 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
//
int main(int argc, char ** argv) {
ggml_time_init();
ggml_v3_time_init();
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
ggml_print_ftypes(stderr);
ggml_v3_print_ftypes(stderr);
return 1;
}
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
struct ggml_v3_init_params params = { 0, NULL, false };
struct ggml_v3_context * ctx = ggml_v3_init(params);
ggml_v3_free(ctx);
}
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
const int64_t t_main_start_us = ggml_v3_time_us();
int64_t t_quantize_us = 0;
// load the model
{
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_us = ggml_v3_time_us();
if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
if (!gptj_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
t_quantize_us = ggml_time_us() - t_start_us;
t_quantize_us = ggml_v3_time_us() - t_start_us;
}
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
const int64_t t_main_end_us = ggml_v3_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);

otherarch/tools/mpt_quantize.cpp

@@ -1,4 +1,4 @@
#include "utils.h"
#include "otherarch/utils.h"
#include "common-ggml.h"
#include <cassert>
@@ -24,7 +24,7 @@ struct mpt_hparams {
// quantize a model
bool mpt_model_quantize(const std::string & fname_inp,
const std::string & fname_out, ggml_ftype ftype) {
const std::string & fname_out, ggml_v3_ftype ftype) {
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -68,8 +68,8 @@ bool mpt_model_quantize(const std::string & fname_inp,
finp.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
printf("%s: d_model = %d\n", __func__, hparams.d_model);
printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
@@ -81,7 +81,7 @@ bool mpt_model_quantize(const std::string & fname_inp,
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
fout.write((char *) &hparams.d_model, sizeof(hparams.d_model));
fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
@@ -116,7 +116,7 @@ bool mpt_model_quantize(const std::string & fname_inp,
".*weight",
};
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__,
fname_inp.c_str());
return false;
@@ -136,42 +136,42 @@ int main(int argc, char ** argv) {
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n",
argv[0]);
ggml_print_ftypes(stderr);
ggml_v3_print_ftypes(stderr);
return 1;
}
// needed to initialize f16 tables
{
struct ggml_init_params params = {0, NULL, false};
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
struct ggml_v3_init_params params = {0, NULL, false};
struct ggml_v3_context * ctx = ggml_v3_init(params);
ggml_v3_free(ctx);
}
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
const int64_t t_main_start_us = ggml_v3_time_us();
int64_t t_quantize_us = 0;
// load the model
{
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_us = ggml_v3_time_us();
if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
if (!mpt_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n",
__func__, fname_inp.c_str());
return 1;
}
t_quantize_us = ggml_time_us() - t_start_us;
t_quantize_us = ggml_v3_time_us() - t_start_us;
}
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
const int64_t t_main_end_us = ggml_v3_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__,

otherarch/tools/neox_quantize.cpp

@@ -1,6 +1,6 @@
#include "ggml.h"
#include "utils.h"
#include "otherarch/utils.h"
#include "common-ggml.h"
#include <cassert>
@@ -26,7 +26,7 @@ struct gpt_neox_hparams {
};
// quantize a model
bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -68,8 +68,8 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -80,7 +80,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@@ -116,7 +116,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
".*weight",
};
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false;
}
@@ -131,44 +131,44 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
// ./gpt-neox-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
//
int main(int argc, char ** argv) {
ggml_time_init();
ggml_v3_time_init();
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
ggml_print_ftypes(stderr);
ggml_v3_print_ftypes(stderr);
return 1;
}
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
struct ggml_v3_init_params params = { 0, NULL, false };
struct ggml_v3_context * ctx = ggml_v3_init(params);
ggml_v3_free(ctx);
}
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
const int64_t t_main_start_us = ggml_v3_time_us();
int64_t t_quantize_us = 0;
// load the model
{
const int64_t t_start_us = ggml_time_us();
const int64_t t_start_us = ggml_v3_time_us();
if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
t_quantize_us = ggml_time_us() - t_start_us;
t_quantize_us = ggml_v3_time_us() - t_start_us;
}
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
const int64_t t_main_end_us = ggml_v3_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);