fixed quant tools not compiling, updated docs

This commit is contained in:
Concedo 2024-04-06 23:11:05 +08:00
parent 273d48ad96
commit 0061299cce
8 changed files with 228 additions and 132 deletions

View file

@ -606,15 +606,15 @@ endif
# tools # tools
quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_gptj: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp quantize_gptj: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_gpt2: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp quantize_gpt2: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_neox: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp quantize_neox: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_mpt: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp quantize_mpt: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_clip: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp quantize_clip: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
#window simple clinfo #window simple clinfo

View file

@ -151,6 +151,11 @@
"exclusiveMinimum": 0, "exclusiveMinimum": 0,
"type": "number" "type": "number"
}, },
"dynatemp_exponent": {
"default": 1,
"description": "Exponent used in dynatemp (dynamic temperature) sampling.",
"type": "number"
},
"mirostat": { "mirostat": {
"description": "KoboldCpp ONLY. Sets the mirostat mode, 0=disabled, 1=mirostat_v1, 2=mirostat_v2", "description": "KoboldCpp ONLY. Sets the mirostat mode, 0=disabled, 1=mirostat_v1, 2=mirostat_v2",
"minimum": 0, "minimum": 0,
@ -1004,6 +1009,98 @@
] ]
} }
}, },
"/sdapi/v1/img2img": {
"post": {
"description": "Transforms an existing image into a new image, guided by a text prompt, and returns a base64 encoded png.",
"requestBody": {
"content": {
"application/json": {
"example": {
"prompt": "picture of a kobold, high quality HD render",
"negative_prompt": "ugly, deformed, censored",
"cfg_scale": 5,
"steps": 20,
"width": 512,
"height": 512,
"seed": -1,
"sampler_name": "Euler a",
"denoising_strength": 0.6,
"init_images":["base64_image_data"],
},
"schema": {
"properties": {
"prompt": {
"type": "string"
},
"negative_prompt": {
"type": "string"
},
"cfg_scale": {
"type": "number"
},
"steps": {
"type": "number"
},
"width": {
"type": "number"
},
"height": {
"type": "number"
},
"seed": {
"type": "number"
},
"sampler_name": {
"type": "string"
},
"denoising_strength": {
"type": "number"
},
"init_images": {
"type": "array"
},
},
"type": "object"
}
}
},
"required": false
},
"responses": {
"200": {
"content": {
"application/json": {
"example":
{
"images":["base64_image_data"],"parameters":{},"info":""
},
"schema": {
"properties": {
"images": {
"type": "array",
"items": {
"type": "string"
},
"description": "An array of base64 strings, each containing the encoded PNG of a generated image."
},
"parameters": {
"type": "object",
"description": "Not used. Will be empty."
},
"info": {
"type": "string",
"description": "Not used. Will be empty."
}
}
}
}
},
"description": "Successful request"
}
},
"summary": "Transforms an existing image into a new image",
"tags": [
"sdapi/v1"
]
}
},
"/sdapi/v1/interrogate": { "/sdapi/v1/interrogate": {
"post": { "post": {
"description": "Generates a short text caption describing an image.", "description": "Generates a short text caption describing an image.",
@ -1112,7 +1209,6 @@
}; };
</script> </script>
<script> <script>
//self destruct into json if requested //self destruct into json if requested
const urlParams = new URLSearchParams(window.location.search); const urlParams = new URLSearchParams(window.location.search);

View file

@ -3,63 +3,63 @@
#include <regex> #include <regex>
#include <map> #include <map>
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = { static const std::map<std::string, enum ggml_v3_ftype> GGML_V3_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0}, {"q4_0", GGML_V3_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1}, {"q4_1", GGML_V3_FTYPE_MOSTLY_Q4_1},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, {"q5_0", GGML_V3_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, {"q5_1", GGML_V3_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, {"q8_0", GGML_V3_FTYPE_MOSTLY_Q8_0},
}; };
void ggml_print_ftypes(FILE * fp) { void ggml_v3_print_ftypes(FILE * fp) {
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) { for (auto it = GGML_V3_FTYPE_MAP.begin(); it != GGML_V3_FTYPE_MAP.end(); it++) {
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second); fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
} }
} }
enum ggml_ftype ggml_parse_ftype(const char * str) { enum ggml_v3_ftype ggml_v3_parse_ftype(const char * str) {
enum ggml_ftype ftype; enum ggml_v3_ftype ftype;
if (str[0] == 'q') { if (str[0] == 'q') {
const auto it = GGML_FTYPE_MAP.find(str); const auto it = GGML_V3_FTYPE_MAP.find(str);
if (it == GGML_FTYPE_MAP.end()) { if (it == GGML_V3_FTYPE_MAP.end()) {
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str); fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
return GGML_FTYPE_UNKNOWN; return GGML_V3_FTYPE_UNKNOWN;
} }
ftype = it->second; ftype = it->second;
} else { } else {
ftype = (enum ggml_ftype) atoi(str); ftype = (enum ggml_v3_ftype) atoi(str);
} }
return ftype; return ftype;
} }
bool ggml_common_quantize_0( bool ggml_v3_common_quantize_0(
std::ifstream & finp, std::ifstream & finp,
std::ofstream & fout, std::ofstream & fout,
const ggml_ftype ftype, const ggml_v3_ftype ftype,
const std::vector<std::string> & to_quant, const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip) { const std::vector<std::string> & to_skip) {
ggml_type qtype = GGML_TYPE_F32; ggml_v3_type qtype = GGML_V3_TYPE_F32;
switch (ftype) { switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; case GGML_V3_FTYPE_MOSTLY_Q4_0: qtype = GGML_V3_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; case GGML_V3_FTYPE_MOSTLY_Q4_1: qtype = GGML_V3_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; case GGML_V3_FTYPE_MOSTLY_Q5_0: qtype = GGML_V3_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; case GGML_V3_FTYPE_MOSTLY_Q5_1: qtype = GGML_V3_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; case GGML_V3_FTYPE_MOSTLY_Q8_0: qtype = GGML_V3_TYPE_Q8_0; break;
case GGML_FTYPE_UNKNOWN: case GGML_V3_FTYPE_UNKNOWN:
case GGML_FTYPE_ALL_F32: case GGML_V3_FTYPE_ALL_F32:
case GGML_FTYPE_MOSTLY_F16: case GGML_V3_FTYPE_MOSTLY_F16:
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: case GGML_V3_FTYPE_MOSTLY_Q4_1_SOME_F16:
{ {
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false; return false;
} }
}; };
if (!ggml_is_quantized(qtype)) { if (!ggml_v3_is_quantized(qtype)) {
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype)); fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_v3_type_name(qtype));
return false; return false;
} }
@ -69,7 +69,7 @@ bool ggml_common_quantize_0(
std::vector<float> work; std::vector<float> work;
std::vector<uint8_t> data_u8; std::vector<uint8_t> data_u8;
std::vector<ggml_fp16_t> data_f16; std::vector<ggml_v3_fp16_t> data_f16;
std::vector<float> data_f32; std::vector<float> data_f32;
std::vector<int64_t> hist_all(1 << 4, 0); std::vector<int64_t> hist_all(1 << 4, 0);
@ -97,7 +97,7 @@ bool ggml_common_quantize_0(
std::string name(length, 0); std::string name(length, 0);
finp.read (&name[0], length); finp.read (&name[0], length);
printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_v3_type_name((ggml_v3_type) ttype));
bool quantize = false; bool quantize = false;
@ -121,17 +121,17 @@ bool ggml_common_quantize_0(
quantize &= (n_dims == 2); quantize &= (n_dims == 2);
if (quantize) { if (quantize) {
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) { if (ttype != GGML_V3_TYPE_F32 && ttype != GGML_V3_TYPE_F16) {
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_v3_type_name((ggml_v3_type) ttype));
return false; return false;
} }
if (ttype == GGML_TYPE_F16) { if (ttype == GGML_V3_TYPE_F16) {
data_f16.resize(nelements); data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t)); finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_v3_fp16_t));
data_f32.resize(nelements); data_f32.resize(nelements);
for (int i = 0; i < nelements; ++i) { for (int i = 0; i < nelements; ++i) {
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); data_f32[i] = ggml_v3_fp16_to_fp32(data_f16[i]);
} }
} else { } else {
data_f32.resize(nelements); data_f32.resize(nelements);
@ -160,36 +160,36 @@ bool ggml_common_quantize_0(
size_t cur_size = 0; size_t cur_size = 0;
std::vector<int64_t> hist_cur(1 << 4, 0); std::vector<int64_t> hist_cur(1 << 4, 0);
switch ((ggml_type) ttype) { switch ((ggml_v3_type) ttype) {
case GGML_TYPE_Q4_0: case GGML_V3_TYPE_Q4_0:
{ {
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); cur_size = ggml_v3_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break; } break;
case GGML_TYPE_Q4_1: case GGML_V3_TYPE_Q4_1:
{ {
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); cur_size = ggml_v3_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break; } break;
case GGML_TYPE_Q5_0: case GGML_V3_TYPE_Q5_0:
{ {
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); cur_size = ggml_v3_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break; } break;
case GGML_TYPE_Q5_1: case GGML_V3_TYPE_Q5_1:
{ {
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); cur_size = ggml_v3_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break; } break;
case GGML_TYPE_Q8_0: case GGML_V3_TYPE_Q8_0:
{ {
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); cur_size = ggml_v3_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break; } break;
case GGML_TYPE_F32: case GGML_V3_TYPE_F32:
case GGML_TYPE_F16: case GGML_V3_TYPE_F16:
case GGML_TYPE_I8: case GGML_V3_TYPE_I8:
case GGML_TYPE_I16: case GGML_V3_TYPE_I16:
case GGML_TYPE_I32: case GGML_V3_TYPE_I32:
case GGML_TYPE_Q8_1: case GGML_V3_TYPE_Q8_1:
case GGML_TYPE_COUNT: case GGML_V3_TYPE_COUNT:
{ {
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_v3_type_name((ggml_v3_type) ttype));
return false; return false;
} }
} }
@ -216,7 +216,7 @@ bool ggml_common_quantize_0(
} }
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_v3_type_name(qtype));
{ {
int64_t sum_all = 0; int64_t sum_all = 0;

View file

@ -1,18 +1,18 @@
#pragma once #pragma once
#include "ggml.h" #include "ggml_v3.h"
#include <fstream> #include <fstream>
#include <vector> #include <vector>
#include <string> #include <string>
enum ggml_ftype ggml_parse_ftype(const char * str); enum ggml_v3_ftype ggml_v3_parse_ftype(const char * str);
void ggml_print_ftypes(FILE * fp = stderr); void ggml_v3_print_ftypes(FILE * fp = stderr);
bool ggml_common_quantize_0( bool ggml_v3_common_quantize_0(
std::ifstream & finp, std::ifstream & finp,
std::ofstream & fout, std::ofstream & fout,
const ggml_ftype ftype, const ggml_v3_ftype ftype,
const std::vector<std::string> & to_quant, const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip); const std::vector<std::string> & to_skip);

View file

@ -1,4 +1,4 @@
#include "utils.h" #include "otherarch/utils.h"
#include "common-ggml.h" #include "common-ggml.h"
#include <cassert> #include <cassert>
@ -22,7 +22,7 @@ struct gpt2_hparams {
}; };
// quantize a model // quantize a model
bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
gpt_vocab vocab; gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@ -62,8 +62,8 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@ -73,7 +73,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@ -120,7 +120,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
"model/h.*/mlp/c_proj/w", "model/h.*/mlp/c_proj/w",
}; };
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false; return false;
} }
@ -137,41 +137,41 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
if (argc != 4) { if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
ggml_print_ftypes(stderr); ggml_v3_print_ftypes(stderr);
return 1; return 1;
} }
// needed to initialize f16 tables // needed to initialize f16 tables
{ {
struct ggml_init_params params = { 0, NULL, false }; struct ggml_v3_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params); struct ggml_v3_context * ctx = ggml_v3_init(params);
ggml_free(ctx); ggml_v3_free(ctx);
} }
const std::string fname_inp = argv[1]; const std::string fname_inp = argv[1];
const std::string fname_out = argv[2]; const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]); const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us(); const int64_t t_main_start_us = ggml_v3_time_us();
int64_t t_quantize_us = 0; int64_t t_quantize_us = 0;
// load the model // load the model
{ {
const int64_t t_start_us = ggml_time_us(); const int64_t t_start_us = ggml_v3_time_us();
if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { if (!gpt2_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1; return 1;
} }
t_quantize_us = ggml_time_us() - t_start_us; t_quantize_us = ggml_v3_time_us() - t_start_us;
} }
// report timing // report timing
{ {
const int64_t t_main_end_us = ggml_time_us(); const int64_t t_main_end_us = ggml_v3_time_us();
printf("\n"); printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);

View file

@ -1,6 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "utils.h" #include "otherarch/utils.h"
#include "common-ggml.h" #include "common-ggml.h"
#include <cassert> #include <cassert>
@ -25,7 +25,7 @@ struct gptj_hparams {
}; };
// quantize a model // quantize a model
bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
gpt_vocab vocab; gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@ -66,8 +66,8 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@ -77,7 +77,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@ -120,7 +120,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
".*weight", ".*weight",
}; };
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false; return false;
} }
@ -135,44 +135,44 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type // ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
// //
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
ggml_time_init(); ggml_v3_time_init();
if (argc != 4) { if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
ggml_print_ftypes(stderr); ggml_v3_print_ftypes(stderr);
return 1; return 1;
} }
// needed to initialize f16 tables // needed to initialize f16 tables
{ {
struct ggml_init_params params = { 0, NULL, false }; struct ggml_v3_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params); struct ggml_v3_context * ctx = ggml_v3_init(params);
ggml_free(ctx); ggml_v3_free(ctx);
} }
const std::string fname_inp = argv[1]; const std::string fname_inp = argv[1];
const std::string fname_out = argv[2]; const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]); const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us(); const int64_t t_main_start_us = ggml_v3_time_us();
int64_t t_quantize_us = 0; int64_t t_quantize_us = 0;
// load the model // load the model
{ {
const int64_t t_start_us = ggml_time_us(); const int64_t t_start_us = ggml_v3_time_us();
if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { if (!gptj_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1; return 1;
} }
t_quantize_us = ggml_time_us() - t_start_us; t_quantize_us = ggml_v3_time_us() - t_start_us;
} }
// report timing // report timing
{ {
const int64_t t_main_end_us = ggml_time_us(); const int64_t t_main_end_us = ggml_v3_time_us();
printf("\n"); printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);

View file

@ -1,4 +1,4 @@
#include "utils.h" #include "otherarch/utils.h"
#include "common-ggml.h" #include "common-ggml.h"
#include <cassert> #include <cassert>
@ -24,7 +24,7 @@ struct mpt_hparams {
// quantize a model // quantize a model
bool mpt_model_quantize(const std::string & fname_inp, bool mpt_model_quantize(const std::string & fname_inp,
const std::string & fname_out, ggml_ftype ftype) { const std::string & fname_out, ggml_v3_ftype ftype) {
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@ -68,8 +68,8 @@ bool mpt_model_quantize(const std::string & fname_inp,
finp.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv)); finp.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
printf("%s: d_model = %d\n", __func__, hparams.d_model); printf("%s: d_model = %d\n", __func__, hparams.d_model);
printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
@ -81,7 +81,7 @@ bool mpt_model_quantize(const std::string & fname_inp,
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
fout.write((char *) &hparams.d_model, sizeof(hparams.d_model)); fout.write((char *) &hparams.d_model, sizeof(hparams.d_model));
fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
@ -116,7 +116,7 @@ bool mpt_model_quantize(const std::string & fname_inp,
".*weight", ".*weight",
}; };
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__,
fname_inp.c_str()); fname_inp.c_str());
return false; return false;
@ -136,42 +136,42 @@ int main(int argc, char ** argv) {
if (argc != 4) { if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n",
argv[0]); argv[0]);
ggml_print_ftypes(stderr); ggml_v3_print_ftypes(stderr);
return 1; return 1;
} }
// needed to initialize f16 tables // needed to initialize f16 tables
{ {
struct ggml_init_params params = {0, NULL, false}; struct ggml_v3_init_params params = {0, NULL, false};
struct ggml_context * ctx = ggml_init(params); struct ggml_v3_context * ctx = ggml_v3_init(params);
ggml_free(ctx); ggml_v3_free(ctx);
} }
const std::string fname_inp = argv[1]; const std::string fname_inp = argv[1];
const std::string fname_out = argv[2]; const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]); const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us(); const int64_t t_main_start_us = ggml_v3_time_us();
int64_t t_quantize_us = 0; int64_t t_quantize_us = 0;
// load the model // load the model
{ {
const int64_t t_start_us = ggml_time_us(); const int64_t t_start_us = ggml_v3_time_us();
if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { if (!mpt_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", fprintf(stderr, "%s: failed to quantize model from '%s'\n",
__func__, fname_inp.c_str()); __func__, fname_inp.c_str());
return 1; return 1;
} }
t_quantize_us = ggml_time_us() - t_start_us; t_quantize_us = ggml_v3_time_us() - t_start_us;
} }
// report timing // report timing
{ {
const int64_t t_main_end_us = ggml_time_us(); const int64_t t_main_end_us = ggml_v3_time_us();
printf("\n"); printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, printf("%s: quantize time = %8.2f ms\n", __func__,

View file

@ -1,6 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "utils.h" #include "otherarch/utils.h"
#include "common-ggml.h" #include "common-ggml.h"
#include <cassert> #include <cassert>
@ -26,7 +26,7 @@ struct gpt_neox_hparams {
}; };
// quantize a model // quantize a model
bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
gpt_vocab vocab; gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@ -68,8 +68,8 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
finp.read((char *) &hparams.par_res, sizeof(hparams.par_res)); finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@ -80,7 +80,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@ -116,7 +116,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
".*weight", ".*weight",
}; };
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false; return false;
} }
@ -131,44 +131,44 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
// ./gpt-neox-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type // ./gpt-neox-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
// //
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
ggml_time_init(); ggml_v3_time_init();
if (argc != 4) { if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
ggml_print_ftypes(stderr); ggml_v3_print_ftypes(stderr);
return 1; return 1;
} }
// needed to initialize f16 tables // needed to initialize f16 tables
{ {
struct ggml_init_params params = { 0, NULL, false }; struct ggml_v3_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params); struct ggml_v3_context * ctx = ggml_v3_init(params);
ggml_free(ctx); ggml_v3_free(ctx);
} }
const std::string fname_inp = argv[1]; const std::string fname_inp = argv[1];
const std::string fname_out = argv[2]; const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]); const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us(); const int64_t t_main_start_us = ggml_v3_time_us();
int64_t t_quantize_us = 0; int64_t t_quantize_us = 0;
// load the model // load the model
{ {
const int64_t t_start_us = ggml_time_us(); const int64_t t_start_us = ggml_v3_time_us();
if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1; return 1;
} }
t_quantize_us = ggml_time_us() - t_start_us; t_quantize_us = ggml_v3_time_us() - t_start_us;
} }
// report timing // report timing
{ {
const int64_t t_main_end_us = ggml_time_us(); const int64_t t_main_end_us = ggml_v3_time_us();
printf("\n"); printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);