Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 17:44:38 +00:00)
fixed quant tools not compiling, updated docs
Commit 0061299cce (parent 273d48ad96)
8 changed files with 228 additions and 132 deletions
Makefile (10 lines changed)

@@ -606,15 +606,15 @@ endif
 # tools
 quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gptj: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gptj: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gpt2: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gpt2: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_neox: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_neox: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_mpt: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_mpt: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_clip: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
+quantize_clip: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

 #window simple clinfo

Embedded API documentation (OpenAPI JSON)

@@ -151,6 +151,11 @@
 "exclusiveMinimum": 0,
 "type": "number"
 },
+"dynatemp_exponent": {
+"default": 1,
+"description": "Exponent used in dynatemp.",
+"type": "number"
+},
 "mirostat": {
 "description": "KoboldCpp ONLY. Sets the mirostat mode, 0=disabled, 1=mirostat_v1, 2=mirostat_v2",
 "minimum": 0,
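For reference, the new dynatemp_exponent field sits alongside the other sampler settings in the generation request schema. A minimal sketch of passing it to a running KoboldCpp instance; the URL, port, and other payload values here are illustrative assumptions, not part of this commit:

import requests

# Assumed local KoboldCpp instance; adjust host/port as needed.
GENERATE_URL = "http://localhost:5001/api/v1/generate"

payload = {
    "prompt": "Once upon a time,",
    "max_length": 64,
    # Field documented in this commit: exponent used in dynatemp (default 1).
    "dynatemp_exponent": 1.5,
}

resp = requests.post(GENERATE_URL, json=payload)
print(resp.json())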
@@ -1004,6 +1009,98 @@
 ]
 }
 },
+"/sdapi/v1/img2img": {
+"post": {
+"description": "Transforms an existing image into a new image, guided by a text prompt, and returns a base64 encoded png.",
+"requestBody": {
+"content": {
+"application/json": {
+"example": {
+"prompt": "picture of a kobold, high quality HD render",
+"negative_prompt": "ugly, deformed, censored",
+"cfg_scale": 5,
+"steps": 20,
+"width": 512,
+"height": 512,
+"seed": -1,
+"sampler_name": "Euler a",
+"denoising_strength": 0.6,
+"init_images":["base64_image_data"],
+},
+"schema": {
+"properties": {
+"prompt": {
+"type": "string"
+},
+"negative_prompt": {
+"type": "string"
+},
+"cfg_scale": {
+"type": "number"
+},
+"steps": {
+"type": "number"
+},
+"width": {
+"type": "number"
+},
+"height": {
+"type": "number"
+},
+"seed": {
+"type": "number"
+},
+"sampler_name": {
+"type": "string"
+},
+"denoising_strength": {
+"type": "number"
+},
+"init_images": {
+"type": "array"
+},
+},
+"type": "object"
+}
+}
+},
+"required": false
+},
+"responses": {
+"200": {
+"content": {
+"application/json": {
+"example":
+{
+"images":["base64_image_data"],"parameters":{},"info":""
+},
+"schema": {
+"properties": {
+"images": {
+"type": "string",
+"description": "A base64 string containing the encoded PNG of the generated image."
+},
+"parameters": {
+"type": "object",
+"description": "Not used. Will be empty."
+},
+"info": {
+"type": "string",
+"description": "Not used. Will be empty."
+}
+}
+}
+}
+},
+"description": "Successful request"
+}
+},
+"summary": "Transforms an existing image into a new image",
+"tags": [
+"sdapi/v1"
+]
+}
+},
 "/sdapi/v1/interrogate": {
 "post": {
 "description": "Generates a short text caption describing an image.",
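The schema above documents the full img2img request and response. A minimal client sketch, assuming a KoboldCpp instance with image generation enabled on the default local port; the URL and file names are illustrative:

import base64
import requests

# Assumed local KoboldCpp instance; adjust host/port as needed.
IMG2IMG_URL = "http://localhost:5001/sdapi/v1/img2img"

# The source image goes into init_images as a base64 string.
with open("input.png", "rb") as f:
    init_image = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "prompt": "picture of a kobold, high quality HD render",
    "negative_prompt": "ugly, deformed, censored",
    "cfg_scale": 5,
    "steps": 20,
    "width": 512,
    "height": 512,
    "seed": -1,
    "sampler_name": "Euler a",
    "denoising_strength": 0.6,
    "init_images": [init_image],
}

resp = requests.post(IMG2IMG_URL, json=payload)
resp.raise_for_status()

# Per the response schema, "images" holds base64-encoded PNG data.
result = resp.json()["images"][0]
with open("output.png", "wb") as f:
    f.write(base64.b64decode(result))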
@@ -1112,7 +1209,6 @@
 };
 </script>

-
 <script>
 //self destruct into json if requested
 const urlParams = new URLSearchParams(window.location.search);

otherarch/tools/common-ggml.cpp

@@ -3,63 +3,63 @@
 #include <regex>
 #include <map>

-static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
-{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
-{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
-{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
-{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
-{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+static const std::map<std::string, enum ggml_v3_ftype> GGML_V3_FTYPE_MAP = {
+{"q4_0", GGML_V3_FTYPE_MOSTLY_Q4_0},
+{"q4_1", GGML_V3_FTYPE_MOSTLY_Q4_1},
+{"q5_0", GGML_V3_FTYPE_MOSTLY_Q5_0},
+{"q5_1", GGML_V3_FTYPE_MOSTLY_Q5_1},
+{"q8_0", GGML_V3_FTYPE_MOSTLY_Q8_0},
 };

-void ggml_print_ftypes(FILE * fp) {
-for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
+void ggml_v3_print_ftypes(FILE * fp) {
+for (auto it = GGML_V3_FTYPE_MAP.begin(); it != GGML_V3_FTYPE_MAP.end(); it++) {
 fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
 }
 }

-enum ggml_ftype ggml_parse_ftype(const char * str) {
-enum ggml_ftype ftype;
+enum ggml_v3_ftype ggml_v3_parse_ftype(const char * str) {
+enum ggml_v3_ftype ftype;
 if (str[0] == 'q') {
-const auto it = GGML_FTYPE_MAP.find(str);
-if (it == GGML_FTYPE_MAP.end()) {
+const auto it = GGML_V3_FTYPE_MAP.find(str);
+if (it == GGML_V3_FTYPE_MAP.end()) {
 fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
-return GGML_FTYPE_UNKNOWN;
+return GGML_V3_FTYPE_UNKNOWN;
 }
 ftype = it->second;
 } else {
-ftype = (enum ggml_ftype) atoi(str);
+ftype = (enum ggml_v3_ftype) atoi(str);
 }

 return ftype;
 }

-bool ggml_common_quantize_0(
+bool ggml_v3_common_quantize_0(
 std::ifstream & finp,
 std::ofstream & fout,
-const ggml_ftype ftype,
+const ggml_v3_ftype ftype,
 const std::vector<std::string> & to_quant,
 const std::vector<std::string> & to_skip) {

-ggml_type qtype = GGML_TYPE_F32;
+ggml_v3_type qtype = GGML_V3_TYPE_F32;

 switch (ftype) {
-case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
-case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
-case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
-case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
-case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
-case GGML_FTYPE_UNKNOWN:
-case GGML_FTYPE_ALL_F32:
-case GGML_FTYPE_MOSTLY_F16:
-case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+case GGML_V3_FTYPE_MOSTLY_Q4_0: qtype = GGML_V3_TYPE_Q4_0; break;
+case GGML_V3_FTYPE_MOSTLY_Q4_1: qtype = GGML_V3_TYPE_Q4_1; break;
+case GGML_V3_FTYPE_MOSTLY_Q5_0: qtype = GGML_V3_TYPE_Q5_0; break;
+case GGML_V3_FTYPE_MOSTLY_Q5_1: qtype = GGML_V3_TYPE_Q5_1; break;
+case GGML_V3_FTYPE_MOSTLY_Q8_0: qtype = GGML_V3_TYPE_Q8_0; break;
+case GGML_V3_FTYPE_UNKNOWN:
+case GGML_V3_FTYPE_ALL_F32:
+case GGML_V3_FTYPE_MOSTLY_F16:
+case GGML_V3_FTYPE_MOSTLY_Q4_1_SOME_F16:
 {
 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
 return false;
 }
 };

-if (!ggml_is_quantized(qtype)) {
-fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
+if (!ggml_v3_is_quantized(qtype)) {
+fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_v3_type_name(qtype));
 return false;
 }

@@ -69,7 +69,7 @@ bool ggml_common_quantize_0(
 std::vector<float> work;

 std::vector<uint8_t> data_u8;
-std::vector<ggml_fp16_t> data_f16;
+std::vector<ggml_v3_fp16_t> data_f16;
 std::vector<float> data_f32;

 std::vector<int64_t> hist_all(1 << 4, 0);

@@ -97,7 +97,7 @@ bool ggml_common_quantize_0(
 std::string name(length, 0);
 finp.read (&name[0], length);

-printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
+printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_v3_type_name((ggml_v3_type) ttype));

 bool quantize = false;

@@ -121,17 +121,17 @@ bool ggml_common_quantize_0(
 quantize &= (n_dims == 2);

 if (quantize) {
-if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
-fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+if (ttype != GGML_V3_TYPE_F32 && ttype != GGML_V3_TYPE_F16) {
+fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_v3_type_name((ggml_v3_type) ttype));
 return false;
 }

-if (ttype == GGML_TYPE_F16) {
+if (ttype == GGML_V3_TYPE_F16) {
 data_f16.resize(nelements);
-finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_v3_fp16_t));
 data_f32.resize(nelements);
 for (int i = 0; i < nelements; ++i) {
-data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+data_f32[i] = ggml_v3_fp16_to_fp32(data_f16[i]);
 }
 } else {
 data_f32.resize(nelements);

@@ -160,36 +160,36 @@ bool ggml_common_quantize_0(
 size_t cur_size = 0;
 std::vector<int64_t> hist_cur(1 << 4, 0);

-switch ((ggml_type) ttype) {
-case GGML_TYPE_Q4_0:
+switch ((ggml_v3_type) ttype) {
+case GGML_V3_TYPE_Q4_0:
 {
-cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+cur_size = ggml_v3_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-case GGML_TYPE_Q4_1:
+case GGML_V3_TYPE_Q4_1:
 {
-cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+cur_size = ggml_v3_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-case GGML_TYPE_Q5_0:
+case GGML_V3_TYPE_Q5_0:
 {
-cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+cur_size = ggml_v3_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-case GGML_TYPE_Q5_1:
+case GGML_V3_TYPE_Q5_1:
 {
-cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+cur_size = ggml_v3_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-case GGML_TYPE_Q8_0:
+case GGML_V3_TYPE_Q8_0:
 {
-cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+cur_size = ggml_v3_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-case GGML_TYPE_F32:
-case GGML_TYPE_F16:
-case GGML_TYPE_I8:
-case GGML_TYPE_I16:
-case GGML_TYPE_I32:
-case GGML_TYPE_Q8_1:
-case GGML_TYPE_COUNT:
+case GGML_V3_TYPE_F32:
+case GGML_V3_TYPE_F16:
+case GGML_V3_TYPE_I8:
+case GGML_V3_TYPE_I16:
+case GGML_V3_TYPE_I32:
+case GGML_V3_TYPE_Q8_1:
+case GGML_V3_TYPE_COUNT:
 {
-fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_v3_type_name((ggml_v3_type) ttype));
 return false;
 }
 }

@@ -216,7 +216,7 @@ bool ggml_common_quantize_0(
 }

 printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
+printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_v3_type_name(qtype));

 {
 int64_t sum_all = 0;

otherarch/tools/common-ggml.h

@@ -1,18 +1,18 @@
 #pragma once

-#include "ggml.h"
+#include "ggml_v3.h"

 #include <fstream>
 #include <vector>
 #include <string>

-enum ggml_ftype ggml_parse_ftype(const char * str);
+enum ggml_v3_ftype ggml_v3_parse_ftype(const char * str);

-void ggml_print_ftypes(FILE * fp = stderr);
+void ggml_v3_print_ftypes(FILE * fp = stderr);

-bool ggml_common_quantize_0(
+bool ggml_v3_common_quantize_0(
 std::ifstream & finp,
 std::ofstream & fout,
-const ggml_ftype ftype,
+const ggml_v3_ftype ftype,
 const std::vector<std::string> & to_quant,
 const std::vector<std::string> & to_skip);

otherarch/tools/gpt2_quantize.cpp

@@ -1,4 +1,4 @@
-#include "utils.h"
+#include "otherarch/utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -22,7 +22,7 @@ struct gpt2_hparams {
 };

 // quantize a model
-bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -62,8 +62,8 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

-const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
+const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);

@@ -73,7 +73,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
 printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
 printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);

 fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));

@@ -120,7 +120,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 "model/h.*/mlp/c_proj/w",
 };

-if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
+if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -137,41 +137,41 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 int main(int argc, char ** argv) {
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-ggml_print_ftypes(stderr);
+ggml_v3_print_ftypes(stderr);
 return 1;
 }

 // needed to initialize f16 tables
 {
-struct ggml_init_params params = { 0, NULL, false };
-struct ggml_context * ctx = ggml_init(params);
-ggml_free(ctx);
+struct ggml_v3_init_params params = { 0, NULL, false };
+struct ggml_v3_context * ctx = ggml_v3_init(params);
+ggml_v3_free(ctx);
 }

 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
+const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);

-const int64_t t_main_start_us = ggml_time_us();
+const int64_t t_main_start_us = ggml_v3_time_us();

 int64_t t_quantize_us = 0;

 // load the model
 {
-const int64_t t_start_us = ggml_time_us();
+const int64_t t_start_us = ggml_v3_time_us();

-if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+if (!gpt2_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }

-t_quantize_us = ggml_time_us() - t_start_us;
+t_quantize_us = ggml_v3_time_us() - t_start_us;
 }

 // report timing
 {
-const int64_t t_main_end_us = ggml_time_us();
+const int64_t t_main_end_us = ggml_v3_time_us();

 printf("\n");
 printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);

otherarch/tools/gptj_quantize.cpp

@@ -1,6 +1,6 @@
 #include "ggml.h"

-#include "utils.h"
+#include "otherarch/utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -25,7 +25,7 @@ struct gptj_hparams {
 };

 // quantize a model
-bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -66,8 +66,8 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
 finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

-const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
+const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);

@@ -77,7 +77,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
 printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
 printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);

 fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));

@@ -120,7 +120,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 ".*weight",
 };

-if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
+if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -135,44 +135,44 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 // ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
-ggml_time_init();
+ggml_v3_time_init();
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-ggml_print_ftypes(stderr);
+ggml_v3_print_ftypes(stderr);
 return 1;
 }

 // needed to initialize f16 tables
 {
-struct ggml_init_params params = { 0, NULL, false };
-struct ggml_context * ctx = ggml_init(params);
-ggml_free(ctx);
+struct ggml_v3_init_params params = { 0, NULL, false };
+struct ggml_v3_context * ctx = ggml_v3_init(params);
+ggml_v3_free(ctx);
 }

 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
+const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);

-const int64_t t_main_start_us = ggml_time_us();
+const int64_t t_main_start_us = ggml_v3_time_us();

 int64_t t_quantize_us = 0;

 // load the model
 {
-const int64_t t_start_us = ggml_time_us();
+const int64_t t_start_us = ggml_v3_time_us();

-if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+if (!gptj_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }

-t_quantize_us = ggml_time_us() - t_start_us;
+t_quantize_us = ggml_v3_time_us() - t_start_us;
 }

 // report timing
 {
-const int64_t t_main_end_us = ggml_time_us();
+const int64_t t_main_end_us = ggml_v3_time_us();

 printf("\n");
 printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);

otherarch/tools/mpt_quantize.cpp

@@ -1,4 +1,4 @@
-#include "utils.h"
+#include "otherarch/utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -24,7 +24,7 @@ struct mpt_hparams {

 // quantize a model
 bool mpt_model_quantize(const std::string & fname_inp,
-const std::string & fname_out, ggml_ftype ftype) {
+const std::string & fname_out, ggml_v3_ftype ftype) {

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -68,8 +68,8 @@ bool mpt_model_quantize(const std::string & fname_inp,
 finp.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
 finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

-const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
+const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;

 printf("%s: d_model = %d\n", __func__, hparams.d_model);
 printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);

@@ -81,7 +81,7 @@ bool mpt_model_quantize(const std::string & fname_inp,
 printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
 printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
 printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);

 fout.write((char *) &hparams.d_model, sizeof(hparams.d_model));
 fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));

@@ -116,7 +116,7 @@ bool mpt_model_quantize(const std::string & fname_inp,
 ".*weight",
 };

-if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
+if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__,
 fname_inp.c_str());
 return false;

@@ -136,42 +136,42 @@ int main(int argc, char ** argv) {
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n",
 argv[0]);
-ggml_print_ftypes(stderr);
+ggml_v3_print_ftypes(stderr);
 return 1;
 }

 // needed to initialize f16 tables
 {
-struct ggml_init_params params = {0, NULL, false};
-struct ggml_context * ctx = ggml_init(params);
-ggml_free(ctx);
+struct ggml_v3_init_params params = {0, NULL, false};
+struct ggml_v3_context * ctx = ggml_v3_init(params);
+ggml_v3_free(ctx);
 }

 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
+const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);

-const int64_t t_main_start_us = ggml_time_us();
+const int64_t t_main_start_us = ggml_v3_time_us();

 int64_t t_quantize_us = 0;

 // load the model
 {
-const int64_t t_start_us = ggml_time_us();
+const int64_t t_start_us = ggml_v3_time_us();

-if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+if (!mpt_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n",
 __func__, fname_inp.c_str());
 return 1;
 }

-t_quantize_us = ggml_time_us() - t_start_us;
+t_quantize_us = ggml_v3_time_us() - t_start_us;
 }

 // report timing
 {
-const int64_t t_main_end_us = ggml_time_us();
+const int64_t t_main_end_us = ggml_v3_time_us();

 printf("\n");
 printf("%s: quantize time = %8.2f ms\n", __func__,

otherarch/tools/neox_quantize.cpp

@@ -1,6 +1,6 @@
 #include "ggml.h"

-#include "utils.h"
+#include "otherarch/utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -26,7 +26,7 @@ struct gpt_neox_hparams {
 };

 // quantize a model
-bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -68,8 +68,8 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
 finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
 finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

-const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
+const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);

@@ -80,7 +80,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
 printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
 printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
 printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);

 fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));

@@ -116,7 +116,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
 ".*weight",
 };

-if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
+if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -131,44 +131,44 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
 // ./gpt-neox-quantize models/stalellm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
-ggml_time_init();
+ggml_v3_time_init();
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-ggml_print_ftypes(stderr);
+ggml_v3_print_ftypes(stderr);
 return 1;
 }

 // needed to initialize f16 tables
 {
-struct ggml_init_params params = { 0, NULL, false };
-struct ggml_context * ctx = ggml_init(params);
-ggml_free(ctx);
+struct ggml_v3_init_params params = { 0, NULL, false };
+struct ggml_v3_context * ctx = ggml_v3_init(params);
+ggml_v3_free(ctx);
 }

 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
+const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);

-const int64_t t_main_start_us = ggml_time_us();
+const int64_t t_main_start_us = ggml_v3_time_us();

 int64_t t_quantize_us = 0;

 // load the model
 {
-const int64_t t_start_us = ggml_time_us();
+const int64_t t_start_us = ggml_v3_time_us();

-if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }

-t_quantize_us = ggml_time_us() - t_start_us;
+t_quantize_us = ggml_v3_time_us() - t_start_us;
 }

 // report timing
 {
-const int64_t t_main_end_us = ggml_time_us();
+const int64_t t_main_end_us = ggml_v3_time_us();

 printf("\n");
 printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);