mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 17:44:38 +00:00
fixed quant tools not compiling, updated docs
This commit is contained in:
parent
273d48ad96
commit
0061299cce
8 changed files with 228 additions and 132 deletions
10
Makefile
10
Makefile
|
@ -606,15 +606,15 @@ endif
|
|||
# tools
|
||||
quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
quantize_gptj: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
|
||||
quantize_gptj: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
quantize_gpt2: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
|
||||
quantize_gpt2: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
quantize_neox: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
|
||||
quantize_neox: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
quantize_mpt: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
|
||||
quantize_mpt: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
quantize_clip: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
|
||||
quantize_clip: ggml_v3.o ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
#window simple clinfo
|
||||
|
|
|
@ -151,6 +151,11 @@
|
|||
"exclusiveMinimum": 0,
|
||||
"type": "number"
|
||||
},
|
||||
"dynatemp_exponent": {
|
||||
"default": 1,
|
||||
"description": "Exponent used in dynatemp.",
|
||||
"type": "number"
|
||||
},
|
||||
"mirostat": {
|
||||
"description": "KoboldCpp ONLY. Sets the mirostat mode, 0=disabled, 1=mirostat_v1, 2=mirostat_v2",
|
||||
"minimum": 0,
|
||||
|
@ -1004,6 +1009,98 @@
|
|||
]
|
||||
}
|
||||
},
|
||||
"/sdapi/v1/img2img": {
|
||||
"post": {
|
||||
"description": "Transforms an existing image into a new image, guided by a text prompt, and returns a base64 encoded png.",
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"example": {
|
||||
"prompt": "picture of a kobold, high quality HD render",
|
||||
"negative_prompt": "ugly, deformed, censored",
|
||||
"cfg_scale": 5,
|
||||
"steps": 20,
|
||||
"width": 512,
|
||||
"height": 512,
|
||||
"seed": -1,
|
||||
"sampler_name": "Euler a",
|
||||
"denoising_strength": 0.6,
|
||||
"init_images":["base64_image_data"],
|
||||
},
|
||||
"schema": {
|
||||
"properties": {
|
||||
"prompt": {
|
||||
"type": "string"
|
||||
},
|
||||
"negative_prompt": {
|
||||
"type": "string"
|
||||
},
|
||||
"cfg_scale": {
|
||||
"type": "number"
|
||||
},
|
||||
"steps": {
|
||||
"type": "number"
|
||||
},
|
||||
"width": {
|
||||
"type": "number"
|
||||
},
|
||||
"height": {
|
||||
"type": "number"
|
||||
},
|
||||
"seed": {
|
||||
"type": "number"
|
||||
},
|
||||
"sampler_name": {
|
||||
"type": "string"
|
||||
},
|
||||
"denoising_strength": {
|
||||
"type": "number"
|
||||
},
|
||||
"init_images": {
|
||||
"type": "array"
|
||||
},
|
||||
},
|
||||
"type": "object"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": false
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"example":
|
||||
{
|
||||
"images":["base64_image_data"],"parameters":{},"info":""
|
||||
},
|
||||
"schema": {
|
||||
"properties": {
|
||||
"images": {
|
||||
"type": "string",
|
||||
"description": "A base64 string containing the encoded PNG of the generated image."
|
||||
},
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"description": "Not used. Will be empty."
|
||||
},
|
||||
"info": {
|
||||
"type": "string",
|
||||
"description": "Not used. Will be empty."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "Successful request"
|
||||
}
|
||||
},
|
||||
"summary": "Transforms an existing image into a new image",
|
||||
"tags": [
|
||||
"sdapi/v1"
|
||||
]
|
||||
}
|
||||
},
|
||||
"/sdapi/v1/interrogate": {
|
||||
"post": {
|
||||
"description": "Generates a short text caption describing an image.",
|
||||
|
@ -1112,7 +1209,6 @@
|
|||
};
|
||||
</script>
|
||||
|
||||
|
||||
<script>
|
||||
//self destruct into json if requested
|
||||
const urlParams = new URLSearchParams(window.location.search);
|
||||
|
|
|
@ -3,63 +3,63 @@
|
|||
#include <regex>
|
||||
#include <map>
|
||||
|
||||
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
|
||||
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
|
||||
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
|
||||
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
|
||||
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
|
||||
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
|
||||
static const std::map<std::string, enum ggml_v3_ftype> GGML_V3_FTYPE_MAP = {
|
||||
{"q4_0", GGML_V3_FTYPE_MOSTLY_Q4_0},
|
||||
{"q4_1", GGML_V3_FTYPE_MOSTLY_Q4_1},
|
||||
{"q5_0", GGML_V3_FTYPE_MOSTLY_Q5_0},
|
||||
{"q5_1", GGML_V3_FTYPE_MOSTLY_Q5_1},
|
||||
{"q8_0", GGML_V3_FTYPE_MOSTLY_Q8_0},
|
||||
};
|
||||
|
||||
void ggml_print_ftypes(FILE * fp) {
|
||||
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
|
||||
void ggml_v3_print_ftypes(FILE * fp) {
|
||||
for (auto it = GGML_V3_FTYPE_MAP.begin(); it != GGML_V3_FTYPE_MAP.end(); it++) {
|
||||
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
|
||||
}
|
||||
}
|
||||
|
||||
enum ggml_ftype ggml_parse_ftype(const char * str) {
|
||||
enum ggml_ftype ftype;
|
||||
enum ggml_v3_ftype ggml_v3_parse_ftype(const char * str) {
|
||||
enum ggml_v3_ftype ftype;
|
||||
if (str[0] == 'q') {
|
||||
const auto it = GGML_FTYPE_MAP.find(str);
|
||||
if (it == GGML_FTYPE_MAP.end()) {
|
||||
const auto it = GGML_V3_FTYPE_MAP.find(str);
|
||||
if (it == GGML_V3_FTYPE_MAP.end()) {
|
||||
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
|
||||
return GGML_FTYPE_UNKNOWN;
|
||||
return GGML_V3_FTYPE_UNKNOWN;
|
||||
}
|
||||
ftype = it->second;
|
||||
} else {
|
||||
ftype = (enum ggml_ftype) atoi(str);
|
||||
ftype = (enum ggml_v3_ftype) atoi(str);
|
||||
}
|
||||
|
||||
return ftype;
|
||||
}
|
||||
|
||||
bool ggml_common_quantize_0(
|
||||
bool ggml_v3_common_quantize_0(
|
||||
std::ifstream & finp,
|
||||
std::ofstream & fout,
|
||||
const ggml_ftype ftype,
|
||||
const ggml_v3_ftype ftype,
|
||||
const std::vector<std::string> & to_quant,
|
||||
const std::vector<std::string> & to_skip) {
|
||||
|
||||
ggml_type qtype = GGML_TYPE_F32;
|
||||
ggml_v3_type qtype = GGML_V3_TYPE_F32;
|
||||
|
||||
switch (ftype) {
|
||||
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
|
||||
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
|
||||
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
|
||||
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
|
||||
case GGML_FTYPE_UNKNOWN:
|
||||
case GGML_FTYPE_ALL_F32:
|
||||
case GGML_FTYPE_MOSTLY_F16:
|
||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||
case GGML_V3_FTYPE_MOSTLY_Q4_0: qtype = GGML_V3_TYPE_Q4_0; break;
|
||||
case GGML_V3_FTYPE_MOSTLY_Q4_1: qtype = GGML_V3_TYPE_Q4_1; break;
|
||||
case GGML_V3_FTYPE_MOSTLY_Q5_0: qtype = GGML_V3_TYPE_Q5_0; break;
|
||||
case GGML_V3_FTYPE_MOSTLY_Q5_1: qtype = GGML_V3_TYPE_Q5_1; break;
|
||||
case GGML_V3_FTYPE_MOSTLY_Q8_0: qtype = GGML_V3_TYPE_Q8_0; break;
|
||||
case GGML_V3_FTYPE_UNKNOWN:
|
||||
case GGML_V3_FTYPE_ALL_F32:
|
||||
case GGML_V3_FTYPE_MOSTLY_F16:
|
||||
case GGML_V3_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||
{
|
||||
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
if (!ggml_is_quantized(qtype)) {
|
||||
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
|
||||
if (!ggml_v3_is_quantized(qtype)) {
|
||||
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_v3_type_name(qtype));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -69,7 +69,7 @@ bool ggml_common_quantize_0(
|
|||
std::vector<float> work;
|
||||
|
||||
std::vector<uint8_t> data_u8;
|
||||
std::vector<ggml_fp16_t> data_f16;
|
||||
std::vector<ggml_v3_fp16_t> data_f16;
|
||||
std::vector<float> data_f32;
|
||||
|
||||
std::vector<int64_t> hist_all(1 << 4, 0);
|
||||
|
@ -97,7 +97,7 @@ bool ggml_common_quantize_0(
|
|||
std::string name(length, 0);
|
||||
finp.read (&name[0], length);
|
||||
|
||||
printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
|
||||
printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_v3_type_name((ggml_v3_type) ttype));
|
||||
|
||||
bool quantize = false;
|
||||
|
||||
|
@ -121,17 +121,17 @@ bool ggml_common_quantize_0(
|
|||
quantize &= (n_dims == 2);
|
||||
|
||||
if (quantize) {
|
||||
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
|
||||
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
||||
if (ttype != GGML_V3_TYPE_F32 && ttype != GGML_V3_TYPE_F16) {
|
||||
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_v3_type_name((ggml_v3_type) ttype));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ttype == GGML_TYPE_F16) {
|
||||
if (ttype == GGML_V3_TYPE_F16) {
|
||||
data_f16.resize(nelements);
|
||||
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
|
||||
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_v3_fp16_t));
|
||||
data_f32.resize(nelements);
|
||||
for (int i = 0; i < nelements; ++i) {
|
||||
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
|
||||
data_f32[i] = ggml_v3_fp16_to_fp32(data_f16[i]);
|
||||
}
|
||||
} else {
|
||||
data_f32.resize(nelements);
|
||||
|
@ -160,36 +160,36 @@ bool ggml_common_quantize_0(
|
|||
size_t cur_size = 0;
|
||||
std::vector<int64_t> hist_cur(1 << 4, 0);
|
||||
|
||||
switch ((ggml_type) ttype) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
switch ((ggml_v3_type) ttype) {
|
||||
case GGML_V3_TYPE_Q4_0:
|
||||
{
|
||||
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_v3_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_V3_TYPE_Q4_1:
|
||||
{
|
||||
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_v3_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_V3_TYPE_Q5_0:
|
||||
{
|
||||
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_v3_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_V3_TYPE_Q5_1:
|
||||
{
|
||||
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_v3_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_V3_TYPE_Q8_0:
|
||||
{
|
||||
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_v3_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_COUNT:
|
||||
case GGML_V3_TYPE_F32:
|
||||
case GGML_V3_TYPE_F16:
|
||||
case GGML_V3_TYPE_I8:
|
||||
case GGML_V3_TYPE_I16:
|
||||
case GGML_V3_TYPE_I32:
|
||||
case GGML_V3_TYPE_Q8_1:
|
||||
case GGML_V3_TYPE_COUNT:
|
||||
{
|
||||
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
||||
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_v3_type_name((ggml_v3_type) ttype));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -216,7 +216,7 @@ bool ggml_common_quantize_0(
|
|||
}
|
||||
|
||||
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
||||
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
|
||||
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_v3_type_name(qtype));
|
||||
|
||||
{
|
||||
int64_t sum_all = 0;
|
||||
|
|
|
@ -1,18 +1,18 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml_v3.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
enum ggml_ftype ggml_parse_ftype(const char * str);
|
||||
enum ggml_v3_ftype ggml_v3_parse_ftype(const char * str);
|
||||
|
||||
void ggml_print_ftypes(FILE * fp = stderr);
|
||||
void ggml_v3_print_ftypes(FILE * fp = stderr);
|
||||
|
||||
bool ggml_common_quantize_0(
|
||||
bool ggml_v3_common_quantize_0(
|
||||
std::ifstream & finp,
|
||||
std::ofstream & fout,
|
||||
const ggml_ftype ftype,
|
||||
const ggml_v3_ftype ftype,
|
||||
const std::vector<std::string> & to_quant,
|
||||
const std::vector<std::string> & to_skip);
|
|
@ -1,4 +1,4 @@
|
|||
#include "utils.h"
|
||||
#include "otherarch/utils.h"
|
||||
#include "common-ggml.h"
|
||||
|
||||
#include <cassert>
|
||||
|
@ -22,7 +22,7 @@ struct gpt2_hparams {
|
|||
};
|
||||
|
||||
// quantize a model
|
||||
bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
|
||||
bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
|
||||
gpt_vocab vocab;
|
||||
|
||||
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
|
||||
|
@ -62,8 +62,8 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
|
|||
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
|
||||
const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
|
||||
const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
|
@ -73,7 +73,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
|
|||
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
|
||||
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
|
||||
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
|
||||
printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
|
||||
|
||||
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
|
@ -120,7 +120,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
|
|||
"model/h.*/mlp/c_proj/w",
|
||||
};
|
||||
|
||||
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
|
||||
if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
|
||||
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
|
||||
return false;
|
||||
}
|
||||
|
@ -137,41 +137,41 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
|
|||
int main(int argc, char ** argv) {
|
||||
if (argc != 4) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
||||
ggml_print_ftypes(stderr);
|
||||
ggml_v3_print_ftypes(stderr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
struct ggml_v3_init_params params = { 0, NULL, false };
|
||||
struct ggml_v3_context * ctx = ggml_v3_init(params);
|
||||
ggml_v3_free(ctx);
|
||||
}
|
||||
|
||||
const std::string fname_inp = argv[1];
|
||||
const std::string fname_out = argv[2];
|
||||
|
||||
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
|
||||
const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
const int64_t t_main_start_us = ggml_v3_time_us();
|
||||
|
||||
int64_t t_quantize_us = 0;
|
||||
|
||||
// load the model
|
||||
{
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
const int64_t t_start_us = ggml_v3_time_us();
|
||||
|
||||
if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
|
||||
if (!gpt2_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
|
||||
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
t_quantize_us = ggml_time_us() - t_start_us;
|
||||
t_quantize_us = ggml_v3_time_us() - t_start_us;
|
||||
}
|
||||
|
||||
// report timing
|
||||
{
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
const int64_t t_main_end_us = ggml_v3_time_us();
|
||||
|
||||
printf("\n");
|
||||
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#include "ggml.h"
|
||||
|
||||
#include "utils.h"
|
||||
#include "otherarch/utils.h"
|
||||
#include "common-ggml.h"
|
||||
|
||||
#include <cassert>
|
||||
|
@ -25,7 +25,7 @@ struct gptj_hparams {
|
|||
};
|
||||
|
||||
// quantize a model
|
||||
bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
|
||||
bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
|
||||
gpt_vocab vocab;
|
||||
|
||||
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
|
||||
|
@ -66,8 +66,8 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
|
|||
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
||||
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
|
||||
const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
|
||||
const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
|
@ -77,7 +77,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
|
|||
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
|
||||
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
|
||||
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
|
||||
printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
|
||||
|
||||
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
|
@ -120,7 +120,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
|
|||
".*weight",
|
||||
};
|
||||
|
||||
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
|
||||
if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
|
||||
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
|
||||
return false;
|
||||
}
|
||||
|
@ -135,44 +135,44 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
|
|||
// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
|
||||
//
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
ggml_v3_time_init();
|
||||
if (argc != 4) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
||||
ggml_print_ftypes(stderr);
|
||||
ggml_v3_print_ftypes(stderr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
struct ggml_v3_init_params params = { 0, NULL, false };
|
||||
struct ggml_v3_context * ctx = ggml_v3_init(params);
|
||||
ggml_v3_free(ctx);
|
||||
}
|
||||
|
||||
const std::string fname_inp = argv[1];
|
||||
const std::string fname_out = argv[2];
|
||||
|
||||
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
|
||||
const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
const int64_t t_main_start_us = ggml_v3_time_us();
|
||||
|
||||
int64_t t_quantize_us = 0;
|
||||
|
||||
// load the model
|
||||
{
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
const int64_t t_start_us = ggml_v3_time_us();
|
||||
|
||||
if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
|
||||
if (!gptj_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
|
||||
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
t_quantize_us = ggml_time_us() - t_start_us;
|
||||
t_quantize_us = ggml_v3_time_us() - t_start_us;
|
||||
}
|
||||
|
||||
// report timing
|
||||
{
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
const int64_t t_main_end_us = ggml_v3_time_us();
|
||||
|
||||
printf("\n");
|
||||
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#include "utils.h"
|
||||
#include "otherarch/utils.h"
|
||||
#include "common-ggml.h"
|
||||
|
||||
#include <cassert>
|
||||
|
@ -24,7 +24,7 @@ struct mpt_hparams {
|
|||
|
||||
// quantize a model
|
||||
bool mpt_model_quantize(const std::string & fname_inp,
|
||||
const std::string & fname_out, ggml_ftype ftype) {
|
||||
const std::string & fname_out, ggml_v3_ftype ftype) {
|
||||
|
||||
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
|
||||
|
||||
|
@ -68,8 +68,8 @@ bool mpt_model_quantize(const std::string & fname_inp,
|
|||
finp.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
|
||||
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
|
||||
const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
|
||||
const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
|
||||
|
||||
printf("%s: d_model = %d\n", __func__, hparams.d_model);
|
||||
printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
|
||||
|
@ -81,7 +81,7 @@ bool mpt_model_quantize(const std::string & fname_inp,
|
|||
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
|
||||
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
|
||||
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
|
||||
printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
|
||||
|
||||
fout.write((char *) &hparams.d_model, sizeof(hparams.d_model));
|
||||
fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
|
||||
|
@ -116,7 +116,7 @@ bool mpt_model_quantize(const std::string & fname_inp,
|
|||
".*weight",
|
||||
};
|
||||
|
||||
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
|
||||
if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
|
||||
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__,
|
||||
fname_inp.c_str());
|
||||
return false;
|
||||
|
@ -136,42 +136,42 @@ int main(int argc, char ** argv) {
|
|||
if (argc != 4) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n",
|
||||
argv[0]);
|
||||
ggml_print_ftypes(stderr);
|
||||
ggml_v3_print_ftypes(stderr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = {0, NULL, false};
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
struct ggml_v3_init_params params = {0, NULL, false};
|
||||
struct ggml_v3_context * ctx = ggml_v3_init(params);
|
||||
ggml_v3_free(ctx);
|
||||
}
|
||||
|
||||
const std::string fname_inp = argv[1];
|
||||
const std::string fname_out = argv[2];
|
||||
|
||||
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
|
||||
const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
const int64_t t_main_start_us = ggml_v3_time_us();
|
||||
|
||||
int64_t t_quantize_us = 0;
|
||||
|
||||
// load the model
|
||||
{
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
const int64_t t_start_us = ggml_v3_time_us();
|
||||
|
||||
if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
|
||||
if (!mpt_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
|
||||
fprintf(stderr, "%s: failed to quantize model from '%s'\n",
|
||||
__func__, fname_inp.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
t_quantize_us = ggml_time_us() - t_start_us;
|
||||
t_quantize_us = ggml_v3_time_us() - t_start_us;
|
||||
}
|
||||
|
||||
// report timing
|
||||
{
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
const int64_t t_main_end_us = ggml_v3_time_us();
|
||||
|
||||
printf("\n");
|
||||
printf("%s: quantize time = %8.2f ms\n", __func__,
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#include "ggml.h"
|
||||
|
||||
#include "utils.h"
|
||||
#include "otherarch/utils.h"
|
||||
#include "common-ggml.h"
|
||||
|
||||
#include <cassert>
|
||||
|
@ -26,7 +26,7 @@ struct gpt_neox_hparams {
|
|||
};
|
||||
|
||||
// quantize a model
|
||||
bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
|
||||
bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_v3_ftype ftype) {
|
||||
gpt_vocab vocab;
|
||||
|
||||
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
|
||||
|
@ -68,8 +68,8 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
|
|||
finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
|
||||
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
|
||||
const int32_t qntvr_src = hparams.ftype / GGML_V3_QNT_VERSION_FACTOR;
|
||||
const int32_t ftype_dst = GGML_V3_QNT_VERSION * GGML_V3_QNT_VERSION_FACTOR + ftype;
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
|
@ -80,7 +80,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
|
|||
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
|
||||
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
|
||||
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
|
||||
printf("%s: qntvr (dst) = %d\n", __func__, GGML_V3_QNT_VERSION);
|
||||
|
||||
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
|
@ -116,7 +116,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
|
|||
".*weight",
|
||||
};
|
||||
|
||||
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
|
||||
if (!ggml_v3_common_quantize_0(finp, fout, ftype, to_quant, {})) {
|
||||
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
|
||||
return false;
|
||||
}
|
||||
|
@ -131,44 +131,44 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
|
|||
// ./gpt-neox-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
|
||||
//
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
ggml_v3_time_init();
|
||||
if (argc != 4) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
||||
ggml_print_ftypes(stderr);
|
||||
ggml_v3_print_ftypes(stderr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
struct ggml_v3_init_params params = { 0, NULL, false };
|
||||
struct ggml_v3_context * ctx = ggml_v3_init(params);
|
||||
ggml_v3_free(ctx);
|
||||
}
|
||||
|
||||
const std::string fname_inp = argv[1];
|
||||
const std::string fname_out = argv[2];
|
||||
|
||||
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
|
||||
const ggml_v3_ftype ftype = ggml_v3_parse_ftype(argv[3]);
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
const int64_t t_main_start_us = ggml_v3_time_us();
|
||||
|
||||
int64_t t_quantize_us = 0;
|
||||
|
||||
// load the model
|
||||
{
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
const int64_t t_start_us = ggml_v3_time_us();
|
||||
|
||||
if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
|
||||
if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_v3_ftype(ftype))) {
|
||||
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
t_quantize_us = ggml_time_us() - t_start_us;
|
||||
t_quantize_us = ggml_v3_time_us() - t_start_us;
|
||||
}
|
||||
|
||||
// report timing
|
||||
{
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
const int64_t t_main_end_us = ggml_v3_time_us();
|
||||
|
||||
printf("\n");
|
||||
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue