mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-10 04:00:53 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # docs/build.md # docs/ops.md # docs/ops/CPU.csv # ggml/src/ggml-cpu/kleidiai/kernels.cpp # ggml/src/ggml-cpu/kleidiai/kleidiai.cpp # ggml/src/ggml-cpu/repack.cpp # ggml/src/ggml-cpu/repack.h # src/llama-quant.cpp # tests/test-json-schema-to-grammar.cpp
This commit is contained in:
commit
1802b09e6f
9 changed files with 554 additions and 341 deletions
|
|
@ -19,6 +19,13 @@
|
|||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
|
||||
// result of parsing --tensor-type option
|
||||
// (changes to this struct must be reflected in src/llama-quant.cpp)
|
||||
struct tensor_type_option {
|
||||
std::string name;
|
||||
ggml_type type = GGML_TYPE_COUNT;
|
||||
};
|
||||
|
||||
struct quant_option {
|
||||
std::string name;
|
||||
llama_ftype ftype;
|
||||
|
|
@ -66,12 +73,6 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
|
|||
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||
};
|
||||
|
||||
// Quantization types. Changes to this struct must be replicated in llama-quantize.cpp
|
||||
struct tensor_quantization {
|
||||
std::string name;
|
||||
ggml_type quant = GGML_TYPE_COUNT;
|
||||
};
|
||||
|
||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
|
||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset";
|
||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
|
||||
|
|
@ -414,7 +415,7 @@ static ggml_type parse_ggml_type(const char * arg) {
|
|||
return GGML_TYPE_COUNT;
|
||||
}
|
||||
|
||||
static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
|
||||
static bool parse_tensor_type(const char * data, std::vector<tensor_type_option> & tensor_type) {
|
||||
const char * sep = strchr(data, '=');
|
||||
if (sep == nullptr) {
|
||||
printf("\n%s: malformed tensor type '%s'\n\n", __func__, data);
|
||||
|
|
@ -434,11 +435,11 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
|
|||
std::string tn(data, tn_len);
|
||||
std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
|
||||
sep++;
|
||||
tensor_quantization tqz;
|
||||
tqz.name = tn;
|
||||
tqz.quant = parse_ggml_type(sep);
|
||||
tensor_type.emplace_back(std::move(tqz));
|
||||
if (tqz.quant == GGML_TYPE_COUNT) {
|
||||
tensor_type_option tensor_type_opt;
|
||||
tensor_type_opt.name = tn;
|
||||
tensor_type_opt.type = parse_ggml_type(sep);
|
||||
tensor_type.emplace_back(std::move(tensor_type_opt));
|
||||
if (tensor_type_opt.type == GGML_TYPE_COUNT) {
|
||||
printf("\n%s: invalid quantization type '%s'\n\n", __func__, sep);
|
||||
return false;
|
||||
}
|
||||
|
|
@ -446,7 +447,7 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool parse_tensor_type_file(const char * filename, std::vector<tensor_quantization> & tensor_type) {
|
||||
static bool parse_tensor_type_file(const char * filename, std::vector<tensor_type_option> & tensor_type) {
|
||||
std::ifstream file(filename);
|
||||
if (!file) {
|
||||
printf("\n%s: failed to open file '%s': %s\n\n", __func__, filename, std::strerror(errno));
|
||||
|
|
@ -502,7 +503,7 @@ int main(int argc, char ** argv) {
|
|||
std::string imatrix_file;
|
||||
std::vector<std::string> included_weights, excluded_weights;
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
std::vector<tensor_quantization> tensor_types;
|
||||
std::vector<tensor_type_option> tensor_type_opts;
|
||||
std::vector<int> prune_layers;
|
||||
|
||||
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
||||
|
|
@ -527,11 +528,11 @@ int main(int argc, char ** argv) {
|
|||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--tensor-type") == 0) {
|
||||
if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
|
||||
if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_type_opts)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--tensor-type-file") == 0) {
|
||||
if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_types)) {
|
||||
if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_type_opts)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
|
||||
|
|
@ -625,8 +626,8 @@ int main(int argc, char ** argv) {
|
|||
kv_overrides.back().key[0] = 0;
|
||||
params.kv_overrides = &kv_overrides;
|
||||
}
|
||||
if (!tensor_types.empty()) {
|
||||
params.tensor_types = &tensor_types;
|
||||
if (!tensor_type_opts.empty()) {
|
||||
params.tensor_types = &tensor_type_opts;
|
||||
}
|
||||
if (!prune_layers.empty()) {
|
||||
params.prune_layers = &prune_layers;
|
||||
|
|
@ -693,18 +694,6 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
if (!params.dry_run &&
|
||||
(
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M
|
||||
) && imatrix_data.empty()) {
|
||||
fprintf(stderr, "\n==========================================================================================================\n");
|
||||
fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
|
||||
fprintf(stderr, "==========================================================================================================\n\n\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!params.dry_run) {
|
||||
if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
|
||||
fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
|
||||
|
|
@ -754,4 +743,3 @@ int main(int argc, char ** argv) {
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue