Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-08 03:19:05 +00:00
Removed support for GGML_TYPE_Q4_0_4_4, GGML_TYPE_Q4_0_4_8, and GGML_TYPE_Q4_0_8_8 (GGUF no longer supports these types)
This commit is contained in:
parent 9cbdf01645
commit e2cda4cfa0

5 changed files with 11 additions and 119 deletions
@@ -385,9 +385,9 @@ extern "C" {
         GGML_TYPE_F64      = 28,
         GGML_TYPE_IQ1_M    = 29,
         GGML_TYPE_BF16     = 30,
-        GGML_TYPE_Q4_0_4_4 = 31,
-        GGML_TYPE_Q4_0_4_8 = 32,
-        GGML_TYPE_Q4_0_8_8 = 33,
+        // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // GGML_TYPE_Q4_0_4_8 = 32,
+        // GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0    = 34,
         GGML_TYPE_TQ2_0    = 35,
         GGML_TYPE_COUNT,
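Note: the retired entries are commented out rather than deleted so that the numeric values of the types that follow (GGML_TYPE_TQ1_0 = 34 onward) do not shift; GGUF files store these integers on disk, so renumbering would silently change how existing files decode. A standalone sketch of the pattern (names are illustrative, not the real header):

#include <stdio.h>

/* Retiring enum members without renumbering: explicit initializers pin
 * the surviving values, and the retired slots 31-33 are never reused. */
enum demo_type {
    DEMO_TYPE_BF16  = 30,
    // DEMO_TYPE_Q4_0_4_4 = 31,  retired, value stays reserved
    // DEMO_TYPE_Q4_0_4_8 = 32,  retired
    // DEMO_TYPE_Q4_0_8_8 = 33,  retired
    DEMO_TYPE_TQ1_0 = 34,  // still 34, as previously written data expects
    DEMO_TYPE_TQ2_0 = 35,
    DEMO_TYPE_COUNT,
};

int main(void) {
    printf("DEMO_TYPE_TQ1_0 = %d\n", DEMO_TYPE_TQ1_0); // prints 34
    return 0;
}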
@@ -431,9 +431,6 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ4_XS   = 22, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_M    = 23, // except 1d tensors
        GGML_FTYPE_MOSTLY_BF16     = 24, // except 1d tensors
-       GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-       GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-       GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
    };

    // available tensor operations:
@@ -15725,15 +15725,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
            {
                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
            } break;
-       case GGML_TYPE_Q4_0_4_4:
-       case GGML_TYPE_Q4_0_4_8:
-           {
-               VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
-           } break;
-       case GGML_TYPE_Q4_0_8_8:
-           {
-               VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
-           } break;

        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
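Note: like the block_iq4_nl branch kept above, the deleted branches boil down to scanning every block of the row and rejecting it if any fp16 scale factor is NaN or infinity. A rough self-contained sketch of that idea, with hypothetical block and helper names (not ggml's actual macros):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for illustration only. */
typedef uint16_t demo_fp16_t;

/* 4-way interleaved Q4_0-style block: 4 fp16 scales + packed nibbles. */
typedef struct {
    demo_fp16_t d[4];
    uint8_t     qs[64];
} demo_block_q4_0x4;

/* An IEEE fp16 value is NaN/Inf exactly when its exponent bits are all 1. */
static bool demo_fp16_is_finite(demo_fp16_t h) {
    return (h & 0x7C00) != 0x7C00;
}

/* Conceptual analogue of VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4,
 * data, nbytes / sizeof(block_q4_0x4), 4): check every scale of every block. */
static bool demo_validate_q4_0x4(const void * data, size_t nbytes) {
    const demo_block_q4_0x4 * b = (const demo_block_q4_0x4 *) data;
    const size_t nb = nbytes / sizeof(demo_block_q4_0x4);
    for (size_t i = 0; i < nb; i++) {
        for (int j = 0; j < 4; j++) {
            if (!demo_fp16_is_finite(b[i].d[j])) {
                return false; // corrupt or mis-typed tensor data
            }
        }
    }
    return true;
}

int main(void) {
    demo_block_q4_0x4 blk = {0};  // all-zero scales are finite
    printf("%d\n", demo_validate_q4_0x4(&blk, sizeof(blk))); // prints 1
    return 0;
}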
@@ -1076,54 +1076,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type         = GGML_TYPE_BF16,
        .nrows                = 1,
    },
-   [GGML_TYPE_Q4_0_4_4] = {
-       .type_name            = "q4_0_4x4",
-       .blck_size            = QK4_0,
-       .blck_size_interleave = 4,
-       .type_size            = sizeof(block_q4_0),
-       .is_quantized         = true,
-       .to_float             = NULL,
-       .from_float           = NULL,
-       .from_float_ref       = NULL,
-       .vec_dot              = NULL,
-       .vec_dot_type         = GGML_TYPE_Q8_0,
-       .nrows                = 1,
-       .ncols                = 4,
-       .gemv                 = ggml_gemv_q4_0_4x4_q8_0,
-       .gemm                 = ggml_gemm_q4_0_4x4_q8_0,
-   },
-   [GGML_TYPE_Q4_0_4_8] = {
-       .type_name            = "q4_0_4x8",
-       .blck_size            = QK4_0,
-       .blck_size_interleave = 8,
-       .type_size            = sizeof(block_q4_0),
-       .is_quantized         = true,
-       .to_float             = NULL,
-       .from_float           = NULL,
-       .from_float_ref       = NULL,
-       .vec_dot              = NULL,
-       .vec_dot_type         = GGML_TYPE_Q8_0,
-       .nrows                = 1,
-       .ncols                = 4,
-       .gemv                 = ggml_gemv_q4_0_4x8_q8_0,
-       .gemm                 = ggml_gemm_q4_0_4x8_q8_0,
-   },
-   [GGML_TYPE_Q4_0_8_8] = {
-       .type_name            = "q4_0_8x8",
-       .blck_size            = QK4_0,
-       .blck_size_interleave = 8,
-       .type_size            = sizeof(block_q4_0),
-       .is_quantized         = true,
-       .to_float             = NULL,
-       .from_float           = NULL,
-       .from_float_ref       = NULL,
-       .vec_dot              = NULL,
-       .vec_dot_type         = GGML_TYPE_Q8_0,
-       .nrows                = 1,
-       .ncols                = 8,
-       .gemv                 = ggml_gemv_q4_0_8x8_q8_0,
-       .gemm                 = ggml_gemm_q4_0_8x8_q8_0,
-   },
    [GGML_TYPE_TQ1_0] = {
        .type_name            = "tq1_0",
        .blck_size            = QK_K,
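Note: removing a type from ggml means deleting its row in the type_traits table above. ggml dispatches per-type metadata and kernels through one designated-initializer entry per type, indexed by the enum value, so untouched entries keep working. A compact sketch of that table-driven pattern with invented names:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum demo_type { DEMO_TYPE_F32, DEMO_TYPE_Q4_0, DEMO_TYPE_COUNT };

/* Per-type metadata, as in ggml's type_traits (kernel pointers omitted). */
typedef struct {
    const char * type_name;
    int          blck_size;    // elements per quantization block
    size_t       type_size;    // bytes per block
    bool         is_quantized;
} demo_type_traits;

static const demo_type_traits demo_traits[DEMO_TYPE_COUNT] = {
    [DEMO_TYPE_F32] = {
        .type_name    = "f32",
        .blck_size    = 1,
        .type_size    = sizeof(float),
        .is_quantized = false,
    },
    [DEMO_TYPE_Q4_0] = {
        .type_name    = "q4_0",
        .blck_size    = 32,        // 32 weights per block
        .type_size    = 2 + 32/2,  // fp16 scale + 32 four-bit quants
        .is_quantized = true,
    },
    /* Dropping a type is just deleting its row here. */
};

int main(void) {
    const demo_type_traits * t = &demo_traits[DEMO_TYPE_Q4_0];
    printf("%s: %zu bytes per %d elements\n", t->type_name, t->type_size, t->blck_size);
    return 0;
}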
@@ -3578,9 +3530,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
        case GGML_FTYPE_MOSTLY_IQ4_XS:        wtype = GGML_TYPE_IQ4_XS;   break;
        case GGML_FTYPE_MOSTLY_IQ3_S:         wtype = GGML_TYPE_IQ3_S;    break;
        case GGML_FTYPE_MOSTLY_IQ2_S:         wtype = GGML_TYPE_IQ2_S;    break;
-       case GGML_FTYPE_MOSTLY_Q4_0_4_4:      wtype = GGML_TYPE_Q4_0_4_4; break;
-       case GGML_FTYPE_MOSTLY_Q4_0_4_8:      wtype = GGML_TYPE_Q4_0_4_8; break;
-       case GGML_FTYPE_MOSTLY_Q4_0_8_8:      wtype = GGML_TYPE_Q4_0_8_8; break;
        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT;    break;
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT;    break;
    }
@@ -9517,9 +9466,6 @@ static void ggml_compute_forward_add(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
-       case GGML_TYPE_Q4_0_4_4:
-       case GGML_TYPE_Q4_0_4_8:
-       case GGML_TYPE_Q4_0_8_8:
            {
                ggml_compute_forward_add_q_f32(params, dst);
            } break;
@@ -9897,9 +9843,6 @@ static void ggml_compute_forward_add1(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
-       case GGML_TYPE_Q4_0_4_4:
-       case GGML_TYPE_Q4_0_4_8:
-       case GGML_TYPE_Q4_0_8_8:
            {
                ggml_compute_forward_add1_q_f32(params, dst);
            } break;
@@ -10027,9 +9970,6 @@ static void ggml_compute_forward_acc(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
-       case GGML_TYPE_Q4_0_4_4:
-       case GGML_TYPE_Q4_0_4_8:
-       case GGML_TYPE_Q4_0_8_8:
        default:
            {
                GGML_ABORT("fatal error");
@@ -13093,9 +13033,6 @@ static void ggml_compute_forward_out_prod(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
-       case GGML_TYPE_Q4_0_4_4:
-       case GGML_TYPE_Q4_0_4_8:
-       case GGML_TYPE_Q4_0_8_8:
            {
                ggml_compute_forward_out_prod_q_f32(params, dst);
            } break;
@@ -13283,9 +13220,6 @@ static void ggml_compute_forward_set(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
-       case GGML_TYPE_Q4_0_4_4:
-       case GGML_TYPE_Q4_0_4_8:
-       case GGML_TYPE_Q4_0_8_8:
        default:
            {
                GGML_ABORT("fatal error");
@@ -13547,9 +13481,6 @@ static void ggml_compute_forward_get_rows(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
-       case GGML_TYPE_Q4_0_4_4:
-       case GGML_TYPE_Q4_0_4_8:
-       case GGML_TYPE_Q4_0_8_8:
            {
                ggml_compute_forward_get_rows_q(params, dst);
            } break;
@@ -14139,9 +14070,6 @@ static void ggml_compute_forward_clamp(
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_Q8_K:
-       case GGML_TYPE_Q4_0_4_4:
-       case GGML_TYPE_Q4_0_4_8:
-       case GGML_TYPE_Q4_0_8_8:
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
        case GGML_TYPE_I32:
@@ -21941,9 +21869,6 @@ size_t ggml_quantize_chunk(
        case GGML_TYPE_IQ1_M:    result = quantize_iq1_m   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_NL:   result = quantize_iq4_nl  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_XS:   result = quantize_iq4_xs  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-       case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-       case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-       case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);
@@ -165,18 +165,18 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q2_K_S   = 21, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_XS   = 22, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_XXS  = 23, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_IQ1_S    = 24, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_IQ1_S    = 24, // except 1d tensors, 1 bit quantization
        LLAMA_FTYPE_MOSTLY_IQ4_NL   = 25, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_S    = 26, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_M    = 27, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ2_S    = 28, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ2_M    = 29, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ4_XS   = 30, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_IQ1_M    = 31, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_IQ1_M    = 31, // except 1d tensors, 1 bit quantization
        LLAMA_FTYPE_MOSTLY_BF16     = 32, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+       // LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
+       // LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
+       // LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
        LLAMA_FTYPE_MOSTLY_TQ1_0    = 36, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_TQ2_0    = 37, // except 1d tensors

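Note: the replacement comments describe the approach upstream llama.cpp moved to: GGUF files carry plain Q4_0, and on CPUs that benefit, the backend repacks blocks from several adjacent rows into an interleaved layout at load time, so the on-disk format no longer hard-codes a CPU-specific arrangement. Schematically, repacking four rows' worth of Q4_0 blocks into one "x4" block looks like the sketch below (simplified stand-in layouts, not ggml's actual repack code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QK 32  /* weights per Q4_0 block */

/* Simplified stand-ins for the real block layouts. */
typedef struct {
    uint16_t d;          // fp16 scale
    uint8_t  qs[QK/2];   // 32 x 4-bit quants, two per byte
} demo_block_q4_0;

typedef struct {
    uint16_t d[4];          // one scale per source row
    uint8_t  qs[4 * QK/2];  // quants from 4 rows, interleaved in 4-byte groups
} demo_block_q4_0x4;

/* Gather one block from each of 4 adjacent rows into a single interleaved
 * block, so a SIMD kernel can read a 4-row column slice contiguously. */
static void demo_repack_x4(const demo_block_q4_0 src[4], demo_block_q4_0x4 * dst) {
    for (int r = 0; r < 4; r++) {
        dst->d[r] = src[r].d;
        for (int g = 0; g < (QK/2)/4; g++) {  // 4-byte groups per source row
            memcpy(&dst->qs[(g*4 + r)*4], &src[r].qs[g*4], 4);
        }
    }
}

int main(void) {
    demo_block_q4_0   rows[4] = {{.d = 1}, {.d = 2}, {.d = 3}, {.d = 4}};
    demo_block_q4_0x4 out;
    demo_repack_x4(rows, &out);
    printf("%u %u %u %u\n", out.d[0], out.d[1], out.d[2], out.d[3]); // 1 2 3 4
    return 0;
}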
@@ -4843,9 +4843,7 @@ struct llama_model_loader {
                case GGML_TYPE_IQ4_NL:   ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;   break;
                case GGML_TYPE_IQ4_XS:   ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;   break;
                case GGML_TYPE_IQ3_S:    ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;    break;
-               case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
-               case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
-               case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
                default:
                    {
                        LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -5653,9 +5651,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.66 bpw";
-       case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
-       case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
-       case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";

        default: return "unknown, may not work";
    }
@@ -18996,10 +18991,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
        new_type = GGML_TYPE_IQ3_S;
    }
-   else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
-            new_type == GGML_TYPE_Q4_0_8_8) {
-       new_type = GGML_TYPE_Q4_0;
-   }
    else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
        new_type = GGML_TYPE_Q4_K;
    }
@@ -19322,10 +19313,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:   default_type = GGML_TYPE_IQ4_XS;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_S:    default_type = GGML_TYPE_IQ3_S;    break;
        case LLAMA_FTYPE_MOSTLY_IQ3_M:    default_type = GGML_TYPE_IQ3_S;    break;
-       case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
-       case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
-       case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }

@@ -19645,14 +19633,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            f32_data = (float *) f32_conv_buf.data();
        }

-       int chunk_size_multiplier = 1;
-       if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
-           if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
-           else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
-           if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-           else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-       }
-
        LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
        fflush(stdout);
@@ -19665,8 +19645,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        const int64_t nrows = tensor->ne[1];

        static const int64_t min_chunk_size = 32 * 512;
-       const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
-           chunk_size_multiplier;
+       const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

        const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
        const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
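Note: with chunk_size_multiplier gone, the surviving expression simply rounds each work unit up to a whole number of rows covering at least min_chunk_size = 32 * 512 = 16384 elements, so short rows get batched while long rows become one chunk each. A worked check of that arithmetic (same expression, lifted into a standalone function):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* The retained chunk-size expression from the hunk above. */
static int64_t demo_chunk_size(int64_t n_per_row) {
    static const int64_t min_chunk_size = 32 * 512; // 16384 elements
    return n_per_row >= min_chunk_size
        ? n_per_row
        : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row); // ceil to whole rows
}

int main(void) {
    assert(demo_chunk_size(4096)  == 16384); // 4096 * ceil(16384/4096) = 4096 * 4
    assert(demo_chunk_size(5000)  == 20000); // 5000 * ceil(16384/5000) = 5000 * 4
    assert(demo_chunk_size(20000) == 20000); // already >= 16384: one row per chunk
    printf("chunk sizes ok\n");
    return 0;
}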