Re-enable the legacy pre-repacked aarch64 quant types Q4_0_4_4, Q4_0_4_8,
Q4_0_8_8 and IQ4_NL_4_4 (deprecated upstream).

  * ggml.h: restores enum values 31, 32, 33 and 36.
  * ggml.c: restores the matching type_traits entries (no to_float /
    from_float_ref conversion functions).
  * ggml-cpu-aarch64.cpp: ggml_aarch64_get_optimal_repack_type() maps each
    legacy type to the matching tensor_traits instance and, through
    flag_aarch_prepacked_quant(), sets kcpp_q_already_repacked. While that
    flag is set, the three repack_* functions memcpy the incoming data
    unchanged instead of repacking it.

NOTE(review): kcpp_q_already_repacked is file-global, is not cleared anywhere
in this patch, and is tested before the repack_* functions look at t->type.
After the first legacy tensor is seen, every tensor handed to those functions
is copied without repacking -- confirm that files mixing legacy and plain
Q4_0 / IQ4_NL tensors are not expected to reach them.

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 7aa2c4bd4..7e344f4b8 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -390,12 +390,12 @@ extern "C" {
         GGML_TYPE_F64 = 28,
         GGML_TYPE_IQ1_M = 29,
         GGML_TYPE_BF16 = 30,
-        // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
-        // GGML_TYPE_Q4_0_4_8 = 32,
-        // GGML_TYPE_Q4_0_8_8 = 33,
+        GGML_TYPE_Q4_0_4_4 = 31, //deprecated upstream
+        GGML_TYPE_Q4_0_4_8 = 32, //deprecated upstream
+        GGML_TYPE_Q4_0_8_8 = 33, //deprecated upstream
         GGML_TYPE_TQ1_0 = 34,
         GGML_TYPE_TQ2_0 = 35,
-        // GGML_TYPE_IQ4_NL_4_4 = 36,
+        GGML_TYPE_IQ4_NL_4_4 = 36, //deprecated upstream
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
         GGML_TYPE_COUNT = 39,
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
index 386cd53d6..977b07585 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -3692,7 +3692,14 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in
     return out;
 }
 
+static bool kcpp_q_already_repacked = false; //to support legacy q4_0_M_N quants that were preconverted.
+
 static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    if(kcpp_q_already_repacked) //using legacy prepacked quant, so just copy it
+    {
+        memcpy(t->data, data, data_size);
+        return 0;
+    }
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
     constexpr int nrows_interleaved = 4;
@@ -3724,6 +3731,11 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
 }
 
 static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    if(kcpp_q_already_repacked) //using legacy prepacked quant, so just copy it
+    {
+        memcpy(t->data, data, data_size);
+        return 0;
+    }
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 8);
     constexpr int nrows_interleaved = 8;
@@ -3790,6 +3802,11 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
 }
 
 static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    if(kcpp_q_already_repacked) //using legacy prepacked quant, so just copy it
+    {
+        memcpy(t->data, data, data_size);
+        return 0;
+    }
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
     //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
     GGML_ASSERT(interleave_block == 4);
@@ -4143,6 +4160,15 @@ static const tensor_traits iq4_nl_4x4_q8_0;
 }
 } // namespace ggml::cpu::aarch64
 
+static void flag_aarch_prepacked_quant(int type) //warn once, then make the repack_* functions copy data verbatim
+{
+    if(!kcpp_q_already_repacked)
+    {
+        printf("\nWARNING! Legacy aarch64 prepacked QM_0_M_N quant (%d) detected! Please switch to Q4_0!\n",type);
+        kcpp_q_already_repacked = true;
+    }
+}
+
 static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
     if (cur->type == GGML_TYPE_Q4_0) {
         if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
@@ -4167,6 +4193,26 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con
             }
         }
     }
+    else if (cur->type == GGML_TYPE_Q4_0_4_4) //kcpp backport old quant support
+    {
+        flag_aarch_prepacked_quant(cur->type);
+        return &ggml::cpu::aarch64::q4_0_4x4_q8_0;
+    }
+    else if (cur->type == GGML_TYPE_Q4_0_4_8)
+    {
+        flag_aarch_prepacked_quant(cur->type);
+        return &ggml::cpu::aarch64::q4_0_4x8_q8_0;
+    }
+    else if (cur->type == GGML_TYPE_Q4_0_8_8)
+    {
+        flag_aarch_prepacked_quant(cur->type);
+        return &ggml::cpu::aarch64::q4_0_8x8_q8_0;
+    }
+    else if (cur->type == GGML_TYPE_IQ4_NL_4_4) //the prepacked legacy type, not plain IQ4_NL
+    {
+        flag_aarch_prepacked_quant(cur->type);
+        return &ggml::cpu::aarch64::iq4_nl_4x4_q8_0;
+    }
 
     return nullptr;
 }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index fb56fb500..1077d8b91 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -792,23 +792,32 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
         .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
     },
-    [31] = { // GGML_TYPE_Q4_0_4_4
-        .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
-        .blck_size = 0,
-        .type_size = 0,
-        .is_quantized = false,
+    [GGML_TYPE_Q4_0_4_4] = { // deprecated upstream
+        .type_name = "q4_0_4x4",
+        .blck_size = QK4_0,
+        .blck_size_interleave = 4,
+        .type_size = sizeof(block_q4_0),
+        .is_quantized = true,
+        .to_float = NULL,
+        .from_float_ref = NULL,
     },
-    [32] = { // GGML_TYPE_Q4_0_4_8
-        .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
-        .blck_size = 0,
-        .type_size = 0,
-        .is_quantized = false,
+    [GGML_TYPE_Q4_0_4_8] = { // deprecated upstream
+        .type_name = "q4_0_4x8",
+        .blck_size = QK4_0,
+        .blck_size_interleave = 8,
+        .type_size = sizeof(block_q4_0),
+        .is_quantized = true,
+        .to_float = NULL,
+        .from_float_ref = NULL,
     },
-    [33] = { // GGML_TYPE_Q4_0_8_8
-        .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
-        .blck_size = 0,
-        .type_size = 0,
-        .is_quantized = false,
+    [GGML_TYPE_Q4_0_8_8] = { // deprecated upstream
+        .type_name = "q4_0_8x8",
+        .blck_size = QK4_0,
+        .blck_size_interleave = 8,
+        .type_size = sizeof(block_q4_0),
+        .is_quantized = true,
+        .to_float = NULL,
+        .from_float_ref = NULL,
     },
     [GGML_TYPE_TQ1_0] = {
         .type_name = "tq1_0",
@@ -826,11 +835,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
-    [36] = { // GGML_TYPE_IQ4_NL_4_4
-        .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
-        .blck_size = 0,
-        .type_size = 0,
-        .is_quantized = false,
+    [GGML_TYPE_IQ4_NL_4_4] = { // deprecated upstream
+        .type_name = "iq4_nl_4x4",
+        .blck_size = QK4_NL,
+        .blck_size_interleave = 4,
+        .type_size = sizeof(block_iq4_nl),
+        .is_quantized = true,
+        .to_float = NULL,
+        .from_float_ref = NULL,
     },
     [37] = { // GGML_TYPE_IQ4_NL_4_8
         .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",