diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index 29819e48d..060578f0b 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -893,23 +893,6 @@ static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) {
builder.consume_reasoning_with_xml_tool_calls(form, "", "");
}
-static void common_chat_parse_qwen3_coder_xml(common_chat_msg_parser & builder) {
- static const xml_tool_call_format form = ([]() {
- xml_tool_call_format form {};
- form.scope_start = "";
- form.tool_start = "") != std::string::npos);
+
// Handle thinking tags appropriately based on inputs.enable_thinking
- if (string_ends_with(data.prompt, "\n")) {
+ if (supports_reasoning && string_ends_with(data.prompt, "\n")) {
if (!inputs.enable_thinking) {
data.prompt += "";
} else {
@@ -1552,19 +1554,21 @@ static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_
}
data.preserved_tokens = {
- "",
- "",
"",
"",
};
+ if (supports_reasoning) {
+ data.preserved_tokens.insert(data.preserved_tokens.end(), {"", ""});
+ }
+
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
auto parser = build_chat_peg_constructed_parser([&](auto & p) {
auto reasoning = p.eps();
- if (inputs.enable_thinking && extract_reasoning) {
+ if (supports_reasoning && inputs.enable_thinking && extract_reasoning) {
auto reasoning_content = p.reasoning(p.until("")) + ("" | p.end());
if (data.thinking_forced_open) {
reasoning = reasoning_content;
@@ -1902,38 +1906,6 @@ static common_chat_params common_chat_params_init_minimax_m2(const common_chat_t
return data;
}
-static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_chat_template & tmpl, const struct templates_params & params) {
- common_chat_params data;
- data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
- data.prompt = apply(tmpl, params);
- data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML;
-
- data.preserved_tokens = {
- "",
- "",
- "",
- "",
- };
-
- // build grammar for tool call
- static const xml_tool_call_format form {
- /* form.scope_start = */ "\n",
- /* form.tool_start = */ "\n",
- /* form.key_start = */ "\n",
- /* form.val_end = */ "\n\n",
- /* form.tool_end = */ "\n",
- /* form.scope_end = */ "",
- };
- build_grammar_xml_tool_call(data, params.tools, form);
-
- return data;
-}
-
static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl, const struct templates_params & params) {
common_chat_params data;
data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -3161,13 +3133,7 @@ static common_chat_params common_chat_templates_apply_jinja(
src.find(" support (Step-3.5-Flash, Nemotron 3 Nano) use the
- // Nemotron v3 PEG parser for streaming and schema-aware parameter parsing.
- // Qwen3-Coder has no in its template.
- if (src.find("") != std::string::npos) {
- return common_chat_params_init_nemotron_v3(tmpl, params);
- }
- return common_chat_params_init_qwen3_coder_xml(tmpl, params);
+ return common_chat_params_init_qwen3_coder(tmpl, params);
}
// Xiaomi MiMo format detection (must come before Hermes 2 Pro)
diff --git a/common/chat.h b/common/chat.h
index 1bf43f726..6f0b9409e 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -128,7 +128,6 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_GLM_4_5,
COMMON_CHAT_FORMAT_MINIMAX_M2,
COMMON_CHAT_FORMAT_KIMI_K2,
- COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
COMMON_CHAT_FORMAT_APRIEL_1_5,
COMMON_CHAT_FORMAT_XIAOMI_MIMO,
COMMON_CHAT_FORMAT_SOLAR_OPEN,
diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h
index b6e989939..45fb3e42d 100644
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -163,15 +163,9 @@
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
-#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
-#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
-#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
-#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
-#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
-#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index ae0ebb3ca..bf9f4df11 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -1954,3 +1954,773 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
#endif
}
+static const uint8_t sign_gather_indices_arr[64] = {
+ 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,
+ 4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7
+};
+
+static const uint8_t sign_bit_masks_arr[64] = {
+ 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128,
+ 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128
+};
+
+static void ggml_vec_dot_iq2_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
+
+ const block_iq2_s * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+ const uint64_t * grid64 = (const uint64_t *)iq2s_grid;
+
+ // --- Pre-load Constants ---
+ uint16_t gather_qh_arr[8] = {0, 0, 0, 0, 1, 1, 1, 1};
+ vuint16mf2_t v_gather_qh = __riscv_vle16_v_u16mf2(gather_qh_arr, 8);
+ uint16_t shift_qh_arr[8] = {11, 9, 7, 5, 11, 9, 7, 5};
+ vuint16mf2_t v_shift_qh = __riscv_vle16_v_u16mf2(shift_qh_arr, 8);
+
+ // Constants for sign extraction
+ vuint8m2_t v_sign_gather_indices = __riscv_vle8_v_u8m2(sign_gather_indices_arr, 64);
+ vuint8m2_t v_sign_masks = __riscv_vle8_v_u8m2(sign_bit_masks_arr, 64);
+
+ float sumf = 0.0f;
+
+ for (int i = 0; i < nb; ++i) {
+ const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint8_t * GGML_RESTRICT scales = x[i].scales;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+
+ const uint8_t * signs_ptr = qs + 32;
+
+ float sum_block = 0.0f;
+
+ for (int ib = 0; ib < 4; ++ib) {
+ // Combine low + high bits
+ vuint8mf4_t v_qs_u8 = __riscv_vle8_v_u8mf4(qs, 8);
+ qs += 8;
+ uint16_t qh_val;
+ memcpy(&qh_val, qh, 2);
+ qh += 2;
+ vuint8mf8_t v_qh_raw = __riscv_vle8_v_u8mf8((const uint8_t*)&qh_val, 2);
+ vuint16mf4_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16mf4(v_qh_raw, 2);
+ vuint16mf2_t v_qh_u16_ext = __riscv_vlmul_ext_v_u16mf4_u16mf2(v_qh_u16);
+ vuint16mf2_t v_qh_expanded = __riscv_vrgather_vv_u16mf2(v_qh_u16_ext, v_gather_qh, 8);
+ v_qh_expanded = __riscv_vsll_vv_u16mf2(v_qh_expanded, v_shift_qh, 8);
+
+ // Mask: We want bits 11-12. 0x1800 = 0001 1000 0000 0000
+ v_qh_expanded = __riscv_vand_vx_u16mf2(v_qh_expanded, 0x1800, 8);
+ vuint16mf2_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16mf2(v_qs_u8, 8);
+
+ // Multiply by 8 to get byte offset, instead of element offset
+ v_qs_u16 = __riscv_vsll_vx_u16mf2(v_qs_u16, 3, 8);
+ vuint16mf2_t v_grid_offsets = __riscv_vor_vv_u16mf2(v_qs_u16, v_qh_expanded, 8);
+
+ // Lookup Grid using Byte Offsets
+ vuint64m2_t v_grid_vals = __riscv_vluxei16_v_u64m2(grid64, v_grid_offsets, 8);
+
+ vuint8m2_t v_grid_u8 = __riscv_vreinterpret_v_u64m2_u8m2(v_grid_vals);
+ vint8m2_t v_grid_i8 = __riscv_vreinterpret_v_u8m2_i8m2(v_grid_u8);
+
+ // Load signs and generate sign mask
+ vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs_ptr, 8);
+ signs_ptr += 8;
+
+ vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
+ vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 64);
+
+ vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 64);
+ vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 64);
+
+ vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
+ q8 += 64;
+
+ vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 64);
+ vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_grid_i8, v_q8_signed, 64);
+
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
+
+ int32_t s0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
+ __riscv_vget_v_i16m4_i16m1(v_dot, 0), v_zero, 16));
+ int32_t s1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
+ __riscv_vget_v_i16m4_i16m1(v_dot, 1), v_zero, 16));
+ int32_t s2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
+ __riscv_vget_v_i16m4_i16m1(v_dot, 2), v_zero, 16));
+ int32_t s3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
+ __riscv_vget_v_i16m4_i16m1(v_dot, 3), v_zero, 16));
+
+ uint8_t sc0 = scales[0];
+ uint8_t sc1 = scales[1];
+ scales += 2;
+
+ sum_block += s0 * (2 * (sc0 & 0xF) + 1);
+ sum_block += s1 * (2 * (sc0 >> 4) + 1);
+ sum_block += s2 * (2 * (sc1 & 0xF) + 1);
+ sum_block += s3 * (2 * (sc1 >> 4) + 1);
+ }
+ sumf += sum_block * combined_scale;
+ }
+ *s = 0.125f * sumf;
+}
+
+static void ggml_vec_dot_iq2_s_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
+
+ const block_iq2_s * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+ const uint64_t * grid64 = (const uint64_t *)iq2s_grid;
+
+ // Pre-load Constants
+ vuint8m2_t v_ids = __riscv_vid_v_u8m2(32);
+ vuint8m2_t v_sign_gather_indices = __riscv_vsrl_vx_u8m2(v_ids, 3, 32);
+ vuint8m2_t v_ones = __riscv_vmv_v_x_u8m2(1, 32);
+ vuint8m2_t v_shift_amts = __riscv_vand_vx_u8m2(v_ids, 7, 32);
+ vuint8m2_t v_sign_masks = __riscv_vsll_vv_u8m2(v_ones, v_shift_amts, 32);
+ uint16_t shift_qh_arr[4] = {11, 9, 7, 5};
+ vuint16mf2_t v_shift_qh = __riscv_vle16_v_u16mf2(shift_qh_arr, 4);
+
+ float sumf = 0.0f;
+
+ for (int i = 0; i < nb; ++i) {
+ const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint8_t * GGML_RESTRICT scales = x[i].scales;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+
+ const uint8_t * signs_ptr = qs + 32;
+ float sum_block = 0.0f;
+
+ for (int ib = 0; ib < 8; ++ib) {
+
+ // Load Low Bits [4 bytes]
+ vuint8mf4_t v_qs_u8 = __riscv_vle8_v_u8mf4(qs, 4);
+ qs += 4;
+
+ // Load 1 byte. It contains bits for 4 mini-blocks.
+ uint8_t qh_val = *qh++;
+
+ // Combine Low + High bits of 10bit indices
+ vuint8mf4_t v_qh_raw = __riscv_vmv_v_x_u8mf4(qh_val, 4);
+ vuint16mf2_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16mf2(v_qh_raw, 4);
+ vuint16mf2_t v_qh_mf2 = __riscv_vsll_vv_u16mf2(v_qh_u16, v_shift_qh, 4);
+ v_qh_mf2 = __riscv_vand_vx_u16mf2(v_qh_mf2, 0x1800, 4);
+ vuint16mf2_t v_qs_u16_mf2 = __riscv_vwcvtu_x_x_v_u16mf2(v_qs_u8, 4);
+ vuint16mf2_t v_qs_u16 = __riscv_vsll_vx_u16mf2(v_qs_u16_mf2, 3, 4);
+ vuint16mf2_t v_grid_offsets = __riscv_vor_vv_u16mf2(v_qs_u16, v_qh_mf2, 4);
+
+ // Lookup Grid
+ vint8m2_t v_grid_i8 = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(__riscv_vluxei16_v_u64m2(grid64, v_grid_offsets, 4)));
+
+ vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs_ptr, 4);
+ signs_ptr += 4;
+ vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
+ vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 32);
+
+ // generating sign mask
+ vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 32);
+ vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 32);
+
+ vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 32);
+ q8 += 32;
+
+ // apply signs
+ vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative,v_q8, v_q8, 0, 32);
+ vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_grid_i8, v_q8_signed, 32);
+
+ // Reduction
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
+
+ // Reduce 0-15 (First Half)
+ int32_t s0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
+ __riscv_vget_v_i16m4_i16m2(v_dot, 0), v_zero, 16));
+
+ // Reduce 16-31 (Second Half)
+ int32_t s1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
+ __riscv_vget_v_i16m4_i16m2(v_dot, 1), v_zero, 16));
+
+ // Apply sub Scales
+ uint8_t sc = *scales++;
+
+ sum_block += s0 * (2 * (sc & 0xF) + 1);
+ sum_block += s1 * (2 * (sc >> 4) + 1);
+ }
+ sumf += sum_block * combined_scale;
+ }
+ *s = 0.125f * sumf;
+}
+
+void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined __riscv_v_intrinsic
+ switch (__riscv_vlenb() * 8) {
+ case 128:
+ ggml_vec_dot_iq2_s_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ case 256:
+ ggml_vec_dot_iq2_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ default:
+ ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ }
+#else
+ ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+static void ggml_vec_dot_iq3_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_iq3_s * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+
+ const uint64_t * grid64 = (const uint64_t *)iq3s_grid;
+
+ // --- Pre-load Constants ---
+ const uint16_t qh_bit_shifts_arr[16] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ vuint8m2_t v_sign_gather_indices = __riscv_vle8_v_u8m2(sign_gather_indices_arr, 64);
+ vuint8m2_t v_sign_masks = __riscv_vle8_v_u8m2(sign_bit_masks_arr, 64);
+ vuint16m1_t v_qh_shifts = __riscv_vle16_v_u16m1(qh_bit_shifts_arr, 16);
+
+ float sumf = 0.0f;
+
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d);
+ const float combined_scale = d * y[i].d;
+
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint8_t * GGML_RESTRICT scales = x[i].scales;
+ const uint8_t * GGML_RESTRICT signs = x[i].signs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+
+ float sum_block = 0.0f;
+
+ // Loop: Process 64 weights (16 mini-blocks of 4) per iteration
+ for (int ib = 0; ib < 4; ++ib) {
+
+ vuint8mf2_t v_qs_u8 = __riscv_vle8_v_u8mf2(qs, 16);
+ qs += 16;
+
+ uint16_t qh_val;
+ memcpy(&qh_val, qh, 2);
+ qh += 2;
+
+ vuint16m1_t v_qh_val = __riscv_vmv_v_x_u16m1(qh_val, 16);
+ // Extract bits: (qh >> i) & 1
+ v_qh_val = __riscv_vsrl_vv_u16m1(v_qh_val, v_qh_shifts, 16);
+ v_qh_val = __riscv_vand_vx_u16m1(v_qh_val, 1, 16);
+
+ vuint16m1_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16m1(v_qs_u8, 16);
+ v_qs_u16 = __riscv_vsll_vx_u16m1(v_qs_u16, 2, 16);
+ v_qh_val = __riscv_vsll_vx_u16m1(v_qh_val, 10, 16);
+ vuint16m1_t v_grid_offsets = __riscv_vor_vv_u16m1(v_qs_u16, v_qh_val, 16);
+
+ // Grid value is 4xuint8
+ vuint32m2_t v_grid_packed = __riscv_vluxei16_v_u32m2((const uint32_t *)grid64, v_grid_offsets, 16);
+ vuint8m2_t v_grid_u8 = __riscv_vreinterpret_v_u32m2_u8m2(v_grid_packed);
+ vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs, 8);
+ signs += 8;
+
+ // Generate sign mask
+ vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
+ vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 64);
+ vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 64);
+ vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 64);
+
+ vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
+ q8 += 64;
+
+ // Apply Signs
+ vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 64);
+ vint16m4_t v_dot = __riscv_vwmulsu_vv_i16m4(v_q8_signed, v_grid_u8, 64);
+
+ // Reduction
+ vint16m2_t v_dot_lo = __riscv_vget_v_i16m4_i16m2(v_dot, 0);
+ vint16m2_t v_dot_hi = __riscv_vget_v_i16m4_i16m2(v_dot, 1);
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
+
+ int32_t s_lo = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(v_dot_lo, v_zero, 32));
+ int32_t s_hi = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(v_dot_hi, v_zero, 32));
+
+ // Apply sub-scales
+ uint8_t sc_byte = *scales++;
+ int sc_lo = (sc_byte & 0xF) * 2 + 1;
+ int sc_hi = (sc_byte >> 4) * 2 + 1;
+
+ sum_block += s_lo * sc_lo + s_hi * sc_hi;
+ }
+ sumf += sum_block * combined_scale;
+ }
+ *s = 0.125f * sumf;
+}
+
+void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined __riscv_v_intrinsic
+ switch (__riscv_vlenb() * 8) {
+ case 256:
+ ggml_vec_dot_iq3_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ default:
+ ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ }
+#else
+ ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+static void ggml_vec_dot_tq1_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_tq1_0 * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+
+ float sumf = 0.0f;
+ uint8_t pow[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
+
+ for (int i = 0; i < nb; i++) {
+ // First loop.
+ vint32m4_t suml1;
+ {
+ const int vl = 32;
+ vuint8m1_t tq = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+ vuint16m2_t tq0 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(tq, 3, vl), 8, vl);
+ vuint16m2_t tq1 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 3, vl), 3, vl), 8, vl);
+ vuint16m2_t tq2 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 9, vl), 3, vl), 8, vl);
+ vuint16m2_t tq3 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 27, vl), 3, vl), 8, vl);
+ vuint16m2_t tq4 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 81, vl), 3, vl), 8, vl);
+
+ vint16m2_t q80 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 0, vl), vl);
+ vint16m2_t q81 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 32, vl), vl);
+ vint16m2_t q82 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 64, vl), vl);
+ vint16m2_t q83 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 96, vl), vl);
+ vint16m2_t q84 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 128, vl), vl);
+
+ vint16m2_t sum0 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq0, 1, vl)), q80, vl);
+ vint16m2_t sum1 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq1, 1, vl)), q81, vl);
+ vint16m2_t sum2 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq2, 1, vl)), q82, vl);
+ vint16m2_t sum3 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq3, 1, vl)), q83, vl);
+ vint16m2_t sum4 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq4, 1, vl)), q84, vl);
+
+ vint32m4_t sumi0 = __riscv_vwadd_vv_i32m4(sum0, sum1, vl);
+ vint32m4_t sumi1 = __riscv_vwadd_vv_i32m4(sum2, sum3, vl);
+ suml1 = __riscv_vadd_vv_i32m4(__riscv_vwcvt_x_x_v_i32m4(sum4, vl), __riscv_vadd_vv_i32m4(sumi0, sumi1, vl), vl);
+ }
+
+ // Second loop.
+ vint32m2_t suml2;
+ {
+ const int vl = 16;
+ vuint8mf2_t tq = __riscv_vle8_v_u8mf2(x[i].qs + 32, vl);
+
+ vuint16m1_t tq0 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(tq, 3 * 1, vl), 8, vl);
+ vuint16m1_t tq1 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 3, vl), 3, vl), 8, vl);
+ vuint16m1_t tq2 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 9, vl), 3, vl), 8, vl);
+ vuint16m1_t tq3 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 27, vl), 3, vl), 8, vl);
+ vuint16m1_t tq4 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 81, vl), 3, vl), 8, vl);
+
+ vint16m1_t q80 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 160, vl), vl);
+ vint16m1_t q81 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 176, vl), vl);
+ vint16m1_t q82 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 192, vl), vl);
+ vint16m1_t q83 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 208, vl), vl);
+ vint16m1_t q84 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 224, vl), vl);
+
+ vint16m1_t sum0 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq0, 1, vl)), q80, vl);
+ vint16m1_t sum1 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq1, 1, vl)), q81, vl);
+ vint16m1_t sum2 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq2, 1, vl)), q82, vl);
+ vint16m1_t sum3 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq3, 1, vl)), q83, vl);
+ vint16m1_t sum4 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq4, 1, vl)), q84, vl);
+
+ vint32m2_t sumi0 = __riscv_vwadd_vv_i32m2(sum0, sum1, vl);
+ vint32m2_t sumi1 = __riscv_vwadd_vv_i32m2(sum2, sum3, vl);
+ suml2 = __riscv_vadd_vv_i32m2(__riscv_vwcvt_x_x_v_i32m2(sum4, vl), __riscv_vadd_vv_i32m2(sumi0, sumi1, vl), vl);
+ }
+
+ // Third loop.
+ vint32m2_t suml3;
+ {
+ const int vl = 16;
+
+ uint32_t qh;
+ memcpy(&qh, &x[i].qh[0], 4);
+ // Prevent fusion with vmv.
+ __asm__ __volatile__("" : "+r"(qh));
+ vuint8mf2_t tq = __riscv_vreinterpret_v_u32mf2_u8mf2(__riscv_vmv_v_x_u32mf2(qh, vl / 4));
+
+ vuint8mf2_t p = __riscv_vle8_v_u8mf2(pow, vl);
+
+ vuint16m1_t tq0 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vv_u8mf2(tq, p, vl), 3, vl), 8, vl);
+
+ vint16m1_t q80 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 240, vl), vl);
+
+ vint16m1_t sum0 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq0, 1, vl)), q80, vl);
+ suml3 = __riscv_vwcvt_x_x_v_i32m2(sum0, vl);
+ }
+
+ vint32m2_t sumb = __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(suml1, 0), __riscv_vget_v_i32m4_i32m2(suml1, 1), 16);
+ sumb = __riscv_vadd_vv_i32m2(sumb, suml2, 16);
+ sumb = __riscv_vadd_vv_i32m2(sumb, suml3, 16);
+
+ vint32m1_t sum = __riscv_vredsum_vs_i32m2_i32m1(sumb, __riscv_vmv_v_x_i32m1(0, 1), 16);
+ sumf += __riscv_vmv_x_s_i32m1_i32(sum) * y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+ }
+
+ *s = sumf;
+}
+
+void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined __riscv_v_intrinsic
+ switch (__riscv_vlenb() * 8) {
+ case 256:
+ ggml_vec_dot_tq1_0_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ default:
+ ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ }
+#else
+ ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+static void ggml_vec_dot_tq2_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_tq2_0 * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+
+ float sumf = 0.0f;
+ for (int i = 0; i < nb; ++i) {
+ int32_t sumi = 0;
+
+ for (size_t j = 0; j < sizeof(x[0].qs); j += 32) {
+ const int8_t * py0 = &y[i].qs[j * 4 + 0 * 32];
+ const int8_t * py1 = &y[i].qs[j * 4 + 1 * 32];
+ const int8_t * py2 = &y[i].qs[j * 4 + 2 * 32];
+ const int8_t * py3 = &y[i].qs[j * 4 + 3 * 32];
+ const uint8_t* px = &x[i].qs[j];
+
+ size_t vlmax_16m2 = __riscv_vsetvl_e16m2(32);
+ vint16m2_t vacc16 = __riscv_vmv_v_x_i16m2(0, vlmax_16m2);
+
+ size_t vl = __riscv_vsetvl_e8m1(32);
+
+ vuint8m1_t vx_u8 = __riscv_vle8_v_u8m1(px, vl);
+
+ vint8m1_t vy0 = __riscv_vle8_v_i8m1(py0 , vl);
+ vint8m1_t vy1 = __riscv_vle8_v_i8m1(py1, vl);
+ vint8m1_t vy2 = __riscv_vle8_v_i8m1(py2, vl);
+ vint8m1_t vy3 = __riscv_vle8_v_i8m1(py3, vl);
+
+ // l=0 (bits 1:0)
+ vuint8m1_t t0 = __riscv_vand_vx_u8m1(vx_u8, 0x03, vl);
+ vint8m1_t vq0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t0), 1, vl);
+
+ // l=1 (bits 3:2)
+ vuint8m1_t t1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(vx_u8, 2, vl), 0x03, vl);
+ vint8m1_t vq1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t1), 1, vl);
+
+ // l=2 (bits 5:4)
+ vuint8m1_t t2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(vx_u8, 4, vl), 0x03, vl);
+ vint8m1_t vq2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t2), 1, vl);
+
+ // l=3 (bits 7:6)
+ vuint8m1_t t3 = __riscv_vsrl_vx_u8m1(vx_u8, 6, vl); // No final AND needed as vsrl shifts in zeros
+ vint8m1_t vq3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t3), 1, vl);
+
+ // 4. Multiply and accumulate
+ vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq0, vy0, vl);
+ vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq1, vy1, vl);
+ vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq2, vy2, vl);
+ vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq3, vy3, vl);
+
+ vlmax_16m2 = __riscv_vsetvl_e16m2(32);
+ vint32m1_t vzero32 = __riscv_vmv_v_x_i32m1(0, 1);
+ vint32m1_t vred32 = __riscv_vwredsum_vs_i16m2_i32m1(vacc16, vzero32, vlmax_16m2);
+
+ sumi += __riscv_vmv_x_s_i32m1_i32(vred32);
+ }
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+ sumf += (float)sumi * d;
+ }
+
+ *s = sumf;
+}
+
+void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined __riscv_v_intrinsic
+ switch (__riscv_vlenb() * 8) {
+ case 256:
+ ggml_vec_dot_tq2_0_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ default:
+ ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ }
+#else
+ ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+static void ggml_vec_dot_iq1_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_iq1_s * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+
+ float sumf = 0;
+ for (int i = 0; i < nb; ++i) {
+ // Load qh once for the entire superblock.
+ vuint16mf2_t qh = __riscv_vle16_v_u16mf2(x[i].qh, 8);
+
+ // Calculate ls.
+ vuint16mf2_t temp = __riscv_vsrl_vx_u16mf2(qh, 12, 8);
+ temp = __riscv_vand_vx_u16mf2(temp, 7, 8);
+ vint32m1_t ls = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwmulu_vx_u32m1(temp, 2, 8));
+ ls = __riscv_vadd_vx_i32m1(ls, 1, 8);
+
+ // Calculate delta.
+ vbool32_t mask = __riscv_vmseq_vx_u16mf2_b32(__riscv_vand_vx_u16mf2(qh, 0x8000, 8), 0, 8);
+ vint32m1_t delta_neg = __riscv_vmv_v_x_i32m1(-1, 8);
+ vint32m1_t delta_pos = __riscv_vmv_v_x_i32m1(1, 8);
+ vint32m1_t delta = __riscv_vmerge_vvm_i32m1(delta_neg, delta_pos, mask, 8);
+
+ // Load qs.
+ vuint8m1_t qs = __riscv_vle8_v_u8m1(x[i].qs, 32);
+
+ // Prepare the indices.
+ const uint64_t shift = 0x0009000600030000;
+ vuint16m2_t qh_shift = __riscv_vreinterpret_v_u64m2_u16m2(__riscv_vmv_v_x_u64m2(shift, 8));
+ vuint16m2_t qh_gather_index = __riscv_vreinterpret_v_i16m2_u16m2(
+ __riscv_vdiv_vx_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(32)), 4, 32));
+ vuint16m2_t qh_ext = __riscv_vlmul_ext_v_u16m1_u16m2(__riscv_vlmul_ext_v_u16mf2_u16m1(qh));
+ vuint16m2_t qh_index = __riscv_vrgather_vv_u16m2(qh_ext, qh_gather_index, 32);
+ qh_index = __riscv_vsrl_vv_u16m2(qh_index, qh_shift, 32);
+ qh_index = __riscv_vand_vx_u16m2(qh_index, 7, 32);
+ qh_index = __riscv_vsll_vx_u16m2(qh_index, 8, 32);
+ qh_index = __riscv_vor_vv_u16m2(qh_index, __riscv_vzext_vf2_u16m2(qs, 32), 32);
+ vuint16m2_t index = __riscv_vsll_vx_u16m2(qh_index, 3, 32);
+
+ // Final lsums.
+ int32_t lsums_s[8];
+ vint32m1_t one_scalar = __riscv_vmv_v_x_i32m1(0, 1);
+
+ // Sub-blocks 1-4
+ {
+ vuint16m1_t grid_index0 = __riscv_vget_v_u16m2_u16m1(index, 0);
+ vint8m4_t grid0 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index0, 16));
+ vint8m4_t q80 = __riscv_vle8_v_i8m4(y[i].qs, 128);
+ vint16m8_t lsum0 = __riscv_vwmul_vv_i16m8(grid0, q80, 128);
+ lsums_s[0] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 0), one_scalar, 32));
+ lsums_s[1] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 1), one_scalar, 32));
+ lsums_s[2] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 2), one_scalar, 32));
+ lsums_s[3] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 3), one_scalar, 32));
+ }
+ __asm__ __volatile__("" ::: "memory");
+ // Sub-blocks 5-8
+ {
+ vuint16m1_t grid_index1 = __riscv_vget_v_u16m2_u16m1(index, 1);
+ vint8m4_t grid1 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index1, 16));
+ vint8m4_t q81 = __riscv_vle8_v_i8m4(&y[i].qs[128], 128);
+ vint16m8_t lsum1 = __riscv_vwmul_vv_i16m8(grid1, q81, 128);
+ lsums_s[4] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 0), one_scalar, 32));
+ lsums_s[5] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 1), one_scalar, 32));
+ lsums_s[6] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 2), one_scalar, 32));
+ lsums_s[7] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 3), one_scalar, 32));
+ }
+ __asm__ __volatile__("" ::: "memory");
+ vint32m1_t lsums = __riscv_vle32_v_i32m1(&lsums_s[0], 8);
+
+ // Calculate the bsums.
+ vint16m1_t bsums_0 = __riscv_vle16_v_i16m1(y[i].bsums, 16);
+ const vuint32m1_t bsums_i32 = __riscv_vreinterpret_v_u16m1_u32m1(__riscv_vreinterpret_v_i16m1_u16m1(bsums_0));
+ const vint16mf2_t bsums_i32_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(bsums_i32, 0, 8));
+ const vint16mf2_t bsums_i32_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(bsums_i32, 16, 8));
+ const vint32m1_t bsums = __riscv_vwadd_vv_i32m1(bsums_i32_0, bsums_i32_1, 8);
+
+ // Accumulation.
+ vint32m1_t sumi_v = __riscv_vmul_vv_i32m1(ls, lsums, 8);
+ vint32m1_t sumi1_v = __riscv_vmul_vv_i32m1(__riscv_vmul_vv_i32m1(ls, delta, 8), bsums, 8);
+
+ // Update sumf.
+ int sumi = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m1_i32m1(sumi_v, __riscv_vmv_v_x_i32m1(0.0f, 1), 8));
+ int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m1_i32m1(sumi1_v, __riscv_vmv_v_x_i32m1(0.0f, 1), 8));
+ sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+ }
+
+ *s = sumf;
+}
+
+void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined __riscv_v_intrinsic
+ switch (__riscv_vlenb() * 8) {
+ case 256:
+ ggml_vec_dot_iq1_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ default:
+ ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ }
+#else
+ ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+static void ggml_vec_dot_iq1_m_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_iq1_m * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+
+ iq1m_scale_t scale;
+ float sumf = 0.0f;
+ for (int i = 0; i < nb; ++i) {
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+ // Accumulators.
+ vint32m2_t acc1 = __riscv_vmv_v_x_i32m2(0, 16);
+ vint32m2_t acc2 = __riscv_vmv_v_x_i32m2(0, 16);
+
+ // We process 4 sub-blocks together.
+ for (int ib = 0; ib < QK_K/128; ib++) {
+ // Load qh for 4 sub-blocks.
+ const vuint8mf4_t qh_8 = __riscv_vle8_v_u8mf4(qh, 8);
+ const vuint16mf2_t qh_16_lo = __riscv_vzext_vf2_u16mf2(qh_8, 8);
+ const vuint16mf2_t qh_16_hi = __riscv_vsll_vx_u16mf2(qh_16_lo, 8, 8);
+ const vuint16m1_t qhb = __riscv_vzext_vf2_u16m1(
+ __riscv_vreinterpret_v_u16mf2_u8mf2(__riscv_vor_vv_u16mf2(qh_16_lo, qh_16_hi, 8)), 16);
+ qh += 8;
+
+ // Prepare grid indices.
+ const vuint16m1_t qsb = __riscv_vzext_vf2_u16m1(__riscv_vle8_v_u8mf2(&qs[0], 16), 16);
+ const vuint16m1_t shift = __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vmv_v_x_u32m1(0x00040008, 8));
+ vuint16m1_t index = __riscv_vor_vv_u16m1(qsb, __riscv_vand_vx_u16m1(__riscv_vsll_vv_u16m1(qhb, shift, 16), 0x700, 16), 16);
+ index = __riscv_vsll_vx_u16m1(index, 3, 16);
+ qs += 16;
+
+ // Load the grid.
+ const vint8m4_t iq1b = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vreinterpret_v_u64m4_i64m4(
+ __riscv_vluxei16_v_u64m4(iq1s_grid, index, 16)));
+
+ // Prepare the deltas.
+ const vbool16_t mask = __riscv_vmsgtu_vx_u16m1_b16(
+ __riscv_vand_vv_u16m1(qhb, __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vmv_v_x_u32m1(0x00800008, 8)), 16), 0, 16);
+ const vint64m4_t delta_pos = __riscv_vmv_v_x_i64m4(0x0101010101010101, 16);
+ const vint64m4_t delta_neg = __riscv_vmv_v_x_i64m4(0xffffffffffffffff, 16);
+ const vint8m4_t delta = __riscv_vreinterpret_v_i64m4_i8m4(
+ __riscv_vmerge_vvm_i64m4(delta_pos, delta_neg, mask, 16));
+
+ // Load q8 for sub-blocks.
+ const vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 128);
+ q8 += 128;
+
+ // Calculate the lsums.
+ const vint16m8_t lsum1 = __riscv_vwmul_vv_i16m8(iq1b, q8b, 128);
+ const vint16m8_t lsum2 = __riscv_vwmul_vv_i16m8(delta, q8b, 128);
+
+ // Prepare the scales.
+ const int16_t ls_0_0 = 2*((sc[0] >> 0) & 0x7) + 1;
+ const int16_t ls_0_1 = 2*((sc[0] >> 3) & 0x7) + 1;
+ const int16_t ls_1_0 = 2*((sc[0] >> 6) & 0x7) + 1;
+ const int16_t ls_1_1 = 2*((sc[0] >> 9) & 0x7) + 1;
+ const int16_t ls_2_0 = 2*((sc[1] >> 0) & 0x7) + 1;
+ const int16_t ls_2_1 = 2*((sc[1] >> 3) & 0x7) + 1;
+ const int16_t ls_3_0 = 2*((sc[1] >> 6) & 0x7) + 1;
+ const int16_t ls_3_1 = 2*((sc[1] >> 9) & 0x7) + 1;
+ sc += 2;
+
+ // Accumulate in acc0 and acc1 for each sub-block.
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_0_0, __riscv_vget_v_i16m8_i16m1(lsum1, 0), 16);
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_0_1, __riscv_vget_v_i16m8_i16m1(lsum1, 1), 16);
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_0_0, __riscv_vget_v_i16m8_i16m1(lsum2, 0), 16);
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_0_1, __riscv_vget_v_i16m8_i16m1(lsum2, 1), 16);
+ //
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_1_0, __riscv_vget_v_i16m8_i16m1(lsum1, 2), 16);
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_1_1, __riscv_vget_v_i16m8_i16m1(lsum1, 3), 16);
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_1_0, __riscv_vget_v_i16m8_i16m1(lsum2, 2), 16);
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_1_1, __riscv_vget_v_i16m8_i16m1(lsum2, 3), 16);
+ //
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_2_0, __riscv_vget_v_i16m8_i16m1(lsum1, 4), 16);
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_2_1, __riscv_vget_v_i16m8_i16m1(lsum1, 5), 16);
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_2_0, __riscv_vget_v_i16m8_i16m1(lsum2, 4), 16);
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_2_1, __riscv_vget_v_i16m8_i16m1(lsum2, 5), 16);
+ //
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_3_0, __riscv_vget_v_i16m8_i16m1(lsum1, 6), 16);
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_3_1, __riscv_vget_v_i16m8_i16m1(lsum1, 7), 16);
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_3_0, __riscv_vget_v_i16m8_i16m1(lsum2, 6), 16);
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_3_1, __riscv_vget_v_i16m8_i16m1(lsum2, 7), 16);
+ }
+
+ // Reduce and accumulate in `sumf`.
+ vint32m1_t one = __riscv_vmv_v_x_i32m1(0, 1);
+ int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(acc1, one, 16));
+ int sumi2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(acc2, one, 16));
+ sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (sumi1 + IQ1M_DELTA * sumi2);
+ }
+
+ *s = sumf;
+}
+
+void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined __riscv_v_intrinsic
+ switch (__riscv_vlenb() * 8) {
+ case 256:
+ ggml_vec_dot_iq1_m_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ default:
+ ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+ break;
+ }
+#else
+ ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 1e8b50321..86e91b84d 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1140,7 +1140,6 @@ struct ggml_cuda_graph_node_properties {
};
static_assert(std::is_trivial::value, "ggml_cuda_graph_node_properties must be trivial");
-static bool cugraph_warned_rec = false;
struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
~ggml_cuda_graph() {
@@ -1156,8 +1155,7 @@ struct ggml_cuda_graph {
size_t num_nodes = 0;
std::vector nodes;
bool disable_due_to_gpu_arch = false;
- bool disable_due_to_too_many_updates = false;
- int number_consecutive_updates = 0;
+ bool warmup_complete = false;
std::vector props;
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
@@ -1166,25 +1164,9 @@ struct ggml_cuda_graph {
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
std::vector extra;
- void record_update(bool use_graph, bool update_required) {
- if (use_graph && update_required) {
- number_consecutive_updates++;
- } else {
- number_consecutive_updates = 0;
- }
- if (number_consecutive_updates >= 4) {
- if(!cugraph_warned_rec)
- {
- cugraph_warned_rec = true;
- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
- }
- disable_due_to_too_many_updates = true;
- }
- }
-
bool is_enabled() const {
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
- return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
+ return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env);
}
#endif
};
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c611a7766..0c78a141f 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3014,10 +3014,6 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
- if (graph->instance == nullptr) {
- res = true;
- }
-
// Check if the graph size has changed
if (graph->props.size() != (size_t)cgraph->n_nodes) {
res = true;
@@ -3971,14 +3967,35 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
#ifdef USE_CUDA_GRAPH
graph_key = ggml_cuda_graph_get_key(cgraph);
- use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
+ ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
if (graph->is_enabled()) {
- cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
- use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
+ const bool graph_compatible = ggml_cuda_graph_check_compability(cgraph);
+ if (graph_compatible) {
+ const bool properties_changed = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
- graph->record_update(use_cuda_graph, cuda_graph_update_required);
+ if (!graph->warmup_complete) {
+ // Warmup: need at least 2 calls with no property change on the 2nd call
+ if (!properties_changed) {
+ graph->warmup_complete = true;
+ GGML_LOG_DEBUG("%s: CUDA graph warmup complete\n", __func__);
+ use_cuda_graph = true;
+ cuda_graph_update_required = true;
+ }
+ // else: properties changed or first call - execute directly (use_cuda_graph stays false)
+ } else {
+ // Post-warmup: normal CUDA graph operation
+ if (properties_changed) {
+ // Properties changed - reset warmup, execute directly until stable again
+ graph->warmup_complete = false;
+ GGML_LOG_DEBUG("%s: CUDA graph warmup reset\n", __func__);
+ } else {
+ use_cuda_graph = true;
+ cuda_graph_update_required = graph->instance == nullptr;
+ }
+ }
+ }
}
#endif // USE_CUDA_GRAPH
diff --git a/include/llama.h b/include/llama.h
index 2667f685c..f273a3b11 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -392,6 +392,7 @@ extern "C" {
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
+ bool dry_run; // calculate and show the final quantization size without performing quantization
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp
index 6a97faf07..2d6f9e7c5 100644
--- a/src/llama-impl.cpp
+++ b/src/llama-impl.cpp
@@ -109,9 +109,9 @@ std::string llama_format_tensor_shape(const std::vector & ne) {
std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
char buf[256];
- snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
+ snprintf(buf, sizeof(buf), "%6" PRId64, t->ne[0]);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
- snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]);
}
return buf;
}
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 680d8852c..9a176b290 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -479,6 +479,17 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
return new_size;
}
+static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type, const llama_ftype ftype) {
+ return (
+ dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS ||
+ dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S ||
+ dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M ||
+ ( // Q2_K_S is the worst k-quant type - only allow it without imatrix for token embeddings
+ dst_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0
+ )
+ );
+}
+
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
@@ -735,24 +746,36 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
};
const auto tn = LLM_TN(model.arch);
- new_ofstream(0);
+
+ // no output file for --dry-run
+ if (!params->dry_run) {
+ new_ofstream(0);
+ }
+
+ // flag for `--dry-run`, to let the user know if imatrix will be required for a real
+ // quantization, as a courtesy
+ bool will_require_imatrix = false;
+
for (const auto * it : tensors) {
const auto & weight = *it;
ggml_tensor * tensor = weight.tensor;
- if (weight.idx != cur_split && params->keep_split) {
+ if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
close_ofstream();
new_ofstream(weight.idx);
}
const std::string name = ggml_get_name(tensor);
+ const size_t tensor_size = ggml_nbytes(tensor);
- if (!ml.use_mmap) {
- if (read_data.size() < ggml_nbytes(tensor)) {
- read_data.resize(ggml_nbytes(tensor));
+ if (!params->dry_run) {
+ if (!ml.use_mmap) {
+ if (read_data.size() < tensor_size) {
+ read_data.resize(tensor_size);
+ }
+ tensor->data = read_data.data();
}
- tensor->data = read_data.data();
+ ml.load_data_for(tensor);
}
- ml.load_data_for(tensor);
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
@@ -903,129 +926,155 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
quantize = tensor->type != new_type;
}
- if (!quantize) {
- new_type = tensor->type;
- new_data = tensor->data;
- new_size = ggml_nbytes(tensor);
- LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
- } else {
- const int64_t nelements = ggml_nelements(tensor);
-
- const float * imatrix = nullptr;
- if (imatrix_data) {
- auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
- if (it == imatrix_data->end()) {
- LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
- } else {
- if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
- imatrix = it->second.data();
- } else {
- LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
- int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
-
- // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
- // this is a significant error and it may be good idea to abort the process if this happens,
- // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
- // tok_embd should be ignored in this case, since it always causes this warning
- if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
- throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
- int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
- }
- }
+ // we have now decided on the target type for this tensor
+ if (params->dry_run) {
+ // the --dry-run option calculates the final quantization size without quantizting
+ if (quantize) {
+ new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
+ tensor_size/1024.0/1024.0,
+ new_size/1024.0/1024.0,
+ ggml_type_name(new_type));
+ if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
+ will_require_imatrix = true;
}
- }
- if ((new_type == GGML_TYPE_IQ2_XXS ||
- new_type == GGML_TYPE_IQ2_XS ||
- new_type == GGML_TYPE_IQ2_S ||
- new_type == GGML_TYPE_IQ1_S ||
- (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
- (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
- LLAMA_LOG_ERROR("\n\n============================================================\n");
- LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
- LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
- LLAMA_LOG_ERROR("============================================================\n\n");
- throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
- }
-
- float * f32_data;
-
- if (tensor->type == GGML_TYPE_F32) {
- f32_data = (float *) tensor->data;
- } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
- throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
- llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
- f32_data = (float *) f32_conv_buf.data();
+ new_size = tensor_size;
+ LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0);
}
+ total_size_org += tensor_size;
+ total_size_new += new_size;
+ continue;
+ } else {
+ // no --dry-run, perform quantization
+ if (!quantize) {
+ new_type = tensor->type;
+ new_data = tensor->data;
+ new_size = tensor_size;
+ LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0);
+ } else {
+ const int64_t nelements = ggml_nelements(tensor);
- LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
- fflush(stdout);
+ const float * imatrix = nullptr;
+ if (imatrix_data) {
+ auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
+ if (it == imatrix_data->end()) {
+ LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
+ } else {
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
+ imatrix = it->second.data();
+ } else {
+ LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
- if (work.size() < (size_t)nelements * 4) {
- work.resize(nelements * 4); // upper bound on size
- }
- new_data = work.data();
-
- const int64_t n_per_row = tensor->ne[0];
- const int64_t nrows = tensor->ne[1];
-
- static const int64_t min_chunk_size = 32 * 512;
- const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
-
- const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
- const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
- const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
-
- // quantize each expert separately since they have different importance matrices
- new_size = 0;
- for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
- const float * f32_data_03 = f32_data + i03 * nelements_matrix;
- void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
- const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
-
- new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
-
- // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
-#if 0
- if (new_type == GGML_TYPE_MXFP4) {
- auto * x = f32_data_03;
-
- //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
- std::vector deq(nrows*n_per_row);
- const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
- qtype->to_float(new_data_03, deq.data(), deq.size());
-
- double err = 0.0f;
- for (int i = 0; i < (int) deq.size(); ++i) {
- err += fabsf(deq[i] - x[i]);
- //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
- if (deq[i] != x[i]) {
- LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+ // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
+ // this is a significant error and it may be good idea to abort the process if this happens,
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
+ // tok_embd should be ignored in this case, since it always causes this warning
+ if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+ }
}
}
- //LLAMA_LOG_INFO("err = %f\n", err);
- GGML_ASSERT(err == 0.00000);
}
+ if (!imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
+ LLAMA_LOG_ERROR("\n\n============================================================\n");
+ LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+ LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
+ LLAMA_LOG_ERROR("============================================================\n\n");
+ throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
+ }
+
+ float * f32_data;
+
+ if (tensor->type == GGML_TYPE_F32) {
+ f32_data = (float *) tensor->data;
+ } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
+ } else {
+ llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
+ f32_data = (float *) f32_conv_buf.data();
+ }
+
+ LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
+ fflush(stdout);
+
+ if (work.size() < (size_t)nelements * 4) {
+ work.resize(nelements * 4); // upper bound on size
+ }
+ new_data = work.data();
+
+ const int64_t n_per_row = tensor->ne[0];
+ const int64_t nrows = tensor->ne[1];
+
+ static const int64_t min_chunk_size = 32 * 512;
+ const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
+
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
+
+ // quantize each expert separately since they have different importance matrices
+ new_size = 0;
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
+
+ new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+
+ // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+#if 0
+ if (new_type == GGML_TYPE_MXFP4) {
+ auto * x = f32_data_03;
+
+ //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+ std::vector deq(nrows*n_per_row);
+ const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+ qtype->to_float(new_data_03, deq.data(), deq.size());
+
+ double err = 0.0f;
+ for (int i = 0; i < (int) deq.size(); ++i) {
+ err += fabsf(deq[i] - x[i]);
+ //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+ if (deq[i] != x[i]) {
+ LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+ }
+ }
+ //LLAMA_LOG_INFO("err = %f\n", err);
+ GGML_ASSERT(err == 0.00000);
+ }
#endif
+ }
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0);
}
- LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
- }
- total_size_org += ggml_nbytes(tensor);
- total_size_new += new_size;
+ total_size_org += tensor_size;
+ total_size_new += new_size;
- // update the gguf meta data as we go
- gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
- GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
- gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
+ // update the gguf meta data as we go
+ gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
+ GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+ gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
- // write tensor data + padding
- fout.write((const char *) new_data, new_size);
- zeros(fout, GGML_PAD(new_size, align) - new_size);
+ // write tensor data + padding
+ fout.write((const char *) new_data, new_size);
+ zeros(fout, GGML_PAD(new_size, align) - new_size);
+ } // no --dry-run
+ } // iterate over tensors
+
+ if (!params->dry_run) {
+ close_ofstream();
}
- close_ofstream();
- LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
- LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements);
+ LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements);
+
+ if (!params->imatrix && params->dry_run && will_require_imatrix) {
+ LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n",
+ __func__
+ );
+ }
if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
@@ -1048,6 +1097,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
/*.only_copy =*/ false,
/*.pure =*/ false,
/*.keep_split =*/ false,
+ /*.dry_run =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.tensor_type =*/ nullptr,
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index d3755bcc8..9609ea32e 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -121,7 +121,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n");
- printf(" [--prune-layers] [--keep-split] [--override-kv]\n");
+ printf(" [--prune-layers] [--keep-split] [--override-kv] [--dry-run]\n");
printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
printf(" --allow-requantize\n");
printf(" allow requantizing tensors that have already been quantized\n");
@@ -157,7 +157,10 @@ static void usage(const char * executable) {
printf(" generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" override model metadata by key in the quantized model. may be specified multiple times.\n");
- printf(" WARNING: this is an advanced option, use with care.\n\n");
+ printf(" WARNING: this is an advanced option, use with care.\n");
+ printf(" --dry-run\n");
+ printf(" calculate and show the final quantization size without performing quantization\n");
+ printf(" example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n");
printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
printf("-----------------------------------------------------------------------------\n");
printf(" allowed quantization types\n");
@@ -533,6 +536,8 @@ int main(int argc, char ** argv) {
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]);
}
+ } else if (strcmp(argv[arg_idx], "--dry-run") == 0) {
+ params.dry_run = true;
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -631,22 +636,26 @@ int main(int argc, char ** argv) {
std::string ftype_str;
std::string suffix = ".gguf";
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
- std::string fpath;
- const size_t pos = fname_inp.find_last_of("/\\");
- if (pos != std::string::npos) {
- fpath = fname_inp.substr(0, pos + 1);
- }
+ // argv[arg_idx] is the ftype directly:
+ if (!params.dry_run) {
+ std::string fpath;
+ const size_t pos = fname_inp.find_last_of("/\\");
+ if (pos != std::string::npos) {
+ fpath = fname_inp.substr(0, pos + 1);
+ }
- // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
- fname_out = fpath + "ggml-model-" + ftype_str;
- if (!params.keep_split) {
- fname_out += suffix;
+ // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+ fname_out = fpath + "ggml-model-" + ftype_str;
+ if (!params.keep_split) {
+ fname_out += suffix;
+ }
}
arg_idx++;
if (ftype_str == "COPY") {
params.only_copy = true;
}
} else {
+ // argv[arg_idx] is not a valid ftype, so treat it as output path: