kvcache-ai-ktransformers/third_party/llamafile/iqk_mul_mat_arm.inc
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc
// Copyright 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp fenc=utf-8 :vi
//
// Copyright 2024 Iwan Kawrakow
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include <type_traits>
#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64)
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "sgemm.h"
// For i-quants, I had to explicitly specify which
// functions to inline / not inline (at least for some
// of the functions), else performance would be significantly
// lower. This is worrisome, as things can change with,
// e.g., a different compiler version or when running on a
// different CPU.
#ifdef _MSC_VER
#define IQK_NOINLINE __declspec(noinline)
#define IQK_ALWAYS_INLINE inline
#else
#define IQK_NOINLINE __attribute__((__noinline__))
#define IQK_ALWAYS_INLINE __attribute__((always_inline))
#endif
#define GGML_COMMON_IMPL_C
#include "llama.cpp/ggml-common.h"
// clang-format off
// This matrix-vector and matrix-matrix multiplication implementation
// for legacy quants, k-quants and i-quants makes prompt processing 150-200%
// (legacy and k-quants) or 250-400% (i-quants) faster compared to mainline
// llama.cpp (and llamafile).
// It provides implementations for ARM_NEON (all quants) and AVX2
// (all quants except sub-4-bit i-quants).
//
// The main idea is that unpacking the quants and the block scales so they are
// ready for dot products with the corresponding Q8_Y quants takes time
// (here 'Y' stands for K, 0, or 1, depending on the quantization type).
// Hence, if we are performing a QX x Q8_Y matrix-matrix multiplication
// (as needed for prompt processing), we can get a significant speedup by
// reusing the unpacked QX quants and scales for the multiplication with
// several Q8_K columns. We also perform fewer loads from memory, which is the
// main purpose of tiling in general-purpose matrix multiplication packages.
// A minimal sketch of this reuse pattern follows the includes below.
#include <utility>
#include <array>
#endif
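// The helper below is an illustrative sketch only and is not used by the
// kernels in this file; its name (iqk_tiling_sketch) and arguments are
// hypothetical. It shows the reuse pattern described above: the left-hand row
// is unpacked (dequantized) once and then multiplied with several right-hand
// columns, instead of being re-unpacked for every column as a plain
// matrix-vector loop would do.
static inline void iqk_tiling_sketch(const signed char * x_quants, float x_scale,
                                     const float * y, int ncols, int k, float * out) {
    if (k > 256) return;                       // the sketch only handles short rows
    float unpacked[256];
    for (int i = 0; i < k; ++i) unpacked[i] = x_scale * x_quants[i];   // unpack once ...
    for (int col = 0; col < ncols; ++col) {    // ... reuse for every right-hand column
        float sum = 0;
        for (int i = 0; i < k; ++i) sum += unpacked[i] * y[col*k + i];
        out[col] = sum;
    }
}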
constexpr ggml_type GGML_TYPE_Q8_0_X4 = static_cast<ggml_type>(98);
constexpr ggml_type GGML_TYPE_Q8_1_X4 = static_cast<ggml_type>(99);
namespace {
#define GEMV_Q4K
#define GEMV_Q6K
#define GEMM_Q4K_Q6K
typedef struct {
    int32_t i1;
    int32_t i2;
} mmid_row_mapping;
struct DataInfo {
    float * s;
    const char * cy;
    size_t bs;
    size_t by;
    int cur_y = 0;
    int ne11;
    const mmid_row_mapping * row_mapping = nullptr;
    size_t bs2 = 0;
    inline const char * src1_row(int iy) const {
        if (!row_mapping) return cy + (cur_y + iy)*by;
        int i11 = row_mapping[cur_y + iy].i1 % ne11;
        int i12 = row_mapping[cur_y + iy].i2;
        return cy + (i11 + i12*ne11)*by;
    }
    inline void store(int ix, int iy, float result) const {
        *(dst_row(iy) + ix) = result;
        //dst_row(iy)[ix] = result;
    }
    inline float* ptr(int ix, int iy) const {
        return dst_row(iy) + ix;
    }
    inline float * dst_row(int iy) const {
        if (!row_mapping) return s + (cur_y + iy)*bs;
        int i12 = row_mapping[cur_y + iy].i2;
        int i1 = row_mapping[cur_y + iy].i1;
        int i2 = i12;
        return s + i1*bs + i2*bs2;
    }
};
/*
moonll:
changed the parameters of set_mul_mat
added func16
*/
typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x);
typedef void (*mul_mat_t_v2)(int m, int n, int k, const void * vx, size_t bx, const DataInfo& info);
struct MulMat {
    std::array<mul_mat_t, 8> funcs = {};
    mul_mat_t func16 = nullptr;
    mul_mat_t_v2 funcs_v2;
    //inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
    IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
        constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile sizes are small)
        if (func16 && nrc_y >= 16) {
            int n_step = (nrc_y - info.cur_y)/16;
            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
                auto this_info = info;
                this_info.s += ix;
                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
                for (int iy = 0; iy < n_step; ++iy) {
                    func16(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
                    this_info.cur_y += 16;
                }
            }
            info.cur_y += 16 * n_step;
            if (info.cur_y == nrc_y) return;
        }
        int n_step = (nrc_y - info.cur_y)/funcs.size();
        if (n_step > 0) {
            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
                auto this_info = info;
                this_info.s += ix;
                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
                for (int iy = 0; iy < n_step; ++iy) {
                    funcs.back()(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
                    this_info.cur_y += funcs.size();
                }
            }
            info.cur_y += funcs.size() * n_step;
        }
        int n_left = nrc_y - info.cur_y;
        if (n_left > 0) {
            funcs[n_left-1](n, vx, bx, info, nrc_x);
        }
    }
#if defined __x86_64__ || defined(_M_X64)
    static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny);
#else
    IQK_NOINLINE void mul_mat_NxM_v2(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
        funcs_v2(nrc_x, nrc_y, n, vx, bx, info);
        return;
    }
    static IQK_NOINLINE bool set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny);
#endif
private:
    template <typename Dequantizer> static IQK_NOINLINE void set_functions(MulMat& m);
};
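// Hypothetical usage sketch, not part of the real dispatch path: the names
// iqk_dummy_kernel and iqk_mulmat_usage_sketch are illustrative only. In the
// real code path set_mul_mat() (declared above) fills funcs/func16 with the
// type-specific kernels; here a dummy kernel stands in just to show how
// DataInfo and mul_mat_NxM() fit together. The kernel in funcs[i] handles
// i+1 right-hand columns at a time, and mul_mat_NxM() tiles over the
// left-hand rows in steps of k_x_step while stepping cur_y over the columns.
static void iqk_dummy_kernel(int /*n*/, const void * /*vx*/, size_t /*bx*/,
                             const DataInfo& info, int nrc_x) {
    for (int ix = 0; ix < nrc_x; ++ix) info.store(ix, 0, 0.0f);   // mark the first output row of this tile
}
static inline void iqk_mulmat_usage_sketch(const void * vx, size_t bx, int n,
                                           float * dst, size_t dst_stride,
                                           const char * q8, size_t q8_row_size,
                                           int nrc_x, int nrc_y) {
    MulMat mm;
    mm.funcs.fill(iqk_dummy_kernel);                         // normally done by set_mul_mat()
    DataInfo info{dst, q8, dst_stride, q8_row_size, 0, nrc_y, nullptr, 0};
    mm.mul_mat_NxM(n, vx, bx, info, nrc_x, nrc_y);
}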
// Unpack the 12 packed scale/min bytes of a K-quant super-block (Q4_K layout)
// into four 32-bit words, one 6-bit value per byte: aux32[0] and aux32[1] hold
// the eight block scales, aux32[2] and aux32[3] the eight block mins.
inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
    const uint16_t * scales = (const uint16_t *)scales8;
    const uint32_t a0 = scales[0] | (scales[1] << 16);
    const uint32_t a1 = scales[2] | (scales[3] << 16);
    const uint32_t a2 = scales[4] | (scales[5] << 16);
    aux32[3] = ((a2 >> 4) & 0x0f0f0f0f) | ((a1 >> 2) & 0x30303030);
    aux32[1] = ((a2 >> 0) & 0x0f0f0f0f) | ((a0 >> 2) & 0x30303030);
    aux32[2] = a1 & 0x3f3f3f3f;
    aux32[0] = a0 & 0x3f3f3f3f;
}
/*
moonll:
IQ1_S grid decoding tables (2048 entries each; byte-packed uint64_t under
__AVX2__, nibble-packed uint32_t otherwise)
*/
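// Illustrative helper (hypothetical, unused by the kernels below): the tables
// that follow hold 2048 entries, one per 11-bit IQ1_S grid index. The AVX2
// variant packs eight values per uint64_t entry (one byte each), the other
// variant packs eight values per uint32_t entry (one nibble each); every
// packed value is 0, 1 or 2. The lane ordering and the mapping of these three
// levels onto the signed IQ1_S values are left to the kernels that consume the
// tables and are not assumed here; this sketch only shows the packing
// granularity of the nibble-packed variant.
static inline void iq1s_grid_entry_unpack_sketch(uint32_t entry, unsigned char * out8) {
    for (int j = 0; j < 8; ++j) {
        out8[j] = (unsigned char)((entry >> 4*j) & 0xf);   // one packed value (0, 1 or 2) per nibble
    }
}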
#ifdef __AVX2__
static const uint64_t iq1s_grid_us[2048] = {
0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200,
0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000,
0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101,
0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101,
0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202,
0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200,
0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001,
0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202,
0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201,
0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001,
0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101,
0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101,
0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202,
0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200,
0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201,
0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002,
0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101,
0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200,
0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102,
0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101,
0x0000010001010202, 0x0000010001020100, 0x0000010001020101, 0x0000010002010001,
0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100,
0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200,
0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101,
0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100,
0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000,
0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202,
0x0000010102000001, 0x0000010102010001, 0x0000010102010101, 0x0000010102010200,
0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101,
0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201,
0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002,
0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001,
0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001,
0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002,
0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000,
0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101,
0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000,
0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101,
0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202,
0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201,
0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000,
0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100,
0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102,
0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002,
0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000,
0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101,
0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101,
0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200,
0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002,
0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001,
0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101,
0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101,
0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101,
0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102,
0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100,
0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002,
0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100,
0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000,
0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101,
0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101,
0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001,
0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102,
0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201,
0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 0x0001000201010202,
0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001,
0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001,
0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101,
0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102,
0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200,
0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101,
0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 0x0001010001020101,
0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000,
0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201,
0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101,
0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202,
0x0001010100010001, 0x0001010100010100, 0x0001010100010101, 0x0001010100010102,
0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101,
0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100,
0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002,
0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201,
0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101,
0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002,
0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202,
0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101,
0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000,
0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100,
0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102,
0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102,
0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101,
0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101,
0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001,
0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201,
0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002,
0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001,
0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100,
0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101,
0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001,
0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101,
0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000,
0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001,
0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101,
0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101,
0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000,
0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001,
0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001,
0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102,
0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102,
0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101,
0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201,
0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202,
0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202,
0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101,
0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001,
0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000,
0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101,
0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200,
0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100,
0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100,
0x0002000101000201, 0x0002000101010101, 0x0002000101010200, 0x0002000101010202,
0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102,
0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201,
0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202,
0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002,
0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001,
0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001,
0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101,
0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202,
0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201,
0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102,
0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200,
0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001,
0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101,
0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201,
0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001,
0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002,
0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000,
0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202,
0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201,
0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201,
0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101,
0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100,
0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000,
0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101,
0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202,
0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101,
0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202,
0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202,
0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201,
0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002,
0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102,
0x0002020101020201, 0x0002020102000101, 0x0002020102010000, 0x0002020102010102,
0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000,
0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000,
0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101,
0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101,
0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202,
0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200,
0x0002020202020202, 0x0100000000000101, 0x0100000000010001, 0x0100000000010102,
0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101,
0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100,
0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001,
0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 0x0100000100000100,
0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101,
0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001,
0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200,
0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101,
0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101,
0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100,
0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101,
0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101,
0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101,
0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202,
0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100,
0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201,
0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202,
0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102,
0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200,
0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201,
0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000,
0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002,
0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100,
0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000,
0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100,
0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000,
0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102,
0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100,
0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002,
0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001,
0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201,
0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202,
0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100,
0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001,
0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002,
0x0100010201000101, 0x0100010201000200, 0x0100010201010000, 0x0100010201010001,
0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201,
0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001,
0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101,
0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101,
0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101,
0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101,
0x0100020001010200, 0x0100020001020001, 0x0100020001020100, 0x0100020001020102,
0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100,
0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001,
0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000,
0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001,
0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101,
0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100,
0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000,
0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202,
0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101,
0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100,
0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100,
0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200,
0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100,
0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101,
0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101,
0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201,
0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001,
0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201,
0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201,
0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001,
0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200,
0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100,
0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201,
0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200,
0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101,
0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001,
0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102,
0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001,
0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201,
0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100,
0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000,
0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102,
0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001,
0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202,
0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102,
0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 0x0101000202000101,
0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201,
0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101,
0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102,
0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101,
0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100,
0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202,
0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101,
0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202,
0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101,
0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200,
0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101,
0x0101010002010102, 0x0101010002010201, 0x0101010002020001, 0x0101010002020100,
0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002,
0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201,
0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100,
0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202,
0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102,
0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002,
0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200,
0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002,
0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200,
0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001,
0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200,
0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100,
0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000,
0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102,
0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100,
0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000,
0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102,
0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100,
0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000,
0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101,
0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001,
0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201,
0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002,
0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200,
0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100,
0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101,
0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202,
0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002,
0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201,
0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201,
0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001,
0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202,
0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102,
0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002,
0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201,
0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200,
0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002,
0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100,
0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101,
0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102,
0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002,
0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200,
0x0101020101010201, 0x0101020101010202, 0x0101020101020001, 0x0101020101020100,
0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001,
0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100,
0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201,
0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101,
0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102,
0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201,
0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200,
0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200,
0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002,
0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202,
0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102,
0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000,
0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202,
0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201,
0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001,
0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002,
0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102,
0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001,
0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101,
0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202,
0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102,
0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201,
0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101,
0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101,
0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001,
0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202,
0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000,
0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202,
0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102,
0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002,
0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201,
0x0102010002010000, 0x0102010002010002, 0x0102010002010100, 0x0102010002010101,
0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001,
0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200,
0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102,
0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102,
0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100,
0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001,
0x0102010101020100, 0x0102010101020101, 0x0102010101020102, 0x0102010101020201,
0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001,
0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202,
0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200,
0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 0x0102010200020000,
0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000,
0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001,
0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200,
0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200,
0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202,
0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201,
0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202,
0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001,
0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001,
0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200,
0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000,
0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102,
0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101,
0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100,
0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000,
0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100,
0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100,
0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102,
0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201,
0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202,
0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102,
0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102,
0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202,
0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202,
0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100,
0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000,
0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101,
0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202,
0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102,
0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100,
0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101,
0x0200000101010102, 0x0200000101010200, 0x0200000101010201, 0x0200000101020100,
0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201,
0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101,
0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202,
0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200,
0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201,
0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200,
0x0200000202000202, 0x0200000202010101, 0x0200000202020000, 0x0200000202020002,
0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201,
0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101,
0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201,
0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201,
0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102,
0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101,
0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101,
0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101,
0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001,
0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000,
0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102,
0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101,
0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202,
0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202,
0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101,
0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000,
0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101,
0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202,
0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100,
0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000,
0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101,
0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202,
0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100,
0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100,
0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002,
0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100,
0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101,
0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202,
0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200,
0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100,
0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200,
0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002,
0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001,
0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101,
0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101,
0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 0x0201000001020202,
0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102,
0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100,
0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101,
0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100,
0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101,
0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101,
0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 0x0201000101020101,
0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101,
0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102,
0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100,
0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102,
0x0201000201000201, 0x0201000201010000, 0x0201000201010002, 0x0201000201010101,
0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101,
0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001,
0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101,
0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202,
0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102,
0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001,
0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102,
0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200,
0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101,
0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001,
0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201,
0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202,
0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102,
0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002,
0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200,
0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100,
0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001,
0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002,
0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201,
0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101,
0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100,
0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000,
0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200,
0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101,
0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200,
0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202,
0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100,
0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102,
0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102,
0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102,
0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101,
0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101,
0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000,
0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202,
0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102,
0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200,
0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101,
0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101,
0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100,
0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202,
0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101,
0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201,
0x0201020201010000, 0x0201020201010101, 0x0201020201010200, 0x0201020201020001,
0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101,
0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200,
0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002,
0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001,
0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000,
0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101,
0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202,
0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100,
0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102,
0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200,
0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101,
0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201,
0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000,
0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202,
0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201,
0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200,
0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002,
0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101,
0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100,
0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001,
0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201,
0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000,
0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102,
0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001,
0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201,
0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100,
0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002,
0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001,
0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101,
0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002,
0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000,
0x0202010201020002, 0x0202010202000102, 0x0202010202010000, 0x0202010202010101,
0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100,
0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200,
0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200,
0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102,
0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200,
0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002,
0x0202020002020200, 0x0202020002020202, 0x0202020100000101, 0x0202020100010100,
0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001,
0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001,
0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102,
0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 0x0202020200000202,
0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202,
0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000,
0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101,
0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202,
};
#else
static const uint32_t iq1s_grid_us[2048] = {
0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
};
#endif
#ifndef HAVE_FANCY_SIMD
const uint64_t keven_signs[128] = {
0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff,
0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff,
0xff010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0xff010101ff01ffff,
0x01010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0x01010101ffffffff,
0xff0101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0xff0101ff0101ffff,
0x010101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0x010101ff01ffffff,
0x010101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0x010101ffff01ffff,
0xff0101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0xff0101ffffffffff,
0xff01ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0xff01ff010101ffff,
0x0101ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0x0101ff0101ffffff,
0x0101ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0x0101ff01ff01ffff,
0xff01ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0xff01ff01ffffffff,
0x0101ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0x0101ffff0101ffff,
0xff01ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0xff01ffff01ffffff,
0xff01ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0xff01ffffff01ffff,
0x0101ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0x0101ffffffffffff,
0xffff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0xffff01010101ffff,
0x01ff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0x01ff010101ffffff,
0x01ff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0x01ff0101ff01ffff,
0xffff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0xffff0101ffffffff,
0x01ff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0x01ff01ff0101ffff,
0xffff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0xffff01ff01ffffff,
0xffff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0xffff01ffff01ffff,
0x01ff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0x01ff01ffffffffff,
0x01ffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0x01ffff010101ffff,
0xffffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0xffffff0101ffffff,
0xffffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0xffffff01ff01ffff,
0x01ffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0x01ffff01ffffffff,
0xffffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0xffffffff0101ffff,
0x01ffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0x01ffffff01ffffff,
0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff,
0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff,
};
#endif
}
/* moonll change to mul_mat:
   add typeB and strideB */
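// Entry point for the tiled QX x Q8 matrix multiplication. The Nx rows of A are split
// into contiguous chunks of roughly Nx/nth rows and thread ith handles rows
// [first_x, first_x + nrc_x). strideA and strideB are row strides counted in units of
// the quant type's block size (they are multiplied by ggml_type_size below);
// stride_C is forwarded to DataInfo as the output stride in floats.
//
// A calling sketch (illustrative only -- the names, the Q8_K choice for typeB and the
// row-major C layout are assumptions, not documented API):
//
//     iqk_mul_mat(Nx, Ny, ne00,
//                 GGML_TYPE_Q4_K, W_data, ne00/QK_K,   // strideA: Q4_K blocks per row
//                 GGML_TYPE_Q8_K, X_data, ne00/QK_K,   // strideB: Q8_K blocks per row
//                 C, /*stride_C=*/Nx, ith, nth);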
bool iqk_mul_mat(long Nx, long Ny, long ne00,
int typeA, const void * A, long strideA,
int typeB, const void * B, long strideB,
float * C, long stride_C, int ith, int nth) {
MulMat mm;
#if defined __x86_64__ || defined(_M_X64)
if (!MulMat::set_mul_mat(typeA, typeB, (int)ne00, mm, Ny)) {
return false;
}
#else
int row_size_q8;
if (!MulMat::set_mul_mat(typeA, (int)ne00, mm, row_size_q8, Ny)) {
return false;
}
#endif
size_t row_size_qx = strideA*ggml_type_size(ggml_type(typeA));
size_t row_size_qy = strideB*ggml_type_size(ggml_type(typeB));
auto nrc_x = (Nx + nth - 1)/nth;
auto first_x = ith*nrc_x;
if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0};
#ifdef __ARM_NEON
#ifdef GEMM_Q4K_Q6K
if (Ny >= 8 && (typeA == GGML_TYPE_Q4_K || typeA == GGML_TYPE_Q6_K)) {
mm.mul_mat_NxM_v2(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
} else
#endif
#endif
{
mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
}
return true;
}
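// MoE variant used for indirect (expert-routed) multiplications: rows of B are selected
// through the mmid_row_mapping table passed in vrow_mapping, and nb1/nb2 are byte strides
// into C (converted to float strides for DataInfo below).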
bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B,
float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
assert(row_mapping != nullptr);
MulMat mm;
int row_size_q8 = 0; // zero-initialized: the set_mul_mat() call that used to fill it is disabled below
/* moonll
if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
return false;
}*/
int row_size_qx = ggml_row_size((ggml_type)typeA, ne00);
int nrc_x = (Nx + nth - 1)/nth;
int first_x = ith*nrc_x;
if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
DataInfo info{C + first_x, (const char *)B, nb1/sizeof(float), (size_t)row_size_q8, 0, ne11, row_mapping, nb2/sizeof(float)};
mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
return true;
}
#if defined __x86_64__ || defined(_M_X64)
#if defined HAVE_FANCY_SIMD
#undef HAVE_FANCY_SIMD
#endif
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
#define HAVE_FANCY_SIMD
#endif
//#define HAVE_FANCY_SIMD
namespace {
inline float hsum_float_4(__m128 x) {
x = _mm_add_ps(x, _mm_movehl_ps(x, x));
x = _mm_add_ss(x, _mm_movehdup_ps(x));
return _mm_cvtss_f32(x);
}
inline float hsum_float_8(__m256 x) {
return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
}
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
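// View over the nrc_y rows of B a kernel tile works on (block_q8_K by default):
// load_quants()/load_quants64() fetch 32/64 int8 quants, load_bsums() the sums of each
// group of 16 quants (used for the min corrections), and scale() the per-superblock scale.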
template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
constexpr static int nrc_y = nrc;
Q8(const DataInfo& info) {
for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
}
#ifdef HAVE_FANCY_SIMD
inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
#endif
inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
inline float scale(int iy, int i) const { return y[iy][i].d; }
const block_q8 * y[nrc_y];
};
// Handles q4_K and q5_K scales/mins
struct Scales8K {
template <typename Q8>
inline __m256i process_mins_and_scales(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
make_q4_scales(data, utmp);
const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
const __m128i mins128 = _mm256_extracti128_si256(mins_and_scales, 1);
accum_mins(mins128, q8, i, c, accd);
const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
return MM256_SET_M128I(sc128, sc128);
}
#ifdef HAVE_FANCY_SIMD
template <typename Q8>
inline __m512i process_mins_and_scales_64(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
auto scales = process_mins_and_scales(data, c, i, q8, accd);
return _mm512_inserti32x8(_mm512_castsi256_si512(scales), scales, 1);
}
#endif
template <typename Q8>
inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
const __m256i q8s = q8.load_bsums(iy, i);
const __m256i prod = _mm256_madd_epi16(mins, q8s);
accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
}
}
#ifdef HAVE_FANCY_SIMD
const __m512i shuffles512[2] = {
_mm512_set_epi64(0x0706070607060706, 0x0302030203020302, 0x0706070607060706, 0x0302030203020302,
0x0504050405040504, 0x0100010001000100, 0x0504050405040504, 0x0100010001000100),
_mm512_set_epi64(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a,
0x0d0c0d0c0d0c0d0c, 0x0908090809080908, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
};
#endif
const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
_mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};
uint32_t utmp[4];
};
template <typename Q8>
inline void process_mins_16(const __m256i& all_scales, const Q8& q8, int i, float d, __m256 * accm) {
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
const __m256i prod = _mm256_madd_epi16(all_scales, q8.load_bsums(iy, i));
accm[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d * q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accm[iy]);
}
}
inline void prepare_scales_16(const __m256i& all_scales, __m256i * scales) {
const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
scales[0] = MM256_SET_M128I(l_scales, l_scales);
scales[1] = MM256_SET_M128I(h_scales, h_scales);
}
struct ScaleQ3 {
inline __m128i make_scales(const uint16_t * s8) const {
const uint16_t * scales16 = (const uint16_t *)s8;
uint32_t aux0 = scales16[0] | (scales16[1] << 16);
uint32_t aux1 = scales16[2] | (scales16[3] << 16);
uint32_t aux2 = scales16[4] | (scales16[5] << 16);
__m128i scales128 = _mm_set_epi32(
((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030),
((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030),
(aux1 & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030),
(aux0 & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030));
return _mm_add_epi8(scales128, m32);
}
const __m128i m32 = _mm_set1_epi8(-32);
};
struct ScaleIQ4XS {
inline __m128i make_scales(const uint32_t scales_l, const uint16_t scales_h) {
uint32_t tmp32 = scales_h | (scales_h << 14);
const __m128i sh = _mm_slli_epi16(_mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(tmp32), hshift), hmask), 4);
const __m128i sl = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(scales_l), lshift), lmask);
return _mm_add_epi16(_mm_or_si128(sh, _mm_cvtepi8_epi16(_mm_shuffle_epi8(sl, lshuffle))), m32);
}
const __m128i hshift = _mm_set_epi32(12, 8, 4, 0);
const __m128i lshift = _mm_set_epi32(4, 0, 4, 0);
const __m128i hmask = _mm_set1_epi16(0x03);
const __m128i lmask = _mm_set1_epi8(0xf);
const __m128i lshuffle = _mm_set_epi32(0x07030602, 0x05010400, 0x07030602, 0x05010400);
const __m128i m32 = _mm_set1_epi16(-32);
};
struct Scales8KBase {
template <typename Q8>
inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
const __m256i q8s = q8.load_bsums(iy, i);
const __m256i prod = _mm256_madd_epi16(mins, q8s);
accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
}
}
inline __m256i shuffle(__m128i mins) const {
return MM256_SET_M128I(_mm_shuffle_epi8(mins, shuffles[1]), _mm_shuffle_epi8(mins, shuffles[0]));
}
const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
_mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};
};
template <typename Block>
struct BaseDequantizer {
BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {}
inline void new_row(int ix) {
x = (const Block *)((const char *)vx + bx*ix);
}
const void * vx;
size_t bx;
const Block * x;
float d;
};
__m128i inline load_iq4nl_values_128() {
static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
return _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}
__m256i inline load_iq4nl_values_256() {
auto val128 = load_iq4nl_values_128();
return MM256_SET_M128I(val128, val128);
}
#ifdef HAVE_FANCY_SIMD
//====================================== Zen4 ==================================================
struct BlockPermuter {
const __m512i permute1 = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0);
const __m512i permute2 = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4);
};
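// Unpacks the 256 4-bit quants of one k-quant superblock into four 64-byte registers of
// values in 0..15. prepare() uses the two permutes above to restore the natural quant
// order; prepare64() leaves the low/high nibbles in separate registers, and callers such
// as DequantizerQ6K and DequantizerIQ4XS reorder or remap them afterwards.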
struct Q4Bits {
inline void prepare(const uint8_t * q4) {
auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0);
auto tmp1 = _mm512_and_si512(q4bits, ml);
auto tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
values[0] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2);
values[1] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2);
q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1);
tmp1 = _mm512_and_si512(q4bits, ml);
tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
values[2] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2);
values[3] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2);
}
inline void prepare64(const uint8_t * q4) {
auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0);
values[0] = _mm512_and_si512(q4bits, ml);
values[1] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1);
values[2] = _mm512_and_si512(q4bits, ml);
values[3] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml);
}
__m512i values[4];
const __m512i ml = _mm512_set1_epi8(0xf);
BlockPermuter perm;
};
struct Q2Bits {
inline void prepare(const uint8_t * q2) {
auto q2bits = _mm512_loadu_si512((const __m512i*)q2);
auto tmp = _mm512_srli_epi16(q2bits, 2);
values[0] = _mm512_permutex2var_epi64(q2bits, perm.permute1, tmp);
values[2] = _mm512_permutex2var_epi64(q2bits, perm.permute2, tmp);
values[1] = _mm512_and_si512(_mm512_srli_epi16(values[0], 4), ml);
values[3] = _mm512_and_si512(_mm512_srli_epi16(values[2], 4), ml);
values[0] = _mm512_and_si512(values[0], ml);
values[2] = _mm512_and_si512(values[2], ml);
}
__m512i values[4];
const __m512i ml = _mm512_set1_epi8(0x03);
BlockPermuter perm;
};
struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
bits.prepare(x[i].qs);
auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
}
Q4Bits bits;
Scales8K s8k;
};
/*
moonll DequantizerIQ4XS
*/
__m512i inline load_iq4nl_values_512() {
auto val256 = load_iq4nl_values_256();
return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
}
struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
prepare(x[i].qs);
auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
auto scales256 = MM256_SET_M128I(scales128, scales128);
auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1);
scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]);
scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]);
scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]);
scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]);
}
inline void prepare(const uint8_t * q4) {
bits.prepare64(q4);
// We now have in bits.values[0]: 0...15, 32...47, 64...79, 96...111
//                bits.values[1]: 16...31, 48...63, 80...95, 112...127
// etc.
auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]);
bits.values[1] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1]));
bits.values[0] = _mm512_shuffle_epi8(values, tmp);
tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]);
bits.values[3] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3]));
bits.values[2] = _mm512_shuffle_epi8(values, tmp);
}
Q4Bits bits;
Scales8KBase s8k;
ScaleIQ4XS siq4;
const __m512i values;
const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
const __m512i shuffles[4] = {
_mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1),
_mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1),
_mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1),
_mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1),
};
};
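// HighBit5/HighBit3 OR the extra high bit(s) stored in qh/hmask into the 4-bit (Q5_K)
// and 2-bit (Q3_K) values produced by Q4Bits/Q2Bits, yielding 5-bit and 3-bit quants.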
struct HighBit5 {
inline void apply(const uint8_t * h, Q4Bits& bits) {
auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh));
bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(hbits, mh));
bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
}
const __m512i mh = _mm512_set1_epi8(0x10);
};
struct HighBit3 {
inline void apply(const uint8_t * h, Q2Bits& bits) {
auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(hbits, mh));
bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 4), mh));
}
const __m512i mh = _mm512_set1_epi8(0x04);
};
struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
bits.prepare(x[i].qs);
hbits.apply(x[i].qh, bits);
auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
}
Q4Bits bits;
HighBit5 hbits;
Scales8K s8k;
};
struct Scale16 {
inline void make_scales(const __m128i& scales8, __m512i * scales) const {
auto all_scales8 = MM256_SET_M128I(scales8, scales8);
auto scales1 = _mm256_shuffle_epi8(all_scales8, shuffle1);
auto scales2 = _mm256_shuffle_epi8(all_scales8, shuffle2);
scales[0] = _mm512_cvtepi8_epi16(scales1);
scales[1] = _mm512_cvtepi8_epi16(scales2);
}
template <typename Q8>
inline void process_mins_and_scales(int i, float c, const __m128i& mins8, const __m128i& scales8,
const Q8& q8, __m256 * accm, __m512i * scales) const {
process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, c, accm);
make_scales(scales8, scales);
}
const __m256i shuffle1 = _mm256_set_epi32(0x07070707, 0x03030303, 0x06060606, 0x02020202,
0x05050505, 0x01010101, 0x04040404, 0x00000000);
const __m256i shuffle2 = _mm256_set_epi32(0x0f0f0f0f, 0x0b0b0b0b, 0x0e0e0e0e, 0x0a0a0a0a,
0x0d0d0d0d, 0x09090909, 0x0c0c0c0c, 0x08080808);
};
struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
bits.prepare(x[i].qs);
const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
sc16.process_mins_and_scales(i, -GGML_FP16_TO_FP32(x[i].dmin), mins8, scales8, q8, accm, scales);
}
Q2Bits bits;
Scale16 sc16;
const __m128i m4 = _mm_set1_epi8(0xf);
};
struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
bits.prepare(x[i].qs);
hbits.apply(x[i].hmask, bits);
auto scales128 = sc3.make_scales((const uint16_t *)x[i].scales);
sc16.process_mins_and_scales(i, -4.f*d, scales128, scales128, q8, accm, scales);
}
Q2Bits bits;
HighBit3 hbits;
ScaleQ3 sc3;
Scale16 sc16;
const __m128i m4 = _mm_set1_epi8(0xf);
const __m128i m32 = _mm_set1_epi8(-32);
};
struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
bits.prepare64(x[i].ql);
add_high_bits(x[i].qh, bits);
auto scales128 = _mm_loadu_si128((const __m128i *)x[i].scales);
sc16.process_mins_and_scales(i, -32.f*d, scales128, scales128, q8, accm, scales);
}
inline void add_high_bits(const uint8_t * qh, Q4Bits& bits) const {
auto hbits = _mm512_loadu_si512((const __m512i *)qh);
auto tmp1 = _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh);
auto tmp2 = _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh);
bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
tmp1 = _mm512_and_si512(hbits, mh);
tmp2 = _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh);
bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
}
Q4Bits bits;
HighBit3 hbits;
Scale16 sc16;
const __m512i mh = _mm512_set1_epi8(0x30);
};
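// K-quant GEMM tile: for each row of A, deq.new_block() unpacks superblock i and prepares
// its scales once; the inner iy loop then forms the dpbusd dot products against each of
// the nrc_y Q8 rows, applying the per-superblock float scales in the 512-bit accd[]
// accumulators while the min/offset corrections gather separately in the 256-bit accm[].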
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
Q8<nrc_y> q8(info);
Dequantizer deq(vx, bx);
__m256 accm[nrc_y];
__m512 accd[nrc_y];
__m512i scales[2];
for (int ix = 0; ix < nrc_x; ++ix) {
for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();
deq.new_row(ix);
for (int i = 0; i < nb; ++i) {
deq.new_block(i, q8, accm, scales);
for (int iy = 0; iy < nrc_y; ++iy) {
const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0));
const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1));
const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2));
const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3));
auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
}
}
}
template <typename Q8>
inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) {
const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0));
const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[1], q8.load_quants64(iy, i, 1));
const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[2], q8.load_quants64(iy, i, 2));
const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[3], q8.load_quants64(iy, i, 3));
auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
Q8<nrc_y> q8(info);
Dequantizer deq(vx, bx);
__m256 accm[nrc_y];
__m512 accd[nrc_y];
__m512i scales[2];
for (int ix = 0; ix < nrc_x; ++ix) {
for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();
deq.new_row(ix);
for (int i = 0; i < nb; ++i) {
deq.new_block(i, q8, accm, scales);
for (int iy = 0; iy < nrc_y; ++iy) {
const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0));
const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1));
const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2));
const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3));
auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
}
}
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
Q8<nrc_y> q8(info);
Dequantizer deq(vx, bx);
__m256 accm[nrc_y];
__m512 accd[nrc_y];
__m512i scales[4];
for (int ix = 0; ix < nrc_x; ++ix) {
for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();
deq.new_row(ix);
for (int i = 0; i < nb; ++i) {
deq.new_block(i, q8, accm, scales);
for (int iy = 0; iy < nrc_y; ++iy) {
const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0));
const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1));
const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2));
const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3));
auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_setzero_si512(),
p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]);
accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
}
}
}
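// Single-column (nrc_y == 1) specialization: two dequantizers walk consecutive
// superblocks in lockstep (k_nx = 2), presumably to give the core two independent
// dependency chains; an odd trailing block is handled after the main loop.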
template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
constexpr int k_nx = 2;
Q8<1> q8(info);
Dequantizer deq1(vx, bx);
Dequantizer deq2(vx, bx);
Dequantizer * deq[k_nx];
deq[0] = &deq1;
deq[1] = &deq2;
__m512i scales[2*k_nx];
for (int ix = 0; ix < nrc_x; ++ix) {
auto accd = _mm512_setzero_ps();
auto accm = _mm256_setzero_ps();
for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_row(ix);
for (int i = 0; i < nb/k_nx; ++i) {
for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx);
for (int kx = 0; kx < k_nx; ++kx) {
compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd);
}
}
if (2*(nb/2) < nb) {
int i0 = 2*(nb/2);
deq[0]->new_block(i0, q8, &accm, scales);
compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd);
}
auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1));
info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256)));
}
}
#else
// ===================================== Vanilla AVX2 =====================================
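// AVX2 counterparts of the unpacking helpers above: each prepare() call handles one half
// of a superblock (128 quants, selected by j), producing four 32-byte registers.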
struct Q4Bits {
inline void prepare(const uint8_t * q4, int j) {
auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0);
values[0] = _mm256_and_si256(q4bits, ml);
values[1] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1);
values[2] = _mm256_and_si256(q4bits, ml);
values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
}
inline void prepare64(const uint8_t * q4, int j) {
auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0);
values[0] = _mm256_and_si256(q4bits, ml);
values[2] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1);
values[1] = _mm256_and_si256(q4bits, ml);
values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
}
inline void prepare16(const uint8_t * q4, int j) {
values[0] = dequant16(q4 + 64*j + 0);
values[1] = dequant16(q4 + 64*j + 16);
values[2] = dequant16(q4 + 64*j + 32);
values[3] = dequant16(q4 + 64*j + 48);
}
inline __m256i dequant16(const uint8_t * qs) const {
const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
const __m256i aux256 = MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128);
return _mm256_and_si256(ml, aux256);
};
__m256i values[4];
const __m256i ml = _mm256_set1_epi8(0xf);
};
struct Q2Bits {
inline void prepare(const uint8_t * q2, int j) {
auto q2bits = _mm256_loadu_si256((const __m256i *)q2 + j);
values[0] = _mm256_and_si256(q2bits, ml);
values[1] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), ml);
values[2] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), ml);
values[3] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), ml);
}
__m256i values[4];
const __m256i ml = _mm256_set1_epi8(0x03);
};
struct HighBit5 {
inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
inline void apply(Q4Bits& bits, bool do_shift) {
bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 3), mh));
bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
if (do_shift) {
hbits = _mm256_srli_epi16(hbits, 4);
}
}
const __m256i mh = _mm256_set1_epi8(0x10);
__m256i hbits;
};
struct HighBit3 {
inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
inline void apply(Q2Bits& bits, bool do_shift) {
bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 1), mh));
if (do_shift) {
hbits = _mm256_srli_epi16(hbits, 4);
}
}
const __m256i mh = _mm256_set1_epi8(0x04);
__m256i hbits;
};
/*
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
if (j == 0) {
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
}
} else {
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
}
}
}*/
struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
template <typename Q8>
inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
d = GGML_FP16_TO_FP32(x[i].d);
return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
}
inline void prepare(int i, int j) {
bits.prepare(x[i].qs, j);
}
Q4Bits bits;
Scales8K s8k;
};
struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {}
template <typename Q8>
inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
d = GGML_FP16_TO_FP32(x[i].d);
auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
return MM256_SET_M128I(scales128, scales128);
}
inline void prepare(int i, int j) {
bits.prepare16(x[i].qs, j);
bits.values[0] = _mm256_shuffle_epi8(values, bits.values[0]);
bits.values[1] = _mm256_shuffle_epi8(values, bits.values[1]);
bits.values[2] = _mm256_shuffle_epi8(values, bits.values[2]);
bits.values[3] = _mm256_shuffle_epi8(values, bits.values[3]);
}
static __m256i load_values() {
static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
return MM256_SET_M128I(val128, val128);
}
Q4Bits bits;
Scales8K s8k;
ScaleIQ4XS siq4;
const __m256i values;
};
struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
template <typename Q8>
inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
d = GGML_FP16_TO_FP32(x[i].d);
hbits.load(x[i].qh);
return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
}
inline void prepare(int i, int j) {
bits.prepare(x[i].qs, j);
hbits.apply(bits, j == 0);
}
Q4Bits bits;
HighBit5 hbits;
Scales8K s8k;
};
template <typename Q8>
inline void process_mins_and_scales_16(const __m128i& scales128, const Q8& q8, int i, float d,
__m256 * accm, __m256i * scales) {
const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
process_mins_16(all_scales, q8, i, d, accm);
prepare_scales_16(all_scales, scales);
}
struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
hbits.load(x[i].hmask);
process_mins_and_scales_16(sc3.make_scales((const uint16_t *)x[i].scales), q8, i, -4.f*d, accm, scales);
}
inline void prepare(int i, int j) {
bits.prepare(x[i].qs, j);
hbits.apply(bits, j == 0);
}
Q2Bits bits;
HighBit3 hbits;
ScaleQ3 sc3;
const __m128i m32 = _mm_set1_epi8(-32);
};
struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, -GGML_FP16_TO_FP32(x[i].dmin), accm);
prepare_scales_16(_mm256_cvtepi8_epi16(scales8), scales);
}
inline void prepare(int i, int j) {
bits.prepare(x[i].qs, j);
}
Q2Bits bits;
const __m128i m4 = _mm_set1_epi8(0xf);
};
struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
process_mins_and_scales_16(_mm_loadu_si128((const __m128i *)x[i].scales), q8, i, -32.f*d, accm, scales);
}
inline void prepare(int i, int j) {
bits.prepare64(x[i].ql, j);
auto hbits = _mm256_loadu_si256((const __m256i *)x[i].qh + j);
bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 2), mh));
}
Q4Bits bits;
const __m256i mh = _mm256_set1_epi8(0x30);
};
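// Forward declarations of the scale lay-out helpers used by the AVX2 kernels below;
// set_scales_8/set_scales_16 presumably broadcast the per-block scales into the order
// expected by multiply_add, with their definitions further down in this file.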
inline __m256i get_scale_shuffle_8(int i);
inline void set_scales_8(const __m256i& all_scales, int j, __m256i* scales);
inline __m256i get_scale_shuffle_16(int i);
inline void set_scales_16(const __m256i& all_scales, __m256i* scales);
template <typename Dequantizer, int nrc_y>
static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n%QK_K == 0);
const int nb = n/QK_K;
Q8<nrc_y> q8(info);
__m256i all_scales[2];
__m256i scales[4];
__m256 accd[nrc_y];
Dequantizer deq(vx, bx);
for (int ix = 0; ix < nrc_x; ++ix) {
deq.new_row(ix);
for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
deq.new_block(i, q8, accd, all_scales);
__m256i sumi[nrc_y];
for (int j = 0; j < QK_K/128; ++j) {
deq.prepare(i, j);
set_scales_16(all_scales[j], scales);
multiply_add(deq.bits, scales, j, i, q8, sumi);
}
for (int iy = 0; iy < nrc_y; ++iy) {
accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(iy, i)), _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, hsum_float_8(accd[iy]));
}
}
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
Q8<nrc_y> q8(info);
Dequantizer deq(vx, bx);
__m256 accd[nrc_y];
__m256i scales[4];
for (int ix = 0; ix < nrc_x; ++ix) {
for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();
deq.new_row(ix);
for (int i = 0; i < nb; ++i) {
auto all_scales = deq.new_block(i, q8, accd);
__m256i sumi[nrc_y];
for (int j = 0; j < QK_K/128; ++j) {
deq.prepare(i, j);
set_scales_8(all_scales, j, scales);
multiply_add(deq.bits, scales, j, i, q8, sumi);
}
for (int iy = 0; iy < nrc_y; ++iy) {
const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, hsum_float_8(accd[iy]));
}
}
}
#endif // Zen4 or vanilla AVX2
//
// ============================== Legacy quants
//
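// DotHelper computes the dot product of two 32-byte int8 vectors, using AVX512-VNNI's
// dpbusd when available and falling back to maddubs + madd otherwise. Both forms require
// the first operand to be unsigned, so SignedDot maps signed*signed onto that form by
// taking |x| and moving x's sign onto y (_mm256_sign_epi8 preserves the product).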
struct DotHelper {
const __m256i m1 = _mm256_set1_epi16(1);
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
inline __m256i dot(__m256i x, __m256i y) const {
return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y);
}
#else
inline __m256i dot(__m256i x, __m256i y) const {
return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y));
}
#endif
};
struct SignedDot {
DotHelper helper;
inline __m256i compute(__m256i x, __m256i y) const {
return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x));
}
};
struct UnsignedDot {
DotHelper helper;
inline __m256i compute(__m256i x, __m256i y) const {
return helper.dot(x, y);
}
};
template <typename Q8, typename Dot> struct Sum4 {
Dot dot;
inline __m256i compute(const __m256i * qx, const Q8 * y) const {
const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1
const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3
return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
}
};
struct Sum4_Q8 {
SignedDot dot;
static inline __m256i add1(__m256i a, __m256i b) {
return _mm256_add_epi32(_mm256_unpacklo_epi32(a, b), _mm256_unpackhi_epi32(a, b));
}
static inline __m256i add2(__m256i a, __m256i b) {
return _mm256_add_epi32(_mm256_unpacklo_epi64(a, b), _mm256_unpackhi_epi64(a, b));
}
inline __m256i compute(const __m256i * qx, const block_q8_0 * y) const {
const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
const __m256i p01 = add1(p0, p1); // 0,1, 0,1, 0,1, 0,1
const __m256i p23 = add1(p2, p3); // 2,3, 2,3, 2,3, 2,3
return add2(p01, p23); // returns 0,1,2,3, 0,1,2,3
}
};
struct ScaleHelperQ_0 {
ggml_half scales8[4];
template <typename Q>
inline __m128 prepare4(const Q * y) {
for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
}
template <typename Q>
inline __m128 prepare4(__m128 other_scales, const Q * y) {
return _mm_mul_ps(other_scales, prepare4<Q>(y));
}
template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
};
template <int min_value>
struct ScaleHelperQ_0_1 {
ggml_half scales8[4];
template <typename Q>
inline __m256 prepare4(const Q * y) {
for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
return _mm256_set_m128(_mm_mul_ps(s4, min), s4);
}
template <typename Q>
inline __m256 prepare4(__m256 other_scales, const Q * y) {
return _mm256_mul_ps(other_scales, prepare4<Q>(y));
}
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
float d = GGML_FP16_TO_FP32(y->d);
return std::make_pair(d, -d*float(min_value));
}
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
}
const __m128 min = _mm_set1_ps(float(-min_value));
};
struct ScaleHelperQ_1 {
uint32_t scales8[4];
const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100);
template <typename Q>
inline __m256 prepare4(const Q * y) {
for (int j = 0; j < 4; ++j) {
// it is slightly faster to directly dereference (const uint32 *)&y[j].d, but some compilers
// complain that this breaks strict-aliasing rules.
memcpy(scales8 + j, &y[j].d, sizeof(uint32_t));
}
return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle));
}
template <typename Q>
inline __m256 prepare4(__m256 other_scales, const Q * y) {
return _mm256_mul_ps(other_scales, prepare4<Q>(y));
}
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
}
template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
}
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
}
};
struct MinusType0 {
inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); }
inline float compute(float d, int) const { return d; }
inline float result(__m256 acc, int) const { return hsum_float_8(acc); }
};
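// MinusType1 handles the (d, m) quant types: the low 128-bit half of the combined scale vector
// carries d_x*d_y, the high half carries m_x*s_y (the min correction), which is accumulated
// separately in accm and only folded back in at result(). In the scalar tail path the 0.25f
// spreads the correction over the 4 lanes that are summed at the end.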
template <int nrc_y> struct MinusType1 {
__m128 accm[nrc_y];
MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); }
inline __m256 compute(__m256 dm, int iy) {
const __m128 d = _mm256_castps256_ps128(dm);
const __m128 m = _mm256_extractf128_ps(dm, 1);
accm[iy] = _mm_add_ps(accm[iy], m);
return _mm256_set_m128(d, d);
}
inline float compute(const std::pair<float, float>& dm, int iy) {
accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f));
return dm.first;
}
inline float result(__m256 acc, int iy) const {
const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
return hsum_float_4(_mm_add_ps(sum, accm[iy]));
}
};
template <typename Minus, int nrc_y, bool is_multiple_of_4> struct AccumT {
__m256 acc[nrc_y];
Minus accm;
AccumT() { for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); }
template <typename Unpacker, typename Scales, typename Sum, typename Q8>
inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const DataInfo& info, int ix) {
auto qx = unp.quants();
__m256 dall[nrc_y];
for (int i = 0; i < nb/4; ++i) {
auto other_scales = unp.set_block_4(i);
for (int iy = 0; iy < nrc_y; ++iy) {
auto s12 = scales.prepare4(other_scales, y[iy] + 4*i);
dall[iy] = accm.compute(s12, iy);
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto pall = sum.compute(qx, y[iy] + 4*i);
acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]);
}
}
if (!is_multiple_of_4) {
for (int i = 4*(nb/4); i < nb; ++i) {
auto other_scales = unp.set_block(i);
for (int iy = 0; iy < nrc_y; ++iy) {
auto s12 = scales.prepare1(other_scales, y[iy] + i);
auto d = accm.compute(s12, iy);
const __m256i p0 = sum.dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs));
acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]);
}
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, accm.result(acc[iy], iy));
//s[iy*bs] = accm.result(acc[iy], iy);
}
}
};
template <int nrc_y, bool is_multiple_of_4>
using AccumType0 = AccumT<MinusType0, nrc_y, is_multiple_of_4>;
template <int nrc_y, bool is_multiple_of_4>
using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;
using Sum4Type0 = Sum4<block_q8_0, SignedDot>;
using Sum4Type1 = Sum4<block_q8_1, UnsignedDot>;
template <typename Unpacker, typename Sum4Type, typename AccumType, typename Scales, typename Q8, int nrc_y>
void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
Unpacker unp(vx, bx);
Sum4Type sum4;
Scales scales;
for (int ix = 0; ix < nrc_x; ++ix) {
unp.set_row(ix);
AccumType accum;
accum.compute(nb, unp, scales, sum4, y, info, ix);
}
}
template <typename Unpacker, int nrc_y>
void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n%Unpacker::block_size() == 0);
Q8<nrc_y, block_q8_0> q8(info);
int nb = n/Unpacker::block_size();
if (nb%4 == 0) {
mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
nb, vx, bx, info, q8.y, nrc_x
);
} else {
mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
nb, vx, bx, info, q8.y, nrc_x
);
}
}
template <typename Unpacker, int nrc_y>
void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n%Unpacker::block_size() == 0);
Q8<nrc_y, block_q8_1> q8(info);
int nb = n/Unpacker::block_size();
if (nb%4 == 0) {
mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, true>, ScaleHelperQ_1, block_q8_1, nrc_y>(
nb, vx, bx, info, q8.y, nrc_x
);
} else {
mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, false>, ScaleHelperQ_1, block_q8_1, nrc_y>(
nb, vx, bx, info, q8.y, nrc_x
);
}
}
struct Dequantizer4bit {
const __m256i m4 = _mm256_set1_epi8(0xf);
inline __m256i dequant(const uint8_t * qs) const {
const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4);
}
};
struct Q8_0_Dequantizer {
inline __m256i dequant(const block_q8_0 * x) const {
return _mm256_loadu_si256((const __m256i *)x->qs);
}
};
struct Q8_0_1_Dequantizer {
inline __m256i dequant(const block_q8_0 * x) const {
return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs));
}
};
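// Q8_0_1_Dequantizer biases the int8 quants by +127 so they become non-negative and can be used
// as the unsigned operand of the maddubs/dpbusd dot product; ScaleHelperQ_0_1<127> supplies the
// matching -127*d correction term.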
struct Q4_0_Dequantizer {
Dequantizer4bit b4;
const __m256i m8 = _mm256_set1_epi8(-8);
inline __m256i dequant(const block_q4_0 * x) const {
return _mm256_add_epi8(b4.dequant(x->qs), m8);
}
};
struct Q4_1_Dequantizer {
Dequantizer4bit b4;
inline __m256i dequant(const block_q4_1 * x) const {
return b4.dequant(x->qs);
}
};
struct HBitDequantizer {
const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
const __m256i minus1 = _mm256_set1_epi64x(-1);
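// to_bytes() expands the 32 high bits into a per-byte mask: the broadcast + shuffle repeats
// byte g of the 32-bit word over 64-bit group g, OR-ing with `mask` (which clears exactly one
// bit per byte position) leaves 0xFF only where the tested bit is set, and the compare against
// -1 turns that into a full-byte 0xFF/0x00 mask.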
inline __m256i to_bytes(const uint8_t * bits) const {
// Note: Data in all ggml quants is at least 2-byte aligned.
// => we can cast to uint16_t and use or on two consecutive entries
// which is faster than memcpy
const uint16_t * aux16 = (const uint16_t *)bits;
const uint32_t aux32 = aux16[0] | (aux16[1] << 16);
//uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t));
__m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle);
bytes = _mm256_or_si256(bytes, mask);
return _mm256_cmpeq_epi8(bytes, minus1);
}
};
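// Q5_0: when the high bit is clear, OR-ing 0xF0 into the byte turns the low nibble q into q-16
// (two's complement), reproducing the q5_0 value (q | (h << 4)) - 16 without an extra subtraction.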
struct Q5_0_Dequantizer {
Dequantizer4bit b4;
HBitDequantizer hbit;
const __m256i mh = _mm256_set1_epi8((char)0xF0);
inline __m256i dequant(const block_q5_0 * x) const {
const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh);
return _mm256_or_si256(b4.dequant(x->qs), vqh);
}
};
struct Q5_1_Dequantizer {
Dequantizer4bit b4;
HBitDequantizer hbit;
const __m256i mh = _mm256_set1_epi8(0x10);
inline __m256i dequant(const block_q5_1 * x) const {
const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh);
return _mm256_or_si256(b4.dequant(x->qs), vqh);
}
};
template <typename Q, typename Scales, typename Dequantizer>
struct Q_Unpacker {
Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {}
const char * cx_0;
const Q * x;
size_t bx;
Scales scales;
Dequantizer deq;
__m256i qx[4];
inline const __m256i* quants() const { return qx; }
inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); }
inline auto set_block_4(int i) {
for (int j = 0; j < 4; ++j) {
qx[j] = deq.dequant(x + 4*i + j);
}
return scales.prepare4(x + 4*i);
}
inline auto set_block(int i) {
qx[0] = deq.dequant(x + i);
return scales.prepare1(x + i);
}
};
struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> {
Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
inline static int block_size() { return QK4_0; }
};
struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<127>, Q8_0_1_Dequantizer> {
Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
// using Sum4T = Sum4TypeQ81;
inline static int block_size() { return QK8_0; }
};
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
inline static int block_size() { return QK4_0; }
};
struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> {
Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
inline static int block_size() { return QK5_0; }
};
struct Q4_1_Unpacker final : public Q_Unpacker<block_q4_1, ScaleHelperQ_1, Q4_1_Dequantizer> {
Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
inline static int block_size() { return QK4_1; }
};
struct Q5_1_Unpacker final : public Q_Unpacker<block_q5_1, ScaleHelperQ_1, Q5_1_Dequantizer> {
Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
inline static int block_size() { return QK4_1; }
};
template <int nrc_y>
void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n%Q8_0_Unpacker::block_size() == 0);
Q8<nrc_y, block_q8_0> q8(info);
int nb = n/Q8_0_Unpacker::block_size();
if (nb%4 == 0) {
mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
nb, vx, bx, info, q8.y, nrc_x
);
} else {
mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
nb, vx, bx, info, q8.y, nrc_x
);
}
}
/*
moonll:
add helper structs used by DequantizerIQ2XXS:
  SimpleBits
  EvenSignHelper
*/
struct SimpleBits {
__m256i values[4];
};
// fix for #829: add a check for AVX512VPOPCNTDQ support
#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
#define HAVE_AVX512_POPCNT 1
#else
#define HAVE_AVX512_POPCNT 0
#endif
struct EvenSignHelper {
#if defined HAVE_FANCY_SIMD
// #pragma message("Using AVX512VPOPCNTDQ in even sign helper")
union sbits_t {
__m128i vec;
__mmask32 mask[4];
};
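// The iq2_xxs sign words store only 7 explicit sign bits per group of 8 values; the 8th sign
// is the parity of those 7 (the number of set sign bits is kept even), which is why a popcount
// is needed here to reconstruct it before the bytes are conditionally negated.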
IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
// fix for #829: support Intel Cascade Lake CPUs: if the AVX512VPOPCNTDQ extension is not available, use the fallback implementation below
#if HAVE_AVX512_POPCNT
auto pcnt = _mm256_popcnt_epi32(aux);
#else
// Fallback: use the compiler's scalar popcount instead of _mm256_popcnt_epi32
__m256i pcnt;
int* pcnt_ptr = reinterpret_cast<int*>(&pcnt);
int* aux_ptr = reinterpret_cast<int*>(&aux); // take the address of aux directly to avoid an unnecessary copy
#pragma unroll 8 // hint the compiler to unroll the loop to improve throughput
for (int i = 0; i < 8; i++) {
pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]); // use the compiler builtin popcount
}
#endif
sbits_t sbits;
sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
values[1] = _mm256_mask_sub_epi8(values[1], sbits.mask[1], _mm256_setzero_si256(), values[1]);
//auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
//const __mmask32 * m32 = (const __mmask32 *)&sign_bits;
//values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]);
//values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]);
}
const __m256i shifts = _mm256_set_epi32(21, 14, 7, 0, 21, 14, 7, 0);
const __m256i mask = _mm256_set1_epi32(127);
const __m256i mone = _mm256_set1_epi32(1);
#else
inline void sign_value(uint32_t aux32, __m256i& value) const {
auto signs = _mm256_set_epi64x(keven_signs[(aux32 >> 21) & 127], keven_signs[(aux32 >> 14) & 127],
keven_signs[(aux32 >> 7) & 127], keven_signs[(aux32 >> 0) & 127]);
value = _mm256_sign_epi8(value, signs);
}
#endif
};
/*
moonll: add multiply_add for mul_mat_qX_K_q8_K_IQ_1
added functions:
  get_scale_shuffle_8
  get_scale_shuffle_16
  set_scales_16
*/
inline __m256i get_scale_shuffle_8(int i) {
return _mm256_set1_epi16((2*i) | ((2*i+1) << 8));
}
inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) {
scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0));
scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1));
scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2));
scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3));
}
inline __m256i get_scale_shuffle_16(int i) {
static const uint8_t k_shuffle[128] = {
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
};
return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
}
inline void set_scales_16(const __m256i& all_scales, __m256i * scales) {
scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0));
scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1));
scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2));
scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3));
}
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
if (j == 0) {
#ifdef HAVE_FANCY_SIMD
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
sumi[iy] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
}
#else
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
}
#endif
} else {
#ifdef HAVE_FANCY_SIMD
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
}
#else
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
}
#endif
}
}
/*
moonll: add multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1
added functions:
  set_scales_8_iq
  set_scales_16_iq
added mul_mat kernels:
  mul_mat_qX_K_q8_K_IQ_1
  mul_mat_qX_K_q8_K_IQ_N
  mul_mat_qX_K_q8_K_IQ
*/
template <typename Bits>
inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) {
if (j == 0) {
#ifdef HAVE_FANCY_SIMD
auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
sumi[0] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_packs_epi32(p1, p2));
sumi[1] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[1], _mm256_packs_epi32(p3, p4));
#else
const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
sumi[0] = _mm256_add_epi32(p1, p3);
sumi[1] = _mm256_add_epi32(p2, p4);
#endif
} else {
#ifdef HAVE_FANCY_SIMD
auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
sumi[0] = _mm256_dpwssd_epi32(sumi[0], scales[0], _mm256_packs_epi32(p1, p2));
sumi[1] = _mm256_dpwssd_epi32(sumi[1], scales[1], _mm256_packs_epi32(p3, p4));
#else
const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
sumi[0] = _mm256_add_epi32(sumi[0], _mm256_add_epi32(p1, p3));
sumi[1] = _mm256_add_epi32(sumi[1], _mm256_add_epi32(p2, p4));
#endif
}
}
inline void set_scales_8_iq(int j, const __m256i& all_scales, __m256i * scales) {
//#ifdef HAVE_FANCY_SIMD
auto shuffle = j == 0 ? _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100)
: _mm256_set_epi64x(0x0b0a0b0a0b0a0b0a, 0x0908090809080908, 0x0b0a0b0a0b0a0b0a, 0x0908090809080908);
scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(4)));
//#else
// set_scales_8(all_scales, j, scales);
//#endif
}
inline void set_scales_16_iq(const __m256i& all_scales, __m256i * scales) {
#ifdef HAVE_FANCY_SIMD
auto shuffle = _mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100);
scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(8)));
#else
set_scales_16(all_scales, scales);
#endif
}
template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_IQ_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
const int nb = n / QK_K;
Q8<1> q8(info);
Dequantizer deq(vx, bx);
__m256i scales[2];
__m256i q8_quants[4];
for (int ix = 0; ix < nrc_x; ++ix) {
__m256 accd = _mm256_setzero_ps();
deq.new_row(ix);
for (int i = 0; i < nb; ++i) {
__m256i sumi[2], all_scales[Dequantizer::num_blocks/8];
deq.new_block(i, all_scales);
for (int j = 0; j < QK_K/128; ++j) {
deq.prepare(i, j, q8, q8_quants);
if constexpr (Dequantizer::num_blocks == 8) {
set_scales_8_iq(j, all_scales[0], scales);
} else {
set_scales_16_iq(all_scales[j], scales);
}
multiply_add_1(j, deq.bits, scales, q8_quants, sumi);
}
accd = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(0, i)), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi[0], sumi[1])), accd);
}
info.store(ix, 0, hsum_float_8(accd));
}
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ_N(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
const int nb = n / QK_K;
Q8<nrc_y> q8(info);
Dequantizer deq(vx, bx);
__m256i scales[4];
__m256 accd[nrc_y];
for (int ix = 0; ix < nrc_x; ++ix) {
for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();
deq.new_row(ix);
for (int i = 0; i < nb; ++i) {
__m256i sumi[nrc_y], all_scales[Dequantizer::num_blocks/8];
//for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256();
__m256i mins;
float dmin = deq.new_block(i, all_scales, mins);
for (int iy = 0; iy < nrc_y; ++iy) {
auto bsums = q8.load_bsums(iy, i);
auto prod = _mm256_madd_epi16(mins, bsums);
accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(dmin*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
}
for (int j = 0; j < QK_K/128; ++j) {
deq.prepare(i, j);
if constexpr (Dequantizer::num_blocks == 8) {
set_scales_8(all_scales[0], j, scales);
} else {
set_scales_16(all_scales[j], scales);
}
//multiply_add_iq(deq.bits, scales, j, i, q8, sumi);
multiply_add(deq.bits, scales, j, i, q8, sumi);
}
for (int iy = 0; iy < nrc_y; ++iy) {
const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, hsum_float_8(accd[iy]));
}
}
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
#ifdef HAVE_FANCY_SIMD
if constexpr (nrc_y == 1) {
mul_mat_qX_K_q8_K_IQ_1<Dequantizer>(n, vx, bx, info, nrc_x);
} else {
mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
}
#else
mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
#endif
}
/*
moonll: iq1_s
core kernel for iq1_s: mul_mat_iq1_s_q8_K
*/
template <int nrc_y>
static void mul_mat_iq1_s_q8_K(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
GGML_ASSERT(n%QK_K == 0);
Q8<nrc_y, block_q8_K> q8(info);
__m256i qx[8];
__m256i scales[4];
__m256 acc[nrc_y] = {};
auto delta_mask = _mm_set1_epi16(-32768); // to avoid stupid overflow warnings when using 0x8000
__m256i shuffle0 = _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100);
for (int ix = 0; ix < nrc_x; ++ix) {
auto iq1s = (const block_iq1_s *)((const char *)vx + ix*bx);
for (int ibl = 0; ibl < n/QK_K; ++ibl) {
float d = GGML_FP16_TO_FP32(iq1s[ibl].d);
auto qhb = _mm_loadu_si128((const __m128i *)iq1s[ibl].qh);
auto scales128 = _mm_and_si128(_mm_srli_epi16(qhb, 12), _mm_set1_epi16(7));
scales128 = _mm_add_epi16(_mm_slli_epi16(scales128, 1), _mm_set1_epi16(1));
#ifdef HAVE_FANCY_SIMD
auto mask = _mm_cmpeq_epi16_mask(_mm_and_si128(qhb, delta_mask), delta_mask);
auto deltas128 = _mm_mask_blend_epi16(mask, _mm_set1_epi16(-7), _mm_set1_epi16(-9));
#else
auto mask = _mm_cmpeq_epi16(_mm_and_si128(qhb, delta_mask), delta_mask);
auto deltas128 = _mm_or_si128(_mm_and_si128(mask, _mm_set1_epi16(-9)), _mm_andnot_si128(mask, _mm_set1_epi16(-7)));
#endif
deltas128 = _mm_mullo_epi16(scales128, deltas128);
scales128 = _mm_slli_epi16(scales128, 3);
auto deltas_l = _mm_unpacklo_epi16(deltas128, deltas128);
auto deltas_h = _mm_unpackhi_epi16(deltas128, deltas128);
auto deltas = MM256_SET_M128I(deltas_h, deltas_l); // blocks 0,0, 1,1, 2,2, ..., 7,7
auto all_scales = MM256_SET_M128I(scales128, scales128);
auto shuffle = shuffle0;
for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
scales[ib64] = _mm256_shuffle_epi8(all_scales, shuffle);
shuffle = _mm256_add_epi8(shuffle, _mm256_set1_epi8(4));
}
const uint8_t * qs = iq1s[ibl].qs;
const uint16_t * qh = iq1s[ibl].qh;
for (int ib = 0; ib < QK_K/32; ib += 2) {
qx[ib+0] = _mm256_set_epi64x(iq1s_grid_us[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid_us[qs[2] | ((qh[ib+0] << 2) & 0x700)],
iq1s_grid_us[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid_us[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
qx[ib+1] = _mm256_set_epi64x(iq1s_grid_us[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid_us[qs[6] | ((qh[ib+1] << 2) & 0x700)],
iq1s_grid_us[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid_us[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
qs += 8;
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto bsums = q8.load_bsums(iy, ibl);
auto sumi = _mm256_setzero_si256();
for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
auto qy1 = q8.load_quants(iy, ibl, 2*ib64+0);
auto qy2 = q8.load_quants(iy, ibl, 2*ib64+1);
#ifdef HAVE_FANCY_SIMD
auto dot1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+0], qy1);
auto dot2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+1], qy2);
sumi = _mm256_dpwssd_epi32(sumi, scales[ib64], _mm256_packs_epi32(dot1, dot2));
#else
auto dot1 = _mm256_maddubs_epi16(qx[2*ib64+0], qy1);
auto dot2 = _mm256_maddubs_epi16(qx[2*ib64+1], qy2);
auto dot = _mm256_add_epi16(_mm256_unpacklo_epi64(dot1, dot2), _mm256_unpackhi_epi64(dot1, dot2));
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(scales[ib64], dot));
#endif
}
#ifdef HAVE_FANCY_SIMD
sumi = _mm256_dpwssd_epi32(sumi, bsums, deltas);
#else
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(bsums, deltas));
#endif
acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d*q8.scale(iy, ibl)), _mm256_cvtepi32_ps(sumi), acc[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, 0.125f*hsum_float_8(acc[iy]));
acc[iy] = _mm256_setzero_ps();
}
}
}
/*
moonll: iq1_s / iq2_xxs
DequantizerIQ2XXS is the key dequantizer added alongside the IQ1_S support
*/
struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
DequantizerIQ2XXS(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
constexpr static int num_blocks = 8;
union Data {
__m256i vec;
uint32_t val[8];
};
inline __m128i load_scales(int i) {
d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
const uint16_t * a16 = (const uint16_t *)x[i].qs;
auto scales = _mm_srli_epi16(_mm_set_epi16(a16[31], a16[27], a16[23], a16[19], a16[15], a16[11], a16[7], a16[3]), 12);
return _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi16(1));
}
inline void new_block(int i, __m256i * scales) {
auto sc16 = load_scales(i);
scales[0] = MM256_SET_M128I(sc16, sc16);
}
inline float new_block(int i, __m256i * scales, __m256i& mins) {
auto sc16 = load_scales(i);
mins = scb.shuffle(sc16);
scales[0] = MM256_SET_M128I(sc16, sc16);
return -d*minv;
}
inline static void make4(const uint32_t * aux32, __m256i * values) {
const uint8_t * aux8 = (const uint8_t *)aux32;
values[0] = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[ 1]], iq2xxs_grid[aux8[ 0]]);
values[1] = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[ 9]], iq2xxs_grid[aux8[ 8]]);
values[2] = _mm256_set_epi64x(iq2xxs_grid[aux8[19]], iq2xxs_grid[aux8[18]], iq2xxs_grid[aux8[17]], iq2xxs_grid[aux8[16]]);
values[3] = _mm256_set_epi64x(iq2xxs_grid[aux8[27]], iq2xxs_grid[aux8[26]], iq2xxs_grid[aux8[25]], iq2xxs_grid[aux8[24]]);
}
IQK_ALWAYS_INLINE void sign_values(const uint32_t * aux32, __m256i * values) const {
#ifdef HAVE_FANCY_SIMD
esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1])), values+0);
esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[7]), _mm_set1_epi32(aux32[5])), values+2);
#else
esh.sign_value(aux32[1], values[0]);
esh.sign_value(aux32[3], values[1]);
esh.sign_value(aux32[5], values[2]);
esh.sign_value(aux32[7], values[3]);
#endif
}
inline void make4_signed(const uint32_t * aux32, const __m256i& min_value, __m256i * values) const {
make4(aux32, values);
sign_values(aux32, values);
for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value);
}
inline void make4(const uint32_t * aux32, __m256i * values, __m256i * q8) const {
make4(aux32, values);
sign_values(aux32, q8);
}
inline void prepare(int i, int j) {
Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
make4_signed(data.val, min_value, bits.values);
}
inline void prepare(int i, int j, const Q8<1>& q8, __m256i * q8_quants) {
for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k);
Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
make4(data.val, bits.values, q8_quants);
}
constexpr static int minv = 43;
SimpleBits bits;
Scales8KBase scb;
EvenSignHelper esh;
const __m256i min_value = _mm256_set1_epi8(minv);
const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1);
};
/*
moonll:
add Q8_0_Unpacker and DequantizerIQ2XXS support
add the mul_mat_qX_K_q8_K_IQ kernels
*/
template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker> ||
std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
}
else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>|| std::is_same_v<Dequantizer, Q8_0_1_Unpacker>) {
m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_1_q8_1_T<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_1_q8_1_T<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_1_q8_1_T<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_1_q8_1_T<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_1_q8_1_T<Dequantizer, 8>;
}
else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS>) {
m.funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 8>;
}
else {
#ifdef HAVE_FANCY_SIMD
if constexpr (std::is_same_v<Dequantizer, DequantizerIQ4XS>) {
m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 1>;
m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 2>;
m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 3>;
m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 4>;
m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 5>;
m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 6>;
m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 7>;
m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 8>;
} else {
m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1<Dequantizer>;
m.funcs[1] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 8>;
}
#else
if constexpr (std::is_same_v<Dequantizer, DequantizerQ2K> ||
std::is_same_v<Dequantizer, DequantizerQ3K> ||
std::is_same_v<Dequantizer, DequantizerQ6K>) {
m.funcs[0] = mul_mat_qY_K_q8_K_T<Dequantizer, 1>;
m.funcs[1] = mul_mat_qY_K_q8_K_T<Dequantizer, 2>;
m.funcs[2] = mul_mat_qY_K_q8_K_T<Dequantizer, 3>;
m.funcs[3] = mul_mat_qY_K_q8_K_T<Dequantizer, 4>;
m.funcs[4] = mul_mat_qY_K_q8_K_T<Dequantizer, 5>;
m.funcs[5] = mul_mat_qY_K_q8_K_T<Dequantizer, 6>;
m.funcs[6] = mul_mat_qY_K_q8_K_T<Dequantizer, 7>;
m.funcs[7] = mul_mat_qY_K_q8_K_T<Dequantizer, 8>;
} else {
m.funcs[0] = mul_mat_qX_K_q8_K_T<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_K_q8_K_T<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_K_q8_K_T<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_K_q8_K_T<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_K_q8_K_T<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_K_q8_K_T<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_K_q8_K_T<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_K_q8_K_T<Dequantizer, 8>;
}
#endif
}
}
struct QFBase {
#ifdef __AVX512F__
constexpr static int k_step = 16;
using Data = __m512;
using Acc = __m512;
static inline Data load(const ggml_half * x) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x)); }
static inline Data load(const float * x) { return _mm512_loadu_ps(x); }
static inline Data load(const ggml_bf16_t * x) {
return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)x)), 16));
}
static inline Acc acc(Acc prev, const Data& y, const Data& x) {
return _mm512_fmadd_ps(y, x, prev);
}
static inline Acc acc_first(const Data& y, const Data& x) {
return _mm512_mul_ps(y, x);
}
static inline Acc add(Acc x, Acc y) { return _mm512_add_ps(x, y); }
static inline float hsum(Acc acc) {
return _mm512_reduce_add_ps(acc);
}
template <typename Float>
static inline Data load4Floats(const Float * x) {
return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0);
}
static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
acc = _mm512_fmadd_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00), acc);
acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
return acc;
}
static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
auto acc = _mm512_mul_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00));
acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
return acc;
}
static inline __m128 hsum_r4(Acc acc) {
auto sum1 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 0), _mm512_extractf32x4_ps(acc, 1));
auto sum2 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 2), _mm512_extractf32x4_ps(acc, 3));
return _mm_add_ps(sum1, sum2);
}
#else
constexpr static int k_step = 8;
using Data = __m256;
using Acc = __m256;
static inline Data load(const ggml_half * x) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)x)); }
static inline Data load(const float * x) { return _mm256_loadu_ps(x); }
static inline Data load(const ggml_bf16_t * x) {
return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)x)), 16));
}
static inline Acc acc(Acc prev, const Data& y, const Data& x) {
return _mm256_fmadd_ps(y, x, prev);
}
static inline Acc add(Acc x, Acc y) { return _mm256_add_ps(x, y); }
static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
acc = _mm256_fmadd_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00), acc);
acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
return acc;
}
static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
auto acc = _mm256_mul_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00));
acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
return acc;
}
static inline Acc acc_first(const Data& y, const Data& x) {
return _mm256_mul_ps(y, x);
}
static inline float hsum(Acc acc) {
return hsum_float_8(acc);
}
static inline __m128 hsum_r4(Acc acc) {
return _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
}
template <typename Float>
static inline Data load4Floats(const Float * x) {
return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0);
}
#endif
static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); }
static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); }
static inline __m128 load128(const ggml_bf16_t * x) {
return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*)x)), 16));
}
};
template <typename Float, int nrc_in> struct QFT final : public QFBase {
constexpr static int nrc = nrc_in;
QFT(const DataInfo& info) {
for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)info.src1_row(iy);
}
QFT(const char * cx, size_t bx) {
for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx);
}
IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); }
IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); }
IQK_ALWAYS_INLINE void load_r4(int ix, int i, Data * xv) const {
xv[0] = load1(ix+0, i);
xv[1] = load1(ix+1, i);
xv[2] = load1(ix+2, i);
xv[3] = load1(ix+3, i);
#ifdef __AVX512F__
auto t0 = _mm512_unpacklo_ps(xv[0], xv[1]);
auto t1 = _mm512_unpacklo_ps(xv[2], xv[3]);
auto t2 = _mm512_unpackhi_ps(xv[0], xv[1]);
auto t3 = _mm512_unpackhi_ps(xv[2], xv[3]);
xv[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
xv[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
xv[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
xv[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
#else
auto t0 = _mm256_unpacklo_ps(xv[0], xv[1]);
auto t1 = _mm256_unpacklo_ps(xv[2], xv[3]);
auto t2 = _mm256_unpackhi_ps(xv[0], xv[1]);
auto t3 = _mm256_unpackhi_ps(xv[2], xv[3]);
xv[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
xv[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
xv[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
xv[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
#endif
}
const Float * y[nrc];
};
template <typename Qy, typename Qx>
IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) {
int nb = n/QFBase::k_step;
int nb4 = n/4;
Qy y(info);
Qx x(cx + ix0*bx, bx);
QFBase::Data xv[Qx::nrc];
QFBase::Acc acc[Qx::nrc*Qy::nrc];
auto yv = y.load1(0, 0);
for (int ix = 0; ix < Qx::nrc; ++ix) {
xv[ix] = x.load1(ix, 0);
acc[ix] = QFBase::acc_first(yv, xv[ix]);
}
for (int iy = 1; iy < Qy::nrc; ++iy) {
yv = y.load1(iy, 0);
for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc_first(yv, xv[ix]);
}
for (int i = 1; i < nb; ++i) {
yv = y.load1(0, i);
for (int ix = 0; ix < Qx::nrc; ++ix) {
xv[ix] = x.load1(ix, i);
acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
}
for (int iy = 1; iy < Qy::nrc; ++iy) {
yv = y.load1(iy, i);
for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
}
}
for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) {
yv = y.load_tail(0, i);
for (int ix = 0; ix < Qx::nrc; ++ix) {
xv[ix] = x.load_tail(ix, i);
acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
}
for (int iy = 1; iy < Qy::nrc; ++iy) {
yv = y.load_tail(iy, i);
for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
}
}
for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix]));
}
// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done
// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in
// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now.
template <int nrc_y, typename FloatX, typename FloatY>
void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
const char * cx = (const char *)vx;
// TBD if we want this
//if constexpr (nrc_y == 1) {
// constexpr int k_nx = 2;
// for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
// mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
// }
// if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) {
// int nx = nrc_x - lastx;
// switch (nx) {
// case 1: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info); break;
// case 2: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, lastx, info); break;
// case 3: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, lastx, info); break;
// }
// //mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info);
// }
// return;
//}
#ifdef __AVX512F__
constexpr int k_nx = 5;
#else
constexpr int k_nx = nrc_y == 1 ? 4 : 2;
#endif
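// Tile width choice (likely a register-budget consideration): with k_nx = 5 and up to nrc_y = 5
// there are 25 accumulators plus the x tile and the current y vector, which still fits in the
// 32 zmm registers on AVX512; the AVX2 path presumably uses a narrower tile because only
// 16 ymm registers are available.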
for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
}
int last_x = k_nx*(nrc_x/k_nx);
if (last_x == nrc_x) return;
int nx = nrc_x - last_x;
#ifdef __AVX512F__
switch (nx) {
case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
case 4: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 4>>(n, cx, bx, last_x, info); break;
}
#else
if constexpr (nrc_y == 1) {
switch (nx) {
case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
}
} else {
switch (nx) {
case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
}
}
#endif
}
template <typename FloatX, typename FloatY>
void set_mul_mat_f(MulMat& mm) {
for (auto& f : mm.funcs) f = nullptr;
mm.funcs[0] = mul_mat_fX_fY_T<1, FloatX, FloatY>;
mm.funcs[1] = mul_mat_fX_fY_T<2, FloatX, FloatY>;
mm.funcs[2] = mul_mat_fX_fY_T<3, FloatX, FloatY>;
mm.funcs[3] = mul_mat_fX_fY_T<4, FloatX, FloatY>;
mm.funcs[4] = mul_mat_fX_fY_T<5, FloatX, FloatY>;
#ifndef __AVX512F__
mm.funcs[5] = mul_mat_fX_fY_T<6, FloatX, FloatY>;
#endif
}
/*
moonll:
add the typeB check: return false when the B matrix is not of the expected type
add IQ2_XXS
add IQ1_S
add GGML_TYPE_IQ4_XS
*/
bool MulMat::set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
(void)Ny;
auto expected_typeB = GGML_TYPE_Q8_K;
switch (typeA) {
case GGML_TYPE_Q2_K:
assert (ne00 % QK_K == 0);
MulMat::set_functions<DequantizerQ2K>(mm);
break;
case GGML_TYPE_Q3_K:
assert (ne00 % QK_K == 0);
MulMat::set_functions<DequantizerQ3K>(mm);
break;
case GGML_TYPE_Q4_K:
assert (ne00 % QK_K == 0);
MulMat::set_functions<DequantizerQ4K>(mm);
break;
case GGML_TYPE_Q5_K:
assert (ne00 % QK_K == 0);
MulMat::set_functions<DequantizerQ5K>(mm);
break;
case GGML_TYPE_Q6_K:
assert (ne00 % QK_K == 0);
MulMat::set_functions<DequantizerQ6K>(mm);
break;
case GGML_TYPE_IQ4_XS:
assert (ne00 % QK_K == 0);
MulMat::set_functions<DequantizerIQ4XS>(mm);
break;
case GGML_TYPE_IQ2_XXS:
assert (ne00 % QK_K == 0);
MulMat::set_functions<DequantizerIQ2XXS>(mm);
break;
case GGML_TYPE_Q4_0:
assert (ne00 % QK4_0 == 0);
MulMat::set_functions<Q4_0_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_0;
break;
case GGML_TYPE_Q4_1:
assert (ne00 % QK4_1 == 0);
MulMat::set_functions<Q4_1_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_1_X4;
break;
case GGML_TYPE_Q5_0:
assert (ne00 % QK5_0 == 0);
MulMat::set_functions<Q5_0_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_0;
break;
case GGML_TYPE_Q5_1:
assert (ne00 % QK5_1 == 0);
MulMat::set_functions<Q5_1_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_1_X4;
break;
case GGML_TYPE_Q8_0:
assert (ne00 % QK8_0 == 0);
#ifdef HAVE_FANCY_SIMD
MulMat::set_functions<Q8_0_1_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_1_X4;
#else
MulMat::set_functions<Q8_0_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_0_X4;
#endif
break;
case GGML_TYPE_IQ1_S:
mm.funcs[0] = mul_mat_iq1_s_q8_K<1>;
mm.funcs[1] = mul_mat_iq1_s_q8_K<2>;
mm.funcs[2] = mul_mat_iq1_s_q8_K<3>;
mm.funcs[3] = mul_mat_iq1_s_q8_K<4>;
mm.funcs[4] = mul_mat_iq1_s_q8_K<5>;
mm.funcs[5] = mul_mat_iq1_s_q8_K<6>;
mm.funcs[6] = mul_mat_iq1_s_q8_K<7>;
mm.funcs[7] = mul_mat_iq1_s_q8_K<8>;
#ifdef HAVE_FANCY_SIMD
mm.func16 = mul_mat_iq1_s_q8_K<16>;
#endif
// row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
expected_typeB = GGML_TYPE_Q8_K;
break;
default:
{
printf("case:%d",typeA);
return false;
}
}
return ggml_type(typeB) == expected_typeB;
}
} // namespace
/*
iq1_s is not supported on ARM
*/
#else // __aarch64__
#include <arm_neon.h>
namespace {
template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
constexpr static int nrc_y = nrc;
Q8(const DataInfo& info) {
for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
}
inline int8x16_t load_quants_16(int iy, int i, int j) const { return vld1q_s8(y[iy][i].qs + 16*j); }
inline int8x16x2_t load_quants(int iy, int i, int j) const { return vld1q_s8_x2(y[iy][i].qs + 32*j); }
inline int8x16x4_t load_quants_64(int iy, int i, int j) const { return vld1q_s8_x4(y[iy][i].qs + 64*j); }
inline int16x8x2_t load_bsums(int iy, int i) const { return vld1q_s16_x2(y[iy][i].bsums); }
inline int16x8_t load_bsums8(int iy, int i) const {
auto q8s = vld1q_s16_x2(y[iy][i].bsums);
return vpaddq_s16(q8s.val[0], q8s.val[1]);
}
inline float scale(int iy, int i) const { return y[iy][i].d; }
const block_q8 * y[nrc_y];
};
template <typename block_q>
struct BaseDequantizer {
BaseDequantizer(const void * vx, size_t bx, int nrc) : vx(vx), x(nullptr), bx(bx), nrc(nrc) {}
inline void new_row(int ix) { x = (const block_q *)((const char *)vx + ix*bx); }
const void * vx;
const block_q * x;
const size_t bx;
const int nrc;
};
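// Q4bits unpacks 64 packed 4-bit quants into eight byte vectors (b1, b2):
// prepare()/prepare_v2() order each pair of 16-byte loads as low nibbles first (val[0..1]) then
// high nibbles (val[2..3]); the *_16 variants interleave low/high per 16-byte load; prepare64()
// puts all low nibbles in b1 and all high nibbles in b2.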
struct Q4bits {
const uint8x16_t m4b = vdupq_n_u8(0xf);
uint8x16x4_t b1, b2;
inline void prepare4(uint8x16x4_t& b, const uint8x16_t * val) const {
b.val[0] = vandq_u8(val[0], m4b);
b.val[2] = vshrq_n_u8(val[0], 4);
b.val[1] = vandq_u8(val[1], m4b);
b.val[3] = vshrq_n_u8(val[1], 4);
}
inline void prepare4_16(uint8x16x4_t& b, const uint8x16_t * val) const {
b.val[0] = vandq_u8(val[0], m4b);
b.val[1] = vshrq_n_u8(val[0], 4);
b.val[2] = vandq_u8(val[1], m4b);
b.val[3] = vshrq_n_u8(val[1], 4);
}
inline void prepare(const uint8_t * qs) {
auto q4bits = vld1q_u8_x2(qs);
prepare4(b1, q4bits.val);
q4bits = vld1q_u8_x2(qs+32);
prepare4(b2, q4bits.val);
}
inline void prepare_v2(const uint8_t * qs) {
auto q4bits = vld1q_u8_x4(qs);
prepare4(b1, q4bits.val+0);
prepare4(b2, q4bits.val+2);
}
inline void prepare64(const uint8_t * qs) {
auto q4bits = vld1q_u8_x4(qs);
b1.val[0] = vandq_u8(q4bits.val[0], m4b);
b1.val[1] = vandq_u8(q4bits.val[1], m4b);
b1.val[2] = vandq_u8(q4bits.val[2], m4b);
b1.val[3] = vandq_u8(q4bits.val[3], m4b);
b2.val[0] = vshrq_n_u8(q4bits.val[0], 4);
b2.val[1] = vshrq_n_u8(q4bits.val[1], 4);
b2.val[2] = vshrq_n_u8(q4bits.val[2], 4);
b2.val[3] = vshrq_n_u8(q4bits.val[3], 4);
}
inline void prepare16(const uint8_t * qs) {
auto q4bits = vld1q_u8_x2(qs);
prepare4_16(b1, q4bits.val);
q4bits = vld1q_u8_x2(qs+32);
prepare4_16(b2, q4bits.val);
}
inline void prepare16_v2(const uint8_t * qs) {
auto q4bits = vld1q_u8_x4(qs);
prepare4_16(b1, q4bits.val+0);
prepare4_16(b2, q4bits.val+2);
}
};
struct Scales8 {
uint32_t utmp[4];
const uint8_t * sc8 = (const uint8_t *)utmp;
template <typename Q8, typename Qx>
inline int32x4x2_t process_scales_mins(const Qx& x, const Q8& q8, int i, float32x4_t * acc) {
make_q4_scales(x.scales, utmp);
int16x8_t mins = vmovl_s8(vld1_s8((const int8_t *)sc8 + 8));
accum_mins_8(mins, q8, acc, i, -GGML_FP16_TO_FP32(x.dmin));
uint8x8_t scales8 = vld1_u8(sc8);
uint16x8_t scales16 = vmovl_u8(scales8);
int32x4x2_t scales = {vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales16))),
vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales16)))};
return scales;
}
};
struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
DequantizerQ4K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
constexpr static int num_blocks() { return 8; }
constexpr static bool should_scale_quants() { return false; }
template <typename Q8>
inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
d = GGML_FP16_TO_FP32(x[i].d);
return s8.process_scales_mins(x[i], q8, i, acc);
}
inline void prepare(int i, int j) {
if (nrc == 1) bits.prepare_v2(x[i].qs+64*j);
else bits.prepare(x[i].qs+64*j);
}
Q4bits bits;
Scales8 s8;
float d;
};
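// Q6_K: the low 4 bits of each quant live in ql and the upper 2 bits in qh. prepare() ORs the
// appropriately shifted high bits (masked with 0x30) on top of the nibbles to form 6-bit values;
// the -32 offset of q6_K is folded into the bsum correction passed to process_scales_mins_16
// (-32.f*d) instead of being subtracted per value.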
struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
DequantizerQ6K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
constexpr static int num_blocks() { return 16; }
constexpr static bool should_scale_quants() { return false; }
template <typename Q8>
inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
d = GGML_FP16_TO_FP32(x[i].d);
return process_scales_mins_16(vld1q_s8(x[i].scales), q8, acc, i, -32.f*d);
}
inline void prepare(int i, int j) {
auto hbits = vld1q_u8_x2(x[i].qh + 32*j);
bits.prepare64(x[i].ql+64*j);
bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(vshlq_n_u8(hbits.val[0], 4), mhb));
bits.b1.val[1] = vorrq_u8(bits.b1.val[1], vandq_u8(vshlq_n_u8(hbits.val[1], 4), mhb));
bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(vshlq_n_u8(hbits.val[0], 2), mhb));
bits.b1.val[3] = vorrq_u8(bits.b1.val[3], vandq_u8(vshlq_n_u8(hbits.val[1], 2), mhb));
bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(hbits.val[0], mhb));
bits.b2.val[1] = vorrq_u8(bits.b2.val[1], vandq_u8(hbits.val[1], mhb));
bits.b2.val[2] = vorrq_u8(bits.b2.val[2], vandq_u8(vshrq_n_u8(hbits.val[0], 2), mhb));
bits.b2.val[3] = vorrq_u8(bits.b2.val[3], vandq_u8(vshrq_n_u8(hbits.val[1], 2), mhb));
}
Q4bits bits;
const uint8x16_t mhb = vdupq_n_u8(0x30);
float d;
};
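// BlockQxK repacks a tile of quantized rows into contiguous, interleaved int8 values plus
// per-sub-block scales (and mins when NeedSum is set) so that the NEON matrix-multiplication
// kernels can work on BS rows at a time; BS, QK, SS and NeedSum are tile parameters defined
// elsewhere in this file.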
template <typename Dequantizer>
struct BlockQxK {
inline BlockQxK(const int maxn, const int maxk): maxn(maxn), maxk(maxk) {
values = (int8_t*)aligned_alloc(256, maxn * maxk * sizeof(int8_t));
scales = (int*)aligned_alloc(256, maxn * maxk / SS * sizeof(int));
ds = (float*)aligned_alloc(256, maxn * maxk / QK * sizeof(float));
if constexpr (NeedSum) {
dmins = (float*)aligned_alloc(256, maxn * maxk / QK * sizeof(float));
scalems = (int16_t*)aligned_alloc(256, maxn * maxk / SS * sizeof(int16_t));
}
}
inline ~BlockQxK() {
free(values);
free(scales);
free(ds);
if constexpr (NeedSum) {
free(dmins);
free(scalems);
}
}
inline int FromDequantizer(const void * vx, size_t bx, int idx, int n_, int k_) {
n = n_;
k = k_;
bn = n / BS;
bk = k / QK;
Dequantizer deq(vx, bx, 1);
for (int i = 0; i < n; i += BS) {
for (int j = 0; j < BS; j ++) {
deq.new_row(j + i + idx);
for (int x = 0; x < bk; x ++) {
{
int8x16_t base = NeedSum ? vdupq_n_s8(0) : vdupq_n_s8(32);
int32_t *dst = (int32_t*)(values + i*k + j*4 + x*QK*BS);
deq.prepare(x, 0);
int8x16_t v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[0]), base);
int8x16_t v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[1]), base);
int8x16_t v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[2]), base);
int8x16_t v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[3]), base);
*(dst + (0 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0);
*(dst + (1 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1);
*(dst + (2 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2);
*(dst + (3 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3);
*(dst + (0 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0);
*(dst + (1 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1);
*(dst + (2 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2);
*(dst + (3 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3);
*(dst + (0 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0);
*(dst + (1 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1);
*(dst + (2 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2);
*(dst + (3 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3);
*(dst + (0 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0);
*(dst + (1 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1);
*(dst + (2 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2);
*(dst + (3 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3);
v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[0]), base);
v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[1]), base);
v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[2]), base);
v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[3]), base);
*(dst + (0 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0);
*(dst + (1 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1);
*(dst + (2 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2);
*(dst + (3 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3);
*(dst + (0 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0);
*(dst + (1 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1);
*(dst + (2 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2);
*(dst + (3 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3);
*(dst + (0 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0);
*(dst + (1 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1);
*(dst + (2 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2);
*(dst + (3 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3);
*(dst + (0 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0);
*(dst + (1 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1);
*(dst + (2 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2);
*(dst + (3 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3);
deq.prepare(x, 1);
v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[0]), base);
v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[1]), base);
v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[2]), base);
v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[3]), base);
*(dst + (0 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0);
*(dst + (1 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1);
*(dst + (2 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2);
*(dst + (3 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3);
*(dst + (0 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0);
*(dst + (1 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1);
*(dst + (2 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2);
*(dst + (3 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3);
*(dst + (0 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0);
*(dst + (1 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1);
*(dst + (2 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2);
*(dst + (3 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3);
*(dst + (0 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0);
*(dst + (1 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1);
*(dst + (2 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2);
*(dst + (3 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3);
v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[0]), base);
v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[1]), base);
v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[2]), base);
v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[3]), base);
*(dst + (0 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0);
*(dst + (1 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1);
*(dst + (2 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2);
*(dst + (3 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3);
*(dst + (0 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0);
*(dst + (1 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1);
*(dst + (2 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2);
*(dst + (3 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3);
*(dst + (0 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0);
*(dst + (1 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1);
*(dst + (2 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2);
*(dst + (3 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3);
*(dst + (0 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0);
*(dst + (1 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1);
*(dst + (2 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2);
*(dst + (3 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3);
}
if constexpr (std::is_same_v<DequantizerQ6K, Dequantizer>)
{
int32_t *dst = (int32_t*)(scales + i*(k/SS) + j + x*QK/SS*BS);
int8x16_t ss = vld1q_s8(deq.x[x].scales);
int16x8_t s16_0 = vmovl_s8(vget_low_s8(ss));
int16x8_t s16_1 = vmovl_s8(vget_high_s8(ss));
int32x4_t s32_0 = vmovl_s16(vget_low_s16(s16_0));
int32x4_t s32_1 = vmovl_s16(vget_high_s16(s16_0));
int32x4_t s32_2 = vmovl_s16(vget_low_s16(s16_1));
int32x4_t s32_3 = vmovl_s16(vget_high_s16(s16_1));
*(dst + (0+0*4)*BS) = vgetq_lane_s32(s32_0, 0);
*(dst + (1+0*4)*BS) = vgetq_lane_s32(s32_0, 1);
*(dst + (2+0*4)*BS) = vgetq_lane_s32(s32_0, 2);
*(dst + (3+0*4)*BS) = vgetq_lane_s32(s32_0, 3);
*(dst + (0+1*4)*BS) = vgetq_lane_s32(s32_1, 0);
*(dst + (1+1*4)*BS) = vgetq_lane_s32(s32_1, 1);
*(dst + (2+1*4)*BS) = vgetq_lane_s32(s32_1, 2);
*(dst + (3+1*4)*BS) = vgetq_lane_s32(s32_1, 3);
*(dst + (0+2*4)*BS) = vgetq_lane_s32(s32_2, 0);
*(dst + (1+2*4)*BS) = vgetq_lane_s32(s32_2, 1);
*(dst + (2+2*4)*BS) = vgetq_lane_s32(s32_2, 2);
*(dst + (3+2*4)*BS) = vgetq_lane_s32(s32_2, 3);
*(dst + (0+3*4)*BS) = vgetq_lane_s32(s32_3, 0);
*(dst + (1+3*4)*BS) = vgetq_lane_s32(s32_3, 1);
*(dst + (2+3*4)*BS) = vgetq_lane_s32(s32_3, 2);
*(dst + (3+3*4)*BS) = vgetq_lane_s32(s32_3, 3);
}
if constexpr (std::is_same_v<DequantizerQ4K, Dequantizer>)
{
int32_t *dst = (int32_t*)(scales + i*(k/SS) + j + x*QK/SS*BS);
int16_t *dst2 = (int16_t*)(scalems + i*(k/SS) + j + x*QK/SS*BS);
uint32_t utmp[4];
const uint8_t * sc8 = (const uint8_t *)utmp;
make_q4_scales(deq.x[x].scales, utmp);
int8x16_t ss = vld1q_s8((const int8_t *)sc8);
int16x8_t scale = vmovl_s8(vget_low_s8(ss));
int16x8_t scale_min = vmovl_high_s8(ss);
int32x4_t s32_0 = vmovl_s16(vget_low_s16(scale));
int32x4_t s32_1 = vmovl_s16(vget_high_s16(scale));
*(dst + (0+0*4)*BS) = vgetq_lane_s32(s32_0, 0);
*(dst + (1+0*4)*BS) = vgetq_lane_s32(s32_0, 1);
*(dst + (2+0*4)*BS) = vgetq_lane_s32(s32_0, 2);
*(dst + (3+0*4)*BS) = vgetq_lane_s32(s32_0, 3);
*(dst + (0+1*4)*BS) = vgetq_lane_s32(s32_1, 0);
*(dst + (1+1*4)*BS) = vgetq_lane_s32(s32_1, 1);
*(dst + (2+1*4)*BS) = vgetq_lane_s32(s32_1, 2);
*(dst + (3+1*4)*BS) = vgetq_lane_s32(s32_1, 3);
*(dst2 + 0*BS) = vgetq_lane_s16(scale_min, 0);
*(dst2 + 1*BS) = vgetq_lane_s16(scale_min, 1);
*(dst2 + 2*BS) = vgetq_lane_s16(scale_min, 2);
*(dst2 + 3*BS) = vgetq_lane_s16(scale_min, 3);
*(dst2 + 4*BS) = vgetq_lane_s16(scale_min, 4);
*(dst2 + 5*BS) = vgetq_lane_s16(scale_min, 5);
*(dst2 + 6*BS) = vgetq_lane_s16(scale_min, 6);
*(dst2 + 7*BS) = vgetq_lane_s16(scale_min, 7);
}
{
float *dst = ds + i*bk + j + x*BS;
*dst = GGML_FP16_TO_FP32(deq.x[x].d);
}
if constexpr (std::is_same_v<DequantizerQ4K, Dequantizer>)
{
float *dst = dmins + i*bk + j + x*BS;
*dst = - GGML_FP16_TO_FP32(deq.x[x].dmin);
}
}
}
}
return 0;
}
int8_t *values; // [bn][k/4][BS][4]
int *scales; // [bn][k/SS][BS]
float *ds; // [bn][bk][BS]
float *dmins; // [bn][bk][BS]
int16_t *scalems; // [bn][k/SS][BS]
static constexpr int BS = 8;
static constexpr int QK = 256;
static constexpr int SS = std::is_same_v<Dequantizer, DequantizerQ6K> ? 16 : 32;
static constexpr int NeedSum = std::is_same_v<Dequantizer, DequantizerQ6K> ? 0 : 1;
const int maxn;
const int maxk;
int n;
int k;
int bn;
int bk;
};
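// matmul_v2_kernel below consumes this repacked layout: `values` interleaves
// the LHS quants as groups of 4 consecutive k values for each of the BS=8
// rows, `scales` holds one int32 sub-block scale per row, and `ds`/`dmins`
// hold the per-super-block float scales (dmins already negated). For one
// output element (row r, q8 column c) the kernel effectively computes
//   out[r][c] = sum_i d[r][i]*d8[c][i] * sum_b scale[r][i][b]*dot(q[r][i][b], q8[c][i][b])
//             + sum_i dmins[r][i]*d8[c][i] * sum_b min[r][i][b]*bsum8[c][i][b]   (Q4_K only)
// where i runs over 256-quant super-blocks and b over sub-blocks of SS quants.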
template <typename Dequantizer, int BN>
IQK_NOINLINE void matmul_v2_kernel(const Dequantizer *a, const block_q8_K *y[BN], const DataInfo &info, int idx, int idy) {
constexpr int BS = a->BS;
constexpr int QK = a->QK;
constexpr int SS = a->SS;
for (int s = 0; s < a->n; s += BS) {
float32x4_t cc[BN][BS/4];
for (int i = 0; i < BN; i ++) {
for (int j = 0; j < BS/4; j ++) {
cc[i][j] = vdupq_n_f32(0);
}
}
const int8_t *a_ptr = a->values + s*a->k;
const int8_t *b_ptr[BN];
for (int k = 0; k < a->bk; k ++) {
for (int i = 0; i < BN; i ++) {
b_ptr[i] = y[i][k].qs;
}
int32x4_t cci[BN][BS/4];
if constexpr (BN == 4 && SS == 16) {
int64_t length = QK/SS;
auto ap = a_ptr;
auto sp = a->scales + s*a->k/SS + (k*QK/SS)*BS;
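// SDOT micro-kernel for BN=4, SS=16 (the Q6_K packing): each iteration loads
// 16 q8 bytes per RHS column into v12-v15, streams the packed LHS (8 rows x 4
// consecutive k) through v8-v11, and accumulates 8 rows x 4 columns of int32
// partial sums in v0-v7, stepping through the four k-groups of each column
// via the sdot lane index. The 8 per-row sub-block scales are then loaded
// from sp and folded in with mla before moving to the next 16-wide sub-block.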
asm volatile (
" eor %[c00].16b, %[c00].16b, %[c00].16b \n"
" eor %[c10].16b, %[c10].16b, %[c10].16b \n"
" eor %[c20].16b, %[c20].16b, %[c20].16b \n"
" eor %[c30].16b, %[c30].16b, %[c30].16b \n"
" eor %[c01].16b, %[c01].16b, %[c01].16b \n"
" eor %[c11].16b, %[c11].16b, %[c11].16b \n"
" eor %[c21].16b, %[c21].16b, %[c21].16b \n"
" eor %[c31].16b, %[c31].16b, %[c31].16b \n"
" loop_%=: \n"
" subs %[len], %[len], #1 \n"
" ld1 {v12.16b}, [%[bp0]], #16 \n"
" ld1 {v13.16b}, [%[bp1]], #16 \n"
" ld1 {v14.16b}, [%[bp2]], #16 \n"
" ld1 {v15.16b}, [%[bp3]], #16 \n"
" prfm pldl1strm, [%[ap], #256] \n"
" ld1 {v8.16b}, [%[ap]], #16 \n"
" ld1 {v9.16b}, [%[ap]], #16 \n"
" eor v0.16b, v0.16b, v0.16b \n"
" eor v1.16b, v1.16b, v1.16b \n"
" eor v2.16b, v2.16b, v2.16b \n"
" eor v3.16b, v3.16b, v3.16b \n"
" eor v4.16b, v4.16b, v4.16b \n"
" eor v5.16b, v5.16b, v5.16b \n"
" eor v6.16b, v6.16b, v6.16b \n"
" eor v7.16b, v7.16b, v7.16b \n"
" ld1 {v10.16b}, [%[ap]], #16 \n"
" ld1 {v11.16b}, [%[ap]], #16 \n"
" sdot v0.4s, v8.16b, v12.4b[0] \n"
" sdot v1.4s, v8.16b, v13.4b[0] \n"
" sdot v2.4s, v8.16b, v14.4b[0] \n"
" sdot v3.4s, v8.16b, v15.4b[0] \n"
" sdot v4.4s, v9.16b, v12.4b[0] \n"
" sdot v5.4s, v9.16b, v13.4b[0] \n"
" sdot v6.4s, v9.16b, v14.4b[0] \n"
" sdot v7.4s, v9.16b, v15.4b[0] \n"
" prfm pldl1strm, [%[ap], #256] \n"
" ld1 {v8.16b}, [%[ap]], #16 \n"
" ld1 {v9.16b}, [%[ap]], #16 \n"
" sdot v0.4s, v10.16b, v12.4b[1] \n"
" sdot v1.4s, v10.16b, v13.4b[1] \n"
" sdot v2.4s, v10.16b, v14.4b[1] \n"
" sdot v3.4s, v10.16b, v15.4b[1] \n"
" sdot v4.4s, v11.16b, v12.4b[1] \n"
" sdot v5.4s, v11.16b, v13.4b[1] \n"
" sdot v6.4s, v11.16b, v14.4b[1] \n"
" sdot v7.4s, v11.16b, v15.4b[1] \n"
" ld1 {v10.16b}, [%[ap]], #16 \n"
" ld1 {v11.16b}, [%[ap]], #16 \n"
" sdot v0.4s, v8.16b, v12.4b[2] \n"
" sdot v1.4s, v8.16b, v13.4b[2] \n"
" sdot v2.4s, v8.16b, v14.4b[2] \n"
" sdot v3.4s, v8.16b, v15.4b[2] \n"
" sdot v4.4s, v9.16b, v12.4b[2] \n"
" sdot v5.4s, v9.16b, v13.4b[2] \n"
" sdot v6.4s, v9.16b, v14.4b[2] \n"
" sdot v7.4s, v9.16b, v15.4b[2] \n"
" ld1 {v8.4s}, [%[sp]], #16 \n"
" ld1 {v9.4s}, [%[sp]], #16 \n"
" sdot v0.4s, v10.16b, v12.4b[3] \n"
" sdot v1.4s, v10.16b, v13.4b[3] \n"
" sdot v2.4s, v10.16b, v14.4b[3] \n"
" sdot v3.4s, v10.16b, v15.4b[3] \n"
" sdot v4.4s, v11.16b, v12.4b[3] \n"
" sdot v5.4s, v11.16b, v13.4b[3] \n"
" sdot v6.4s, v11.16b, v14.4b[3] \n"
" sdot v7.4s, v11.16b, v15.4b[3] \n"
" mla %[c00].4s, v0.4s, v8.4s \n"
" mla %[c10].4s, v1.4s, v8.4s \n"
" mla %[c20].4s, v2.4s, v8.4s \n"
" mla %[c30].4s, v3.4s, v8.4s \n"
" mla %[c01].4s, v4.4s, v9.4s \n"
" mla %[c11].4s, v5.4s, v9.4s \n"
" mla %[c21].4s, v6.4s, v9.4s \n"
" mla %[c31].4s, v7.4s, v9.4s \n"
" bne loop_%= \n"
" exit_%=:\n"
: [len] "+r" (length)
, [ap] "+r" (ap)
, [bp0] "+r" (b_ptr[0])
, [bp1] "+r" (b_ptr[1])
, [bp2] "+r" (b_ptr[2])
, [bp3] "+r" (b_ptr[3])
, [sp] "+r" (sp)
, [c00] "+w" (cci[0][0])
, [c10] "+w" (cci[1][0])
, [c20] "+w" (cci[2][0])
, [c30] "+w" (cci[3][0])
, [c01] "+w" (cci[0][1])
, [c11] "+w" (cci[1][1])
, [c21] "+w" (cci[2][1])
, [c31] "+w" (cci[3][1])
:
: "v0", "v1", "v2", "v3"
, "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11"
, "v12", "v13", "v14", "v15"
, "memory", "cc"
);
a_ptr += BS * QK;
} else if (BN == 4 && SS == 32) {
int64_t length = QK/SS;
auto ap = a_ptr;
auto sp = a->scales + s*a->k/SS + (k*QK/SS)*BS;
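// Same micro-kernel for SS=32 (the Q4_K packing): two 16-byte q8 fetches per
// RHS column are consumed before the single set of per-32 sub-block scales is
// applied with mla.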
asm volatile (
" eor %[c00].16b, %[c00].16b, %[c00].16b \n"
" eor %[c10].16b, %[c10].16b, %[c10].16b \n"
" eor %[c20].16b, %[c20].16b, %[c20].16b \n"
" eor %[c30].16b, %[c30].16b, %[c30].16b \n"
" eor %[c01].16b, %[c01].16b, %[c01].16b \n"
" eor %[c11].16b, %[c11].16b, %[c11].16b \n"
" eor %[c21].16b, %[c21].16b, %[c21].16b \n"
" eor %[c31].16b, %[c31].16b, %[c31].16b \n"
" loop_%=: \n"
" subs %[len], %[len], #1 \n"
" ld1 {v12.16b}, [%[bp0]], #16 \n"
" ld1 {v13.16b}, [%[bp1]], #16 \n"
" ld1 {v14.16b}, [%[bp2]], #16 \n"
" ld1 {v15.16b}, [%[bp3]], #16 \n"
" prfm pldl1strm, [%[ap], #256] \n"
" ld1 {v8.16b}, [%[ap]], #16 \n"
" ld1 {v9.16b}, [%[ap]], #16 \n"
" eor v0.16b, v0.16b, v0.16b \n"
" eor v1.16b, v1.16b, v1.16b \n"
" eor v2.16b, v2.16b, v2.16b \n"
" eor v3.16b, v3.16b, v3.16b \n"
" eor v4.16b, v4.16b, v4.16b \n"
" eor v5.16b, v5.16b, v5.16b \n"
" eor v6.16b, v6.16b, v6.16b \n"
" eor v7.16b, v7.16b, v7.16b \n"
" ld1 {v10.16b}, [%[ap]], #16 \n"
" ld1 {v11.16b}, [%[ap]], #16 \n"
" sdot v0.4s, v8.16b, v12.4b[0] \n"
" sdot v1.4s, v8.16b, v13.4b[0] \n"
" sdot v2.4s, v8.16b, v14.4b[0] \n"
" sdot v3.4s, v8.16b, v15.4b[0] \n"
" sdot v4.4s, v9.16b, v12.4b[0] \n"
" sdot v5.4s, v9.16b, v13.4b[0] \n"
" sdot v6.4s, v9.16b, v14.4b[0] \n"
" sdot v7.4s, v9.16b, v15.4b[0] \n"
" prfm pldl1strm, [%[ap], #256] \n"
" ld1 {v8.16b}, [%[ap]], #16 \n"
" ld1 {v9.16b}, [%[ap]], #16 \n"
" sdot v0.4s, v10.16b, v12.4b[1] \n"
" sdot v1.4s, v10.16b, v13.4b[1] \n"
" sdot v2.4s, v10.16b, v14.4b[1] \n"
" sdot v3.4s, v10.16b, v15.4b[1] \n"
" sdot v4.4s, v11.16b, v12.4b[1] \n"
" sdot v5.4s, v11.16b, v13.4b[1] \n"
" sdot v6.4s, v11.16b, v14.4b[1] \n"
" sdot v7.4s, v11.16b, v15.4b[1] \n"
" ld1 {v10.16b}, [%[ap]], #16 \n"
" ld1 {v11.16b}, [%[ap]], #16 \n"
" sdot v0.4s, v8.16b, v12.4b[2] \n"
" sdot v1.4s, v8.16b, v13.4b[2] \n"
" sdot v2.4s, v8.16b, v14.4b[2] \n"
" sdot v3.4s, v8.16b, v15.4b[2] \n"
" sdot v4.4s, v9.16b, v12.4b[2] \n"
" sdot v5.4s, v9.16b, v13.4b[2] \n"
" sdot v6.4s, v9.16b, v14.4b[2] \n"
" sdot v7.4s, v9.16b, v15.4b[2] \n"
" prfm pldl1strm, [%[ap], #256] \n"
" ld1 {v8.16b}, [%[ap]], #16 \n"
" ld1 {v9.16b}, [%[ap]], #16 \n"
" sdot v0.4s, v10.16b, v12.4b[3] \n"
" sdot v1.4s, v10.16b, v13.4b[3] \n"
" sdot v2.4s, v10.16b, v14.4b[3] \n"
" sdot v3.4s, v10.16b, v15.4b[3] \n"
" sdot v4.4s, v11.16b, v12.4b[3] \n"
" sdot v5.4s, v11.16b, v13.4b[3] \n"
" sdot v6.4s, v11.16b, v14.4b[3] \n"
" sdot v7.4s, v11.16b, v15.4b[3] \n"
" ld1 {v10.16b}, [%[ap]], #16 \n"
" ld1 {v11.16b}, [%[ap]], #16 \n"
" ld1 {v12.16b}, [%[bp0]], #16 \n"
" ld1 {v13.16b}, [%[bp1]], #16 \n"
" ld1 {v14.16b}, [%[bp2]], #16 \n"
" ld1 {v15.16b}, [%[bp3]], #16 \n"
" sdot v0.4s, v8.16b, v12.4b[0] \n"
" sdot v1.4s, v8.16b, v13.4b[0] \n"
" sdot v2.4s, v8.16b, v14.4b[0] \n"
" sdot v3.4s, v8.16b, v15.4b[0] \n"
" sdot v4.4s, v9.16b, v12.4b[0] \n"
" sdot v5.4s, v9.16b, v13.4b[0] \n"
" sdot v6.4s, v9.16b, v14.4b[0] \n"
" sdot v7.4s, v9.16b, v15.4b[0] \n"
" prfm pldl1strm, [%[ap], #256] \n"
" ld1 {v8.16b}, [%[ap]], #16 \n"
" ld1 {v9.16b}, [%[ap]], #16 \n"
" sdot v0.4s, v10.16b, v12.4b[1] \n"
" sdot v1.4s, v10.16b, v13.4b[1] \n"
" sdot v2.4s, v10.16b, v14.4b[1] \n"
" sdot v3.4s, v10.16b, v15.4b[1] \n"
" sdot v4.4s, v11.16b, v12.4b[1] \n"
" sdot v5.4s, v11.16b, v13.4b[1] \n"
" sdot v6.4s, v11.16b, v14.4b[1] \n"
" sdot v7.4s, v11.16b, v15.4b[1] \n"
" ld1 {v10.16b}, [%[ap]], #16 \n"
" ld1 {v11.16b}, [%[ap]], #16 \n"
" sdot v0.4s, v8.16b, v12.4b[2] \n"
" sdot v1.4s, v8.16b, v13.4b[2] \n"
" sdot v2.4s, v8.16b, v14.4b[2] \n"
" sdot v3.4s, v8.16b, v15.4b[2] \n"
" sdot v4.4s, v9.16b, v12.4b[2] \n"
" sdot v5.4s, v9.16b, v13.4b[2] \n"
" sdot v6.4s, v9.16b, v14.4b[2] \n"
" sdot v7.4s, v9.16b, v15.4b[2] \n"
" ld1 {v8.4s}, [%[sp]], #16 \n"
" ld1 {v9.4s}, [%[sp]], #16 \n"
" sdot v0.4s, v10.16b, v12.4b[3] \n"
" sdot v1.4s, v10.16b, v13.4b[3] \n"
" sdot v2.4s, v10.16b, v14.4b[3] \n"
" sdot v3.4s, v10.16b, v15.4b[3] \n"
" sdot v4.4s, v11.16b, v12.4b[3] \n"
" sdot v5.4s, v11.16b, v13.4b[3] \n"
" sdot v6.4s, v11.16b, v14.4b[3] \n"
" sdot v7.4s, v11.16b, v15.4b[3] \n"
" mla %[c00].4s, v0.4s, v8.4s \n"
" mla %[c10].4s, v1.4s, v8.4s \n"
" mla %[c20].4s, v2.4s, v8.4s \n"
" mla %[c30].4s, v3.4s, v8.4s \n"
" mla %[c01].4s, v4.4s, v9.4s \n"
" mla %[c11].4s, v5.4s, v9.4s \n"
" mla %[c21].4s, v6.4s, v9.4s \n"
" mla %[c31].4s, v7.4s, v9.4s \n"
" bne loop_%= \n"
" exit_%=:\n"
: [len] "+r" (length)
, [ap] "+r" (ap)
, [bp0] "+r" (b_ptr[0])
, [bp1] "+r" (b_ptr[1])
, [bp2] "+r" (b_ptr[2])
, [bp3] "+r" (b_ptr[3])
, [sp] "+r" (sp)
, [c00] "+w" (cci[0][0])
, [c10] "+w" (cci[1][0])
, [c20] "+w" (cci[2][0])
, [c30] "+w" (cci[3][0])
, [c01] "+w" (cci[0][1])
, [c11] "+w" (cci[1][1])
, [c21] "+w" (cci[2][1])
, [c31] "+w" (cci[3][1])
:
: "v0", "v1", "v2", "v3"
, "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11"
, "v12", "v13", "v14", "v15"
, "memory", "cc"
);
a_ptr += BS * QK;
} else
{
for (int i = 0; i < BN; i ++) {
for (int j = 0; j < BS/4; j ++) {
cci[i][j] = vdupq_n_s32(0);
}
}
for (int k0 = 0; k0 < QK/SS; k0 ++) {
int32x4_t ccv[BN][BS/4];
for (int i = 0; i < BN; i ++) {
for (int j = 0; j < BS/4; j ++) {
ccv[i][j] = vdupq_n_s32(0);
}
}
#pragma unroll
for (int k2 = 0; k2 < SS; k2 += 16) {
const int OFFSET = 256;
__builtin_prefetch((a_ptr + OFFSET + 0*64), 0, 0);
__builtin_prefetch((a_ptr + OFFSET + 1*64), 0, 0);
int8x16_t bb[BN];
int8x16_t aa[BS/4];
for (int i = 0; i < BN; i ++) {
bb[i] = vld1q_s8(b_ptr[i]); b_ptr[i] += 16;
}
for (int k1 = 0; k1 < 4; k1 ++) {
for (int i = 0; i < BS/4; i ++) {
aa[i] = vld1q_s8(a_ptr); a_ptr += 16;
}
for (int i = 0; i < BN; i ++) {
for (int j = 0; j < BS/4; j ++) {
ccv[i][j] = vdotq_laneq_s32(ccv[i][j], aa[j], bb[i], k1);
}
}
}
}
int32x4_t scal[BS/4];
for (int i = 0; i < BS/4; i ++) {
scal[i] = vld1q_s32(a->scales + s*a->k/SS + (k*QK/SS+k0)*BS + i*4);
}
for (int i = 0; i < BN; i ++) {
for (int j = 0; j < BS/4; j ++) {
cci[i][j] = vmlaq_s32(cci[i][j], ccv[i][j], scal[j]);
}
}
}
}
float32x4_t scalf[BS/4];
for (int i = 0; i < BS/4; i ++) {
scalf[i] = vld1q_f32(a->ds + s*a->bk + k*BS + i*4);
}
for (int i = 0; i < BN; i ++) {
for (int j = 0; j < BS/4; j ++) {
cc[i][j] = vfmaq_f32(cc[i][j], vcvtq_f32_s32(cci[i][j]), vmulq_n_f32(scalf[j], y[i][k].d));
}
}
}
if constexpr (a->NeedSum) {
const int16_t *a_ptr = a->scalems + s*a->k/SS;
const int16_t *b_ptr[BN];
for (int k = 0; k < a->bk; k ++) {
for (int i = 0; i < BN; i ++) {
b_ptr[i] = y[i][k].bsums;
}
int32x4_t cci[BN][BS/4];
for (int i = 0; i < BN; i ++) {
for (int j = 0; j < BS/4; j ++) {
cci[i][j] = vdupq_n_s32(0);
}
}
for (int k0 = 0; k0 < QK/SS/4; k0 ++) {
int16x8_t bb[BN];
int16x8_t aa[BS/8];
for (int i = 0; i < BN; i ++) {
bb[i] = vld1q_s16(b_ptr[i]); b_ptr[i] += 8;
}
for (int k1 = 0; k1 < 4; k1 ++) {
for (int i = 0; i < BS/8; i ++) {
aa[i] = vld1q_s16(a_ptr); a_ptr += 8;
}
for (int i = 0; i < BN; i ++) {
for (int j = 0; j < BS/8; j ++) {
cci[i][2*j+0] = vmlal_laneq_s16(cci[i][2*j+0], vget_low_s16(aa[j]), bb[i], 2*k1+0);
cci[i][2*j+1] = vmlal_high_laneq_s16(cci[i][2*j+1], aa[j], bb[i], 2*k1+0);
cci[i][2*j+0] = vmlal_laneq_s16(cci[i][2*j+0], vget_low_s16(aa[j]), bb[i], 2*k1+1);
cci[i][2*j+1] = vmlal_high_laneq_s16(cci[i][2*j+1], aa[j], bb[i], 2*k1+1);
}
}
}
}
float32x4_t scalf[BS/4];
for (int i = 0; i < BS/4; i ++) {
scalf[i] = vld1q_f32(a->dmins + s*a->bk + k*BS + i*4);
}
for (int i = 0; i < BN; i ++) {
for (int j = 0; j < BS/4; j ++) {
cc[i][j] = vfmaq_f32(cc[i][j], vcvtq_f32_s32(cci[i][j]), vmulq_n_f32(scalf[j], y[i][k].d));
}
}
}
}
for (int i = 0; i < BN; i ++) {
for (int j = 0; j < BS/4; j ++) {
vst1q_f32(info.ptr(j*4+s+idx, i), cc[i][j]);
}
}
}
return;
}
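// Tiled driver for matmul_v2_kernel: rows are repacked m_step=64 at a time
// into a BlockQxK and multiplied against n_step=4 q8_K columns per kernel
// call; a remainder of 1-3 columns falls through to the BN=1/2/3
// instantiations, which take the generic intrinsics path inside the kernel.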
template <typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_T_v2(int m, int n, int k, const void * vx, size_t bx, const DataInfo& info) {
constexpr int m_step = 64;
constexpr int n_step = 4;
assert(m%m_step == 0);
int n2 = n - (n%n_step);
int left = n%n_step;
BlockQxK<Dequantizer> xx(m_step, k);
for (int i = 0; i < m; i += m_step) {
auto this_info = info;
int bm = (m - i) < m_step ? (m - i) : m_step;
xx.FromDequantizer(vx, bx, i, bm, k);
for (int j = 0; j < n2; j += n_step) {
Q8<n_step, block_q8_K> q8(this_info);
matmul_v2_kernel<BlockQxK<Dequantizer>, n_step>(&xx, q8.y, this_info, i, j);
this_info.cur_y += n_step;
}
if (left) {
switch (left) {
case 1:
{
Q8<1, block_q8_K> q8(this_info);
matmul_v2_kernel<BlockQxK<Dequantizer>, 1>(&xx, q8.y, this_info, i, n2);
this_info.cur_y += 1;
break;
}
case 2:
{
Q8<2, block_q8_K> q8(this_info);
matmul_v2_kernel<BlockQxK<Dequantizer>, 2>(&xx, q8.y, this_info, i, n2);
this_info.cur_y += 2;
break;
}
case 3:
{
Q8<3, block_q8_K> q8(this_info);
matmul_v2_kernel<BlockQxK<Dequantizer>, 3>(&xx, q8.y, this_info, i, n2);
this_info.cur_y += 3;
break;
}
}
}
}
return;
}
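// compute_8_blocks / compute_16_blocks reduce one half of a 256-quant
// super-block against one q8 row. In the 8-block variant each pair of
// ggml_vdotq_s32 calls covers one 32-quant sub-block and the vpaddq steps
// collapse the partial sums so every lane of the final vector holds one
// sub-block total; the 16-block variant treats each 16-byte dot as its own
// sub-block and therefore applies two scale vectors per call with vmlaq_s32.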
template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
const int32x4x2_t& scales, int iy, int i, int j, int32x4_t& sumi) {
auto mzero = vdupq_n_s32(0);
const int8x16_t * qs_1 = (const int8x16_t *)qx_1.val;
const int8x16_t * qs_2 = (const int8x16_t *)qx_2.val;
auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[0], q8b_1.val[0]), qs_1[1], q8b_1.val[1]); // block 1
auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[2], q8b_2.val[0]), qs_1[3], q8b_2.val[1]); // block 2
auto p12 = vpaddq_s32(p1, p2);
auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[0], q8b_3.val[0]), qs_2[1], q8b_3.val[1]); // block 3
auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[2], q8b_4.val[0]), qs_2[3], q8b_4.val[1]); // block 4
auto p34 = vpaddq_s32(p3, p4);
auto pall = vpaddq_s32(p12, p34);
sumi = vmlaq_s32(sumi, scales.val[j], pall);
}
template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const int8x16_t * qx, const Q8& q8,
const int32x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {
auto mzero = vdupq_n_s32(0);
auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[0], q8b_1.val[0]), qx[1], q8b_1.val[1]); // block 1
auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[2], q8b_2.val[0]), qx[3], q8b_2.val[1]); // block 2
auto p12 = vpaddq_s32(p1, p2);
auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[4], q8b_3.val[0]), qx[5], q8b_3.val[1]); // block 3
auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[6], q8b_4.val[0]), qx[7], q8b_4.val[1]); // block 4
auto p34 = vpaddq_s32(p3, p4);
auto pall = vpaddq_s32(p12, p34);
sumi = vmlaq_s32(sumi, scales, pall);
}
template <typename Q8>
IQK_ALWAYS_INLINE void compute_16_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
const int32x4x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {
auto mzero = vdupq_n_s32(0);
auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
auto p1 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[0]), q8b_1.val[0]),
ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[1]), q8b_1.val[1])); // blocks 0, 0, 1, 1,
auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
auto p2 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[2]), q8b_2.val[0]),
ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[3]), q8b_2.val[1])); // blocks 2, 2, 3, 3,
auto p12 = vpaddq_s32(p1, p2); // blocks 0, 1, 2, 3
sumi = vmlaq_s32(sumi, scales.val[2*j+0], p12);
auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
auto p3 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[0]), q8b_3.val[0]),
ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[1]), q8b_3.val[1])); // block 4, 4, 5, 5,
auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
auto p4 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[2]), q8b_4.val[0]),
ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[3]), q8b_4.val[1])); // block 6, 6, 7, 7,
auto p34 = vpaddq_s32(p3, p4); // blocks 4, 5, 6, 7
sumi = vmlaq_s32(sumi, scales.val[2*j+1], p34);
}
template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
Q8<nrc_y, block_q8_K> q8(info);
Dequantizer deq(vx, bx, nrc_y);
for (int ix = 0; ix < nrc_x; ++ix) {
deq.new_row(ix);
float32x4_t acc[nrc_y];
for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);
for (int i = 0; i < nb; ++i) {
int32x4_t sumi[nrc_y];
for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);
if constexpr (Dequantizer::num_blocks() == 8) {
auto scales = deq.new_block(i);
deq.prepare(i, 0);
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
deq.prepare(i, 1);
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
}
else if constexpr (Dequantizer::num_blocks() == 16) {
auto scales = deq.new_block(i);
deq.prepare(i, 0);
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
deq.prepare(i, 1);
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
}
else {
GGML_ASSERT(false);
}
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) {
acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
}
}
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, vaddvq_f32(acc[iy]));
}
}
}
template <typename Q8>
inline void accum_mins_8(const int16x8_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
auto q8s = q8.load_bsums8(iy, i);
int32x4_t b1 = vmull_s16(vget_low_s16(mins), vget_low_s16(q8s));
int32x4_t b2 = vmull_s16(vget_high_s16(mins), vget_high_s16(q8s));
float32x4_t prod = vcvtq_f32_s32(vaddq_s32(b1, b2));
acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
}
}
template <typename Q8>
inline void accum_mins_16(const int16x8x2_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
auto q8s = q8.load_bsums(iy, i);
int32x4_t b1 = vmull_s16(vget_low_s16 (mins.val[0]), vget_low_s16 (q8s.val[0]));
int32x4_t b2 = vmull_s16(vget_high_s16(mins.val[0]), vget_high_s16(q8s.val[0]));
int32x4_t b3 = vmull_s16(vget_low_s16 (mins.val[1]), vget_low_s16 (q8s.val[1]));
int32x4_t b4 = vmull_s16(vget_high_s16(mins.val[1]), vget_high_s16(q8s.val[1]));
float32x4_t prod = vcvtq_f32_s32(vaddq_s32(vaddq_s32(b1, b2), vaddq_s32(b3, b4)));
acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
}
}
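// accum_mins_8/16 above fold the per-sub-block mins (times the q8 block sums)
// straight into the float accumulators, so the integer dot product only has
// to handle the scale part. Q2bits below extracts the four 2-bit planes of 32
// packed bytes into b1/b2, and HighBit5/HighBit3 OR the extra high bit from
// the separately stored qh/hmask bytes into already unpacked 4-bit (Q5_K) or
// 2-bit (Q3_K) quants, shifting the high-bit register down by 4 after the
// first half of the super-block.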
struct Q2bits {
const uint8x16_t m4b = vdupq_n_u8(0x03);
uint8x16x4_t b1, b2;
inline void prepare(const uint8_t * qs) {
auto q2bits = vld1q_u8_x2(qs);
b1.val[0] = vandq_u8(q2bits.val[0], m4b);
b1.val[1] = vandq_u8(q2bits.val[1], m4b);
q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
b1.val[2] = vandq_u8(q2bits.val[0], m4b);
b1.val[3] = vandq_u8(q2bits.val[1], m4b);
q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
b2.val[0] = vandq_u8(q2bits.val[0], m4b);
b2.val[1] = vandq_u8(q2bits.val[1], m4b);
q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
b2.val[2] = vandq_u8(q2bits.val[0], m4b);
b2.val[3] = vandq_u8(q2bits.val[1], m4b);
}
};
struct HighBit5 {
const uint8x16_t mhb = vdupq_n_u8(0x10);
uint8x16x2_t bits;
inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 4), mhb));
b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 4), mhb));
b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 3), mhb));
b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 3), mhb));
b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));
if (do_shift) {
bits.val[0] = vshrq_n_u8(bits.val[0], 4);
bits.val[1] = vshrq_n_u8(bits.val[1], 4);
}
}
};
struct HighBit3 {
const uint8x16_t mhb = vdupq_n_u8(0x04);
uint8x16x2_t bits;
inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));
b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(bits.val[0], mhb));
b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(bits.val[1], mhb));
b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshrq_n_u8(bits.val[0], 1), mhb));
b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshrq_n_u8(bits.val[1], 1), mhb));
if (do_shift) {
bits.val[0] = vshrq_n_u8(bits.val[0], 4);
bits.val[1] = vshrq_n_u8(bits.val[1], 4);
}
}
};
struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
DequantizerQ5K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
constexpr static int num_blocks() { return 8; }
constexpr static bool should_scale_quants() { return false; }
template <typename Q8>
inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
d = GGML_FP16_TO_FP32(x[i].d);
h.bits = vld1q_u8_x2(x[i].qh);
return s8.process_scales_mins(x[i], q8, i, acc);
}
inline void prepare(int i, int j) {
bits.prepare(x[i].qs+64*j);
h.apply(bits.b1, bits.b2, j == 0);
}
Q4bits bits;
HighBit5 h;
Scales8 s8;
uint8x16x2_t hbits;
float d;
};
inline int32x4x4_t make_wider(const int16x8x2_t& scales16) {
int32x4x4_t scales = {
vmovl_s16(vget_low_s16 (scales16.val[0])),
vmovl_s16(vget_high_s16(scales16.val[0])),
vmovl_s16(vget_low_s16 (scales16.val[1])),
vmovl_s16(vget_high_s16(scales16.val[1])),
};
return scales;
}
template <typename Q8>
inline int32x4x4_t process_scales_mins_16(const int8x16_t& scales8, const Q8& q8, float32x4_t * acc, int i, float c) {
int16x8x2_t scales16;
scales16.val[0] = vmovl_s8(vget_low_s8(scales8));
scales16.val[1] = vmovl_s8(vget_high_s8(scales8));
accum_mins_16(scales16, q8, acc, i, c);
return make_wider(scales16);
}
struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
DequantizerQ3K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
constexpr static int num_blocks() { return 16; }
constexpr static bool should_scale_quants() { return false; }
template <typename Q8>
inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
d = GGML_FP16_TO_FP32(x[i].d);
h.bits = vld1q_u8_x2(x[i].hmask);
const uint16_t * sc16 = (const uint16_t *)x[i].scales;
uint32_t aux0 = sc16[0] | (sc16[1] << 16);
uint32_t aux1 = sc16[2] | (sc16[3] << 16);
uint32_t aux2 = sc16[4] | (sc16[5] << 16);
aux32[0] = (aux0 & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030);
aux32[1] = (aux1 & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030);
aux32[2] = ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030);
aux32[3] = ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030);
return process_scales_mins_16(vaddq_s8(vld1q_s8((const int8_t *)aux32), vdupq_n_s8(-32)), q8, acc, i, -4.f*d);
}
inline void prepare(int i, int j) {
bits.prepare(x[i].qs+32*j);
h.apply(bits.b1, bits.b2, j == 0);
}
uint32_t aux32[4];
Q2bits bits;
HighBit3 h;
float d;
};
struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
DequantizerQ2K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
constexpr static int num_blocks() { return 16; }
constexpr static bool should_scale_quants() { return true; }
template <typename Q8>
inline void process_scales(int i, const Q8& q8, float32x4_t * acc) {
d = GGML_FP16_TO_FP32(x[i].d);
auto scales_and_mins = vld1q_u8(x[i].scales);
auto mins8 = vreinterpretq_s8_u8(vshrq_n_u8(scales_and_mins, 4));
int16x8x2_t scales16;
scales16.val[0] = vmovl_s8(vget_low_s8(mins8));
scales16.val[1] = vmovl_s8(vget_high_s8(mins8));
accum_mins_16(scales16, q8, acc, i, -GGML_FP16_TO_FP32(x[i].dmin));
scales8 = vandq_u8(scales_and_mins, vdupq_n_u8(0xf));
}
template <typename Q8>
inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
process_scales(i, q8, acc);
int16x8x2_t scales16;
scales16.val[0] = vmovl_s8(vget_low_s8(vreinterpretq_s8_u8(scales8)));
scales16.val[1] = vmovl_s8(vget_high_s8(vreinterpretq_s8_u8(scales8)));
return make_wider(scales16);
}
template <typename Q8>
inline void compute(const Q8& q8, int i, int j, int32x4_t * sumi) {
auto m1 = vdupq_n_u8(1);
auto shuffle = vdupq_n_u8(8*j);
bits.b1.val[0] = vmulq_u8(bits.b1.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
bits.b1.val[1] = vmulq_u8(bits.b1.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
bits.b1.val[2] = vmulq_u8(bits.b1.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
bits.b1.val[3] = vmulq_u8(bits.b1.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
bits.b2.val[0] = vmulq_u8(bits.b2.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
bits.b2.val[1] = vmulq_u8(bits.b2.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
bits.b2.val[2] = vmulq_u8(bits.b2.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
bits.b2.val[3] = vmulq_u8(bits.b2.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]),
vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]);
auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]),
vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]);
auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_3.val[0]),
vreinterpretq_s8_u8(bits.b2.val[1]), q8b_3.val[1]);
auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_4.val[0]),
vreinterpretq_s8_u8(bits.b2.val[3]), q8b_4.val[1]);
}
}
inline void prepare(int i, int j) {
bits.prepare(x[i].qs+32*j);
}
uint32_t aux32[4];
uint8x16_t scales8;
Q2bits bits;
float d;
};
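// Fused single-column (GEMV) kernel for Q6_K x Q8_K. One call processes a
// full 256-quant super-block: the q8 block sums are folded in first with
// c1 = -32*d*d8, which accounts for the -32 offset of Q6_K quants so the main
// sdot loop can work directly on the unpacked 6-bit values (0..63); the two
// 128-quant halves are then reassembled from ql/qh with vsri/vsli bit-field
// inserts and accumulated with the widened per-16 sub-block scales.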
IQK_ALWAYS_INLINE void fusion_mul_mat_qX_K_q8_K_T_y1_d6k(
float32x4_t &acc,
const uint8_t *x_ql, // [128] 4bit
const uint8_t *x_qh, // [64] 2bit
const int8_t *x_scale, // [16] 8bit
float x_d,
const int8_t *y_qs, // [256] 8bit
const int16_t *y_bsums, // [16] 16bit
float y_d)
{
float c0 = x_d * y_d;
float c1 = -32.0f * c0;
const int OFFSET = 1024;
__builtin_prefetch((x_ql + OFFSET + 0*64), 0, 0);
__builtin_prefetch((x_ql + OFFSET + 1*64), 0, 0);
__builtin_prefetch((x_ql + OFFSET + 2*64), 0, 0);
int16x8_t scale16_0, scale16_1;
{
int8x16_t tmp = vld1q_s8(x_scale);
scale16_0 = vmovl_s8(vget_low_s8(tmp));
scale16_1 = vmovl_high_s8(tmp);
}
{
int16x8_t q8s0 = vld1q_s16(y_bsums + 0);
int16x8_t q8s1 = vld1q_s16(y_bsums + 8);
int32x4_t b0 = vmull_s16(vget_low_s16(scale16_0), vget_low_s16(q8s0));
b0 = vmlal_high_s16(b0, scale16_0, q8s0);
b0 = vmlal_s16(b0, vget_low_s16(scale16_1), vget_low_s16(q8s1));
b0 = vmlal_high_s16(b0, scale16_1, q8s1);
acc = vfmaq_n_f32(acc, vcvtq_f32_s32(b0), c1);
}
uint8x16_t x0, x1, x2, x3, x4, x5, x6, x7;
int32x4_t sumi = vdupq_n_s32(0);
{
const uint8x16_t m0 = vdupq_n_u8(0x3f);
const uint8x16_t m1 = vdupq_n_u8(0x30);
const uint8x16_t m2 = vdupq_n_u8(0x0f);
x0 = vld1q_u8(x_ql + 0*16 + 0*64);
x1 = vld1q_u8(x_ql + 1*16 + 0*64);
x2 = vld1q_u8(x_ql + 2*16 + 0*64);
x3 = vld1q_u8(x_ql + 3*16 + 0*64);
uint8x16_t hbits0 = vld1q_u8(x_qh + 0*16 + 0*32);
uint8x16_t hbits1 = vld1q_u8(x_qh + 1*16 + 0*32);
x4 = vandq_u8(hbits0, m0);
x4 = vsriq_n_u8(x4, x0, 4);
x5 = vandq_u8(hbits1, m0);
x5 = vsriq_n_u8(x5, x1, 4);
x6 = vshrq_n_u8(hbits0, 2);
x6 = vsriq_n_u8(x6, x2, 4);
x7 = vshrq_n_u8(hbits1, 2);
x7 = vsriq_n_u8(x7, x3, 4);
x0 = vsliq_n_u8(x0, hbits0, 4);
x0 = vandq_u8(x0, m0);
x1 = vsliq_n_u8(x1, hbits1, 4);
x1 = vandq_u8(x1, m0);
hbits0 = vshlq_n_u8(hbits0, 2);
hbits0 = vandq_u8(hbits0, m1);
x2 = vandq_u8(x2, m2);
x2 = vorrq_u8(x2, hbits0);
hbits1 = vshlq_n_u8(hbits1, 2);
hbits1 = vandq_u8(hbits1, m1);
x3 = vandq_u8(x3, m2);
x3 = vorrq_u8(x3, hbits1);
}
{
// Note: the -32 offset is not subtracted here; it is folded into the bsums correction above (c1 = -32 * c0).
int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 0*128);
int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 0*128);
int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 0*128);
int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 0*128);
int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 0*128);
int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 0*128);
int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 0*128);
int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 0*128);
int32x4_t p00 = vdupq_n_s32(0);
int32x4_t p01 = vdupq_n_s32(0);
int32x4_t p10 = vdupq_n_s32(0);
int32x4_t p11 = vdupq_n_s32(0);
int32x4_t p20 = vdupq_n_s32(0);
int32x4_t p21 = vdupq_n_s32(0);
int32x4_t p30 = vdupq_n_s32(0);
int32x4_t p31 = vdupq_n_s32(0);
p00 = vdotq_s32(p00, vreinterpretq_s8_u8(x0), y0);
p01 = vdotq_s32(p01, vreinterpretq_s8_u8(x1), y1);
p10 = vdotq_s32(p10, vreinterpretq_s8_u8(x2), y2);
p11 = vdotq_s32(p11, vreinterpretq_s8_u8(x3), y3);
p20 = vdotq_s32(p20, vreinterpretq_s8_u8(x4), y4);
p21 = vdotq_s32(p21, vreinterpretq_s8_u8(x5), y5);
p30 = vdotq_s32(p30, vreinterpretq_s8_u8(x6), y6);
p31 = vdotq_s32(p31, vreinterpretq_s8_u8(x7), y7);
p00 = vpaddq_s32(p00, p01);
p10 = vpaddq_s32(p10, p11);
p20 = vpaddq_s32(p20, p21);
p30 = vpaddq_s32(p30, p31);
p00 = vpaddq_s32(p00, p10);
p20 = vpaddq_s32(p20, p30);
sumi = vmlaq_s32(sumi, vmovl_s16(vget_low_s16(scale16_0)), p00);
sumi = vmlaq_s32(sumi, vmovl_high_s16(scale16_0), p20);
}
{
const uint8x16_t m0 = vdupq_n_u8(0x3f);
const uint8x16_t m1 = vdupq_n_u8(0x30);
const uint8x16_t m2 = vdupq_n_u8(0x0f);
x0 = vld1q_u8(x_ql + 0*16 + 1*64);
x1 = vld1q_u8(x_ql + 1*16 + 1*64);
x2 = vld1q_u8(x_ql + 2*16 + 1*64);
x3 = vld1q_u8(x_ql + 3*16 + 1*64);
uint8x16_t hbits0 = vld1q_u8(x_qh + 0*16 + 1*32);
uint8x16_t hbits1 = vld1q_u8(x_qh + 1*16 + 1*32);
x4 = vandq_u8(hbits0, m0);
x4 = vsriq_n_u8(x4, x0, 4);
x5 = vandq_u8(hbits1, m0);
x5 = vsriq_n_u8(x5, x1, 4);
x6 = vshrq_n_u8(hbits0, 2);
x6 = vsriq_n_u8(x6, x2, 4);
x7 = vshrq_n_u8(hbits1, 2);
x7 = vsriq_n_u8(x7, x3, 4);
x0 = vsliq_n_u8(x0, hbits0, 4);
x0 = vandq_u8(x0, m0);
x1 = vsliq_n_u8(x1, hbits1, 4);
x1 = vandq_u8(x1, m0);
hbits0 = vshlq_n_u8(hbits0, 2);
hbits0 = vandq_u8(hbits0, m1);
x2 = vandq_u8(x2, m2);
x2 = vorrq_u8(x2, hbits0);
hbits1 = vshlq_n_u8(hbits1, 2);
hbits1 = vandq_u8(hbits1, m1);
x3 = vandq_u8(x3, m2);
x3 = vorrq_u8(x3, hbits1);
}
{
// As above, the -32 offset is handled via the bsums correction.
int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 1*128);
int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 1*128);
int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 1*128);
int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 1*128);
int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 1*128);
int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 1*128);
int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 1*128);
int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 1*128);
int32x4_t p00 = vdupq_n_s32(0);
int32x4_t p01 = vdupq_n_s32(0);
int32x4_t p10 = vdupq_n_s32(0);
int32x4_t p11 = vdupq_n_s32(0);
int32x4_t p20 = vdupq_n_s32(0);
int32x4_t p21 = vdupq_n_s32(0);
int32x4_t p30 = vdupq_n_s32(0);
int32x4_t p31 = vdupq_n_s32(0);
p00 = vdotq_s32(p00, vreinterpretq_s8_u8(x0), y0);
p01 = vdotq_s32(p01, vreinterpretq_s8_u8(x1), y1);
p10 = vdotq_s32(p10, vreinterpretq_s8_u8(x2), y2);
p11 = vdotq_s32(p11, vreinterpretq_s8_u8(x3), y3);
p20 = vdotq_s32(p20, vreinterpretq_s8_u8(x4), y4);
p21 = vdotq_s32(p21, vreinterpretq_s8_u8(x5), y5);
p30 = vdotq_s32(p30, vreinterpretq_s8_u8(x6), y6);
p31 = vdotq_s32(p31, vreinterpretq_s8_u8(x7), y7);
p00 = vpaddq_s32(p00, p01);
p10 = vpaddq_s32(p10, p11);
p20 = vpaddq_s32(p20, p21);
p30 = vpaddq_s32(p30, p31);
p00 = vpaddq_s32(p00, p10);
p20 = vpaddq_s32(p20, p30);
sumi = vmlaq_s32(sumi, vmovl_s16(vget_low_s16(scale16_1)), p00);
sumi = vmlaq_s32(sumi, vmovl_high_s16(scale16_1), p20);
}
{
acc = vfmaq_n_f32(acc, vcvtq_f32_s32(sumi), c0);
}
return;
}
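// Fused single-column (GEMV) kernel for Q4_K x Q8_K. The packed 6-bit
// scales/mins are expanded with make_q4_scales; the min contribution is
// applied through the q8 block sums with c1 = -dmin*d8 (the 16 bsums are
// first paired down to 8 to match the 8 sub-block mins), and the 4-bit quants
// are split into low/high nibbles and accumulated with sdot against the q8
// quants.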
IQK_ALWAYS_INLINE void fusion_mul_mat_qX_K_q8_K_T_y1_d4k(
float32x4_t &acc,
const uint8_t *x_scale,
const uint8_t *x_qs,
float x_d,
float x_dmin,
const int8_t *y_qs,
const int16_t *y_bsums,
float y_d)
{
float c0 = x_d * y_d;
float c1 = -x_dmin * y_d;
const int OFFSET = 1024;
__builtin_prefetch((x_scale + OFFSET + 0*64), 0, 0);
__builtin_prefetch((x_scale + OFFSET + 1*64), 0, 0);
int16x8_t scale_min;
int16x8_t scale;
{
uint32_t utmp[4];
const uint8_t * sc8 = (const uint8_t *)utmp;
make_q4_scales(x_scale, utmp);
int8x16_t ss = vld1q_s8((const int8_t *)sc8);
scale = vmovl_s8(vget_low_s8(ss));
scale_min = vmovl_high_s8(ss);
}
{
int16x8_t q8s0 = vld1q_s16(y_bsums + 0);
int16x8_t q8s1 = vld1q_s16(y_bsums + 8);
q8s0 = vpaddq_s16(q8s0, q8s1);
int32x4_t b0 = vmull_s16(vget_low_s16(scale_min), vget_low_s16(q8s0));
b0 = vmlal_high_s16(b0, scale_min, q8s0);
acc = vfmaq_n_f32(acc, vcvtq_f32_s32(b0), c1);
}
int32x4_t sumi = vdupq_n_s32(0);
const uint8x16_t m4b = vdupq_n_u8(0x0f);
uint8x16_t x0, x1, x2, x3, x4, x5, x6, x7;
{
x0 = vld1q_u8(x_qs + 0*16 + 0*64);
x1 = vld1q_u8(x_qs + 1*16 + 0*64);
x4 = vld1q_u8(x_qs + 2*16 + 0*64);
x5 = vld1q_u8(x_qs + 3*16 + 0*64);
x2 = vshrq_n_u8(x0, 4);
x3 = vshrq_n_u8(x1, 4);
x6 = vshrq_n_u8(x4, 4);
x7 = vshrq_n_u8(x5, 4);
x0 = vandq_u8(x0, m4b);
x1 = vandq_u8(x1, m4b);
x4 = vandq_u8(x4, m4b);
x5 = vandq_u8(x5, m4b);
}
{
int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 0*128);
int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 0*128);
int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 0*128);
int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 0*128);
int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 0*128);
int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 0*128);
int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 0*128);
int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 0*128);
int32x4_t p0 = vdupq_n_s32(0);
int32x4_t p1 = vdupq_n_s32(0);
int32x4_t p2 = vdupq_n_s32(0);
int32x4_t p3 = vdupq_n_s32(0);
p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x0), y0);
p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x2), y2);
p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x4), y4);
p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x6), y6);
p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x1), y1);
p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x3), y3);
p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x5), y5);
p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x7), y7);
p0 = vpaddq_s32(p0, p1);
p2 = vpaddq_s32(p2, p3);
p0 = vpaddq_s32(p0, p2);
sumi = vmlaq_s32(sumi, vmovl_s16(vget_low_s16(scale)), p0);
}
{
x0 = vld1q_u8(x_qs + 0*16 + 1*64);
x1 = vld1q_u8(x_qs + 1*16 + 1*64);
x4 = vld1q_u8(x_qs + 2*16 + 1*64);
x5 = vld1q_u8(x_qs + 3*16 + 1*64);
x2 = vshrq_n_u8(x0, 4);
x3 = vshrq_n_u8(x1, 4);
x6 = vshrq_n_u8(x4, 4);
x7 = vshrq_n_u8(x5, 4);
x0 = vandq_u8(x0, m4b);
x1 = vandq_u8(x1, m4b);
x4 = vandq_u8(x4, m4b);
x5 = vandq_u8(x5, m4b);
}
{
int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 1*128);
int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 1*128);
int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 1*128);
int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 1*128);
int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 1*128);
int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 1*128);
int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 1*128);
int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 1*128);
int32x4_t p0 = vdupq_n_s32(0);
int32x4_t p1 = vdupq_n_s32(0);
int32x4_t p2 = vdupq_n_s32(0);
int32x4_t p3 = vdupq_n_s32(0);
p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x0), y0);
p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x2), y2);
p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x4), y4);
p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x6), y6);
p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x1), y1);
p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x3), y3);
p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x5), y5);
p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x7), y7);
p0 = vpaddq_s32(p0, p1);
p2 = vpaddq_s32(p2, p3);
p0 = vpaddq_s32(p0, p2);
sumi = vmlaq_s32(sumi, vmovl_high_s16(scale), p0);
}
{
acc = vfmaq_n_f32(acc, vcvtq_f32_s32(sumi), c0);
}
}
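// Generic k-quant x Q8_K kernel (one row of A at a time, nrc_y q8 columns):
// for nrc_y == 1 it dispatches to the fused Q6_K/Q4_K kernels above (guarded
// by GEMV_Q6K/GEMV_Q4K); otherwise each super-block is unpacked with the
// Dequantizer and reduced with compute_8_blocks/compute_16_blocks.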
template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
Q8<nrc_y, block_q8_K> q8(info);
Dequantizer deq(vx, bx, nrc_y);
for (int ix = 0; ix < nrc_x; ++ix) {
deq.new_row(ix);
float32x4_t acc[nrc_y];
for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);
//#pragma GCC unroll 4
for (int i = 0; i < nb; ++i) {
#ifdef GEMV_Q6K
if constexpr (nrc_y == 1 && std::is_same<Dequantizer, DequantizerQ6K>::value) {
fusion_mul_mat_qX_K_q8_K_T_y1_d6k(
acc[0],
deq.x[i].ql,
deq.x[i].qh,
deq.x[i].scales,
GGML_FP16_TO_FP32(deq.x[i].d),
q8.y[0][i].qs,
q8.y[0][i].bsums,
q8.y[0][i].d);
} else
#endif
#ifdef GEMV_Q4K
if constexpr (nrc_y == 1 && std::is_same<Dequantizer, DequantizerQ4K>::value) {
fusion_mul_mat_qX_K_q8_K_T_y1_d4k(
acc[0],
deq.x[i].scales,
deq.x[i].qs,
GGML_FP16_TO_FP32(deq.x[i].d),
GGML_FP16_TO_FP32(deq.x[i].dmin),
q8.y[0][i].qs,
q8.y[0][i].bsums,
q8.y[0][i].d);
} else
#endif
{
int32x4_t sumi[nrc_y];
for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);
if constexpr (nrc_y > 1 && Dequantizer::should_scale_quants()) {
deq.process_scales(i, q8, acc);
deq.prepare(i, 0);
deq.compute(q8, i, 0, sumi);
deq.prepare(i, 1);
deq.compute(q8, i, 1, sumi);
} else {
if constexpr (Dequantizer::num_blocks() == 8) {
auto scales = deq.new_block(i, q8, acc);
deq.prepare(i, 0);
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
deq.prepare(i, 1);
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
}
else if constexpr (Dequantizer::num_blocks() == 16) {
auto scales = deq.new_block(i, q8, acc);
deq.prepare(i, 0);
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
deq.prepare(i, 1);
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
}
else {
GGML_ASSERT(false);
}
}
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) {
acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
}
}
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, vaddvq_f32(acc[iy]));
}
}
}
}
// ============================= i-quants
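// IQ4_XS: the quants are 4-bit indices into the non-linear iq4nl_values table
// (applied with vqtbl1q_s8 in prepare()), with 6-bit block scales split
// between scales_l and scales_h and offset by -32.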
struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
static int8x16_t load_values() {
static const int8_t iq4nl_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
return vld1q_s8(iq4nl_values);
}
DequantizerIQ4XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), values(load_values()) {}
constexpr static int num_blocks() { return 8; }
constexpr static bool should_scale_quants() { return false; }
inline void new_row(int ix) { x = (const block_iq4_xs *)((const char *)vx + bx*ix); }
template <typename Q8>
inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
(void)q8;
(void)acc;
d = GGML_FP16_TO_FP32(x[i].d);
const uint16_t scales_h = x[i].scales_h;
const uint16_t * scales_l = (const uint16_t *)x[i].scales_l;
aux32[0] = scales_l[0] | (scales_l[1] << 16);
aux32[1] = aux32[0] >> 4;
// scl is ordered as 0, 2, 4, 6, 1, 3, 5, 7
uint8x8_t scl8 = vand_u8(vld1_u8((const uint8_t *)aux32), vdup_n_u8(0xf));
uint16_t * aux16 = (uint16_t *)aux32;
aux16[0] = scales_h << 4; aux16[1] = scales_h << 2; aux16[2] = scales_h; aux16[3] = scales_h >> 2;
// sch is ordered as 0, 4, 1, 5, 2, 6, 3, 7
uint8x8_t sch8 = vand_u8(vld1_u8((const uint8_t *)aux16), vdup_n_u8(0x30));
int8x8_t scales8 = vadd_s8(vreinterpret_s8_u8(vorr_u8(scl8, vtbl1_u8(sch8, vreinterpret_u8_u32(hshuff)))), vdup_n_s8(-32));
// shuffle 0, 2, 4, 6, 1, 3, 5, 7 -> 0, 1, 2, 3, 4, 5, 6, 7
scales8 = vtbl1_s8(scales8, vreinterpret_s8_u32(hshuff));
int16x8_t scales16 = vmovl_s8(scales8);
int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))};
return scales;
}
inline void prepare(int i, int j) {
bits.prepare16(x[i].qs+64*j);
for (int k = 0; k < 4; ++k) {
bits.b1.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b1.val[k]));
bits.b2.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b2.val[k]));
}
}
Q4bits bits;
const int8x16_t values;
uint32_t aux32[2];
constexpr static uint32x2_t hshuff = {0x05010400, 0x07030602};
float d;
};
struct SimpleBits {
uint8x16x4_t b1;
uint8x16x4_t b2;
};
IQK_ALWAYS_INLINE int32x4x2_t prepare_scales_8(const uint32x4_t& v1, const uint32x4_t& v2) {
int32x4x2_t scales;
auto one = vdupq_n_u32(1);
scales.val[0] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v1, 28), 1));
scales.val[1] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v2, 28), 1));
return scales;
}
inline void apply_signs_2(uint8x16_t * b, const uint64_t * signs, uint32_t sidx) {
auto s1 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >> 0) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >> 7) & 127))));
auto s2 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >>14) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >>21) & 127))));
b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s1));
b[1] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[1]), s2));
}
IQK_ALWAYS_INLINE int32x4_t prepare_scales_8(const uint32x4_t& v1) {
return vreinterpretq_s32_u32(vsliq_n_u32(vdupq_n_u32(1), vshrq_n_u32(v1, 28), 1));
}
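// IQ2_XXS (and IQ3_XXS) keep a 4-bit scale in the top bits of each 32-bit
// signs word: prepare_scales_8 turns it into the odd integer 2*s+1, and
// apply_signs_2 flips the signs of the grid values using the keven_signs
// lookup indexed by 7-bit groups of the signs word.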
struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
DequantizerIQ2XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
IQK_ALWAYS_INLINE float new_block(int i) const { return 0.125f * GGML_FP16_TO_FP32(x[i].d); }
inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
auto data = vld1q_u32_x2((const uint32_t *)(x[i].qs + 16*j));
prepare_all(data, q);
return prepare_scales_8(vuzp2q_u32(data.val[0], data.val[1]));
}
private:
static inline void prepare2(uint8x16_t * b, const uint32_t * bits, const uint64_t * signs) {
const uint8_t * idx = (const uint8_t *)bits;
b[0] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[0]], iq2xxs_grid[idx[1]]});
b[1] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[2]], iq2xxs_grid[idx[3]]});
apply_signs_2(b, signs, bits[1]);
}
inline static void prepare_all(const uint32x4x2_t& data, uint8x16_t * quants) {
const uint32_t * q2 = (const uint32_t *)data.val;
prepare2(quants+0, q2+0, keven_signs);
prepare2(quants+2, q2+2, keven_signs);
prepare2(quants+4, q2+4, keven_signs);
prepare2(quants+6, q2+6, keven_signs);
}
};
inline int32x4x4_t prepare_4bit_scales16(const uint8_t * sc) {
auto aux = vld1_u8(sc);
auto scales_l = vand_u8(aux, vdup_n_u8(0xf));
auto scales_h = vshr_n_u8(aux, 4);
auto aux1 = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));
auto scales8 = vreinterpretq_s8_u8(vorrq_u8(vshlq_n_u8(aux1, 1), vdupq_n_u8(1)));
int16x8x2_t scales16 = { vmovl_s8(vget_low_s8(scales8)), vmovl_s8(vget_high_s8(scales8)) };
return make_wider(scales16);
}
struct DequantizerIQ2XS final : public BaseDequantizer<block_iq2_xs> {
DequantizerIQ2XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
constexpr static int num_blocks() { return 16; }
constexpr static bool should_scale_quants() { return false; }
SimpleBits bits;
float d;
inline int32x4x4_t new_block(int i) {
d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
prepare_internal(i, 0);
return prepare_4bit_scales16(x[i].scales);
}
inline void prepare(int i, int j) {
if (j == 1) prepare_internal(i, 1);
}
private:
static void make2(const uint16_t * qs, uint8x16_t * b) {
auto v1 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[0] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[1] & 511))));
auto v2 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[2] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[3] & 511))));
auto s1 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[0] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[1] >> 9))));
auto s2 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[2] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[3] >> 9))));
b[0] = vreinterpretq_u8_s8(vmulq_s8(v1, s1));
b[1] = vreinterpretq_u8_s8(vmulq_s8(v2, s2));
}
inline static void make4(const uint16_t * qs, uint8x16_t * b) {
make2(qs + 0, b + 0);
make2(qs + 4, b + 2);
}
IQK_ALWAYS_INLINE void prepare_internal(int i, int j) {
make4(x[i].qs + 16*j + 0, bits.b1.val);
make4(x[i].qs + 16*j + 8, bits.b2.val);
}
};
// So, I hate to include this table, but with the GCC 12.3 compiler
// bundled in the Cosmopolitan tools, loading the unpacked sign bytes
// from this table using the packed 8 sign bits as index is faster than
// using the standard trick of vceqq_u8(vandq_u8(bits, mask), mask) to
// expand the bits to bytes.
static const uint64_t kall_signs[256] = {
0x0101010101010101, 0x01010101010101ff, 0x010101010101ff01, 0x010101010101ffff,
0x0101010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0x0101010101ffffff,
0x01010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0x01010101ff01ffff,
0x01010101ffff0101, 0x01010101ffff01ff, 0x01010101ffffff01, 0x01010101ffffffff,
0x010101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0x010101ff0101ffff,
0x010101ff01ff0101, 0x010101ff01ff01ff, 0x010101ff01ffff01, 0x010101ff01ffffff,
0x010101ffff010101, 0x010101ffff0101ff, 0x010101ffff01ff01, 0x010101ffff01ffff,
0x010101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0x010101ffffffffff,
0x0101ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0x0101ff010101ffff,
0x0101ff0101ff0101, 0x0101ff0101ff01ff, 0x0101ff0101ffff01, 0x0101ff0101ffffff,
0x0101ff01ff010101, 0x0101ff01ff0101ff, 0x0101ff01ff01ff01, 0x0101ff01ff01ffff,
0x0101ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0x0101ff01ffffffff,
0x0101ffff01010101, 0x0101ffff010101ff, 0x0101ffff0101ff01, 0x0101ffff0101ffff,
0x0101ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0x0101ffff01ffffff,
0x0101ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0x0101ffffff01ffff,
0x0101ffffffff0101, 0x0101ffffffff01ff, 0x0101ffffffffff01, 0x0101ffffffffffff,
0x01ff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0x01ff01010101ffff,
0x01ff010101ff0101, 0x01ff010101ff01ff, 0x01ff010101ffff01, 0x01ff010101ffffff,
0x01ff0101ff010101, 0x01ff0101ff0101ff, 0x01ff0101ff01ff01, 0x01ff0101ff01ffff,
0x01ff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0x01ff0101ffffffff,
0x01ff01ff01010101, 0x01ff01ff010101ff, 0x01ff01ff0101ff01, 0x01ff01ff0101ffff,
0x01ff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0x01ff01ff01ffffff,
0x01ff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0x01ff01ffff01ffff,
0x01ff01ffffff0101, 0x01ff01ffffff01ff, 0x01ff01ffffffff01, 0x01ff01ffffffffff,
0x01ffff0101010101, 0x01ffff01010101ff, 0x01ffff010101ff01, 0x01ffff010101ffff,
0x01ffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0x01ffff0101ffffff,
0x01ffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0x01ffff01ff01ffff,
0x01ffff01ffff0101, 0x01ffff01ffff01ff, 0x01ffff01ffffff01, 0x01ffff01ffffffff,
0x01ffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0x01ffffff0101ffff,
0x01ffffff01ff0101, 0x01ffffff01ff01ff, 0x01ffffff01ffff01, 0x01ffffff01ffffff,
0x01ffffffff010101, 0x01ffffffff0101ff, 0x01ffffffff01ff01, 0x01ffffffff01ffff,
0x01ffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0x01ffffffffffffff,
0xff01010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0xff0101010101ffff,
0xff01010101ff0101, 0xff01010101ff01ff, 0xff01010101ffff01, 0xff01010101ffffff,
0xff010101ff010101, 0xff010101ff0101ff, 0xff010101ff01ff01, 0xff010101ff01ffff,
0xff010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0xff010101ffffffff,
0xff0101ff01010101, 0xff0101ff010101ff, 0xff0101ff0101ff01, 0xff0101ff0101ffff,
0xff0101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0xff0101ff01ffffff,
0xff0101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0xff0101ffff01ffff,
0xff0101ffffff0101, 0xff0101ffffff01ff, 0xff0101ffffffff01, 0xff0101ffffffffff,
0xff01ff0101010101, 0xff01ff01010101ff, 0xff01ff010101ff01, 0xff01ff010101ffff,
0xff01ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0xff01ff0101ffffff,
0xff01ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0xff01ff01ff01ffff,
0xff01ff01ffff0101, 0xff01ff01ffff01ff, 0xff01ff01ffffff01, 0xff01ff01ffffffff,
0xff01ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0xff01ffff0101ffff,
0xff01ffff01ff0101, 0xff01ffff01ff01ff, 0xff01ffff01ffff01, 0xff01ffff01ffffff,
0xff01ffffff010101, 0xff01ffffff0101ff, 0xff01ffffff01ff01, 0xff01ffffff01ffff,
0xff01ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0xff01ffffffffffff,
0xffff010101010101, 0xffff0101010101ff, 0xffff01010101ff01, 0xffff01010101ffff,
0xffff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0xffff010101ffffff,
0xffff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0xffff0101ff01ffff,
0xffff0101ffff0101, 0xffff0101ffff01ff, 0xffff0101ffffff01, 0xffff0101ffffffff,
0xffff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0xffff01ff0101ffff,
0xffff01ff01ff0101, 0xffff01ff01ff01ff, 0xffff01ff01ffff01, 0xffff01ff01ffffff,
0xffff01ffff010101, 0xffff01ffff0101ff, 0xffff01ffff01ff01, 0xffff01ffff01ffff,
0xffff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0xffff01ffffffffff,
0xffffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0xffffff010101ffff,
0xffffff0101ff0101, 0xffffff0101ff01ff, 0xffffff0101ffff01, 0xffffff0101ffffff,
0xffffff01ff010101, 0xffffff01ff0101ff, 0xffffff01ff01ff01, 0xffffff01ff01ffff,
0xffffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0xffffff01ffffffff,
0xffffffff01010101, 0xffffffff010101ff, 0xffffffff0101ff01, 0xffffffff0101ffff,
0xffffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0xffffffff01ffffff,
0xffffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0xffffffffff01ffff,
0xffffffffffff0101, 0xffffffffffff01ff, 0xffffffffffffff01, 0xffffffffffffffff,
};
struct SignHelper {
IQK_ALWAYS_INLINE void apply_signs_1x(uint8x16_t * b, const uint8_t * sign_bits) const {
auto s = vreinterpretq_s8_u64(uint64x2_t{kall_signs[sign_bits[0]], kall_signs[sign_bits[1]]});
// Normally we would expect this to be faster, but it isn't.
// auto aux = vcombine_u8(vdup_n_u8(sign_bits[0]), vdup_n_u8(sign_bits[1]));
// auto s = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(aux, smask), smask), m1));
b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s));
}
// We would need these two if we weren't loading from the unpacked sign table.
//const uint8x16_t smask = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201));
//const uint8x16_t m1 = vdupq_n_u8(1);
};
struct DequantizerIQ2S final : public BaseDequantizer<block_iq2_s> {
DequantizerIQ2S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
constexpr static int num_blocks() { return 16; }
constexpr static bool should_scale_quants() { return false; }
SimpleBits bits;
float d;
inline int32x4x4_t new_block(int i) {
d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
prepare_internal(i, 0, bits);
return prepare_4bit_scales16(x[i].scales);
}
inline void prepare(int i, int j) {
if (j == 1) prepare_internal(i, 1, bits);
}
private:
static void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, uint8x16_t * b) {
uint32_t aux32[2];
const uint16_t * aux16 = (const uint16_t *)aux32;
for (int k = 0; k < 2; ++k) {
aux32[1] = (qh[k] << 4) | (qh[k] << 18);
aux32[0] = (aux32[1] << 4) & 0x03000300;
aux32[1] &= 0x03000300;
b[2*k+0] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+0] | aux16[0]))),
vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+1] | aux16[1]))));
b[2*k+1] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+2] | aux16[2]))),
vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+3] | aux16[3]))));
sh.apply_signs_1x(b+2*k+0, sign_bits); sign_bits += 2;
sh.apply_signs_1x(b+2*k+1, sign_bits); sign_bits += 2;
}
}
void prepare_internal(int i, int j, SimpleBits& sb) {
const auto * qs = x[i].qs + 16*j;
const auto * qh = x[i].qh + 4*j;
const auto * sign_bits = qs + QK_K/8;
make4(sh, sign_bits+0, qs+0, qh+0, sb.b1.val);
make4(sh, sign_bits+8, qs+8, qh+2, sb.b2.val);
}
SignHelper sh;
};
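// Note on DequantizerIQ2S::make4() above: each grid index is the 8-bit qs value OR-ed with
// two high bits taken from qh and placed at bit 8, selecting an 8-byte row of iq2s_grid;
// the sign bytes consumed by SignHelper follow the quant indices at qs + QK_K/8.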
struct DequantizerIQ3XXS final : public BaseDequantizer<block_iq3_xxs> {
DequantizerIQ3XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
IQK_ALWAYS_INLINE float new_block(int i) const { return 0.25f * GGML_FP16_TO_FP32(x[i].d); }
inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
auto q3data = vld1q_u8_x2(x[i].qs + 32*j);
auto gas = vld1q_u32((const uint32_t *)(x[i].qs + QK_K/4 + 16*j));
prepare_block((const uint8_t *)q3data.val, (const uint32_t *)&gas, q);
return prepare_scales_8(gas);
}
private:
inline static void make2(const uint8_t * q3, const uint32_t sidx, uint8x16_t * b) {
b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[0]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[3]]});
b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[4]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[7]]});
apply_signs_2(b, keven_signs, sidx);
}
inline static void prepare_block(const uint8_t * q3, const uint32_t * signs, uint8x16_t * quants) {
make2(q3+ 0, signs[0], quants + 0);
make2(q3+ 8, signs[1], quants + 2);
make2(q3+16, signs[2], quants + 4);
make2(q3+24, signs[3], quants + 6);
}
};
struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> {
DequantizerIQ3S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
constexpr static int num_blocks() { return 8; }
constexpr static bool should_scale_quants() { return false; }
SimpleBits bits;
float d;
inline int32x4x2_t new_block(int i) {
d = GGML_FP16_TO_FP32(x[i].d);
uint32_t scales32[2];
auto qs = vld1q_u8_x2(x[i].qs);
auto signs = vld1q_u8(x[i].signs);
prepare_block((const uint8_t *)qs.val, x[i].qh, (const uint8_t *)&signs);
std::memcpy(scales32, x[i].scales, 4);
scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
auto scales8 = vld1_u8((const uint8_t *)scales32); // 0, 2, 4, 6, 1, 3, 5, 7
scales8 = vtbl1_u8(scales8, vreinterpret_u8_u64(vdup_n_u64(0x0703060205010400)));
auto scales16 = vreinterpretq_s16_u16(vmovl_u8(scales8));
int32x4x2_t scales;
scales.val[0] = vmovl_s16(vget_low_s16(scales16));
scales.val[1] = vmovl_s16(vget_high_s16(scales16));
return scales;
}
inline void prepare(int i, int j) {
if (j == 1) {
auto qs = vld1q_u8_x2(x[i].qs + 32);
auto signs = vld1q_u8(x[i].signs + 16);
prepare_block((const uint8_t *)qs.val, x[i].qh + 4, (const uint8_t *)&signs);
}
}
private:
static inline void make2(const SignHelper& sh, const uint8_t * sign_bits, const uint16x8_t& idx_l, uint8_t qh,
const int16x8_t& hshift, uint8x16_t * b) {
auto vindex = vorrq_u16(idx_l, vandq_u16(vshlq_u16(vdupq_n_u16(qh), hshift), vdupq_n_u16(256)));
const uint16_t * idx = (const uint16_t *)&vindex;
b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[0]], iq3s_grid[idx[1]], iq3s_grid[idx[2]], iq3s_grid[idx[3]]});
sh.apply_signs_1x(b+0, sign_bits+0);
b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[4]], iq3s_grid[idx[5]], iq3s_grid[idx[6]], iq3s_grid[idx[7]]});
sh.apply_signs_1x(b+1, sign_bits+2);
}
static inline void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh,
const int16x8_t& hshift, uint8x16_t * b) {
auto idx_l = vld1q_u8(qs);
make2(sh, sign_bits+0, vmovl_u8(vget_low_u8 (idx_l)), qh[0], hshift, b+0);
make2(sh, sign_bits+4, vmovl_u8(vget_high_u8(idx_l)), qh[1], hshift, b+2);
}
static int16x8_t load_shift() {
static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
return vld1q_s16(k_shift);
}
inline void prepare_block(const uint8_t * qs, const uint8_t * qh, const uint8_t * sign_bits) {
auto signs = vld1q_u8(sign_bits);
auto s = (const uint8_t *)&signs;
make4(sh, s + 0, qs+ 0, qh+0, hshift, bits.b1.val);
make4(sh, s + 8, qs+16, qh+2, hshift, bits.b2.val);
}
SignHelper sh;
const int16x8_t hshift = load_shift();
};
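// Note on DequantizerIQ3S::new_block() above: each 4-bit scale s is decoded to 2*s + 1
// (e.g. s = 5 becomes 11), and the vtbl1 shuffle restores the natural sub-block order from
// the [0, 2, 4, 6, 1, 3, 5, 7] interleave produced by splitting low and high nibbles.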
template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQXXS(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
Q8<nrc_y, block_q8_K> q8(info);
Dequantizer deq(vx, bx, nrc_y);
uint8x16_t qx[8];
int32x4_t sumi[nrc_y];
float32x4_t acc[nrc_y];
for (int ix = 0; ix < nrc_x; ++ix) {
deq.new_row(ix);
for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);
for (int i = 0; i < nb; ++i) {
float d = deq.new_block(i);
auto scales = deq.unpack(i, 0, qx);
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) {
sumi[iy] = vdupq_n_s32(0);
compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 0, sumi[iy]);
}
scales = deq.unpack(i, 1, qx);
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) {
compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 1, sumi[iy]);
acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*q8.scale(iy, i)), vcvtq_f32_s32(sumi[iy]));
}
}
#pragma GCC unroll 8
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, vaddvq_f32(acc[iy]));
}
}
}
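// In mul_mat_qX_K_q8_K_IQXXS above, sumi[iy] accumulates the integer dot product of one
// super-block against column iy; it is converted to float and folded into acc[iy] once per
// super-block, scaled by the block scale d and the per-column Q8_K scale.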
// =========================================== Legacy quants
template <typename Block>
inline float16x4_t load_scales_q0(const Block * x, ggml_half * aux) {
for (int k = 0; k < 4; ++k) aux[k] = x[k].d;
return vld1_f16((const float16_t *)aux);
}
template <typename Block>
inline float16x8_t load_scales_q1(const Block * x, ggml_half * aux) {
if constexpr (std::is_same_v<Block, block_q8_1>) {
for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].s; }
} else {
for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].m; }
}
return vld1q_f16((const float16_t *)aux);
}
struct Q4LegacyBits {
template <typename Block>
inline void prepare(const Block * x) {
for (int i = 0; i < 4; ++i) {
auto q4bits = vld1q_u8(x[i].qs);
b[2*i+0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
b[2*i+1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
}
}
inline void prepare1(const uint8_t * qs, int8x16_t * q) const {
auto q4bits = vld1q_u8(qs);
q[0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
q[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
}
inline void prepare1(const uint8_t * qs) {
prepare1(qs, b);
}
const uint8x16_t m4b = vdupq_n_u8(0xf);
int8x16_t b[8];
};
// One would think this commented-out version would do better than the one below
// because it offers more opportunities to execute instructions in parallel.
// Instead, it runs significantly slower. Why? If the compiler is running out of
// vector registers, can it not simply fall back to the sequential version below on its own?
//inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
// const auto q8b_1 = vld1q_s8_x2(qs + 0);
// auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b_1.val[0]), b[1], q8b_1.val[1]);
// const auto q8b_2 = vld1q_s8_x2(qs + 32);
// auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b_2.val[0]), b[3], q8b_2.val[1]);
// auto p1234 = vpaddq_s32(p12, p34);
// const auto q8b_3 = vld1q_s8_x2(qs + 64);
// auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b_3.val[0]), b[5], q8b_3.val[1]);
// const auto q8b_4 = vld1q_s8_x2(qs + 96);
// auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b_4.val[0]), b[7], q8b_4.val[1]);
// return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
//}
inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
auto q8b = vld1q_s8_x2(qs + 0);
auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b.val[0]), b[1], q8b.val[1]);
q8b = vld1q_s8_x2(qs + 32);
auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b.val[0]), b[3], q8b.val[1]);
auto p1234 = vpaddq_s32(p12, p34);
q8b = vld1q_s8_x2(qs + 64);
auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b.val[0]), b[5], q8b.val[1]);
q8b = vld1q_s8_x2(qs + 96);
auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b.val[0]), b[7], q8b.val[1]);
return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
}
typedef struct {
ggml_half d[4];
int8_t qs[4*QK8_0];
} block_q8_0_x4;
static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");
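// block_q8_0_x4 groups four consecutive q8_0 blocks with their fp16 scales packed up front,
// so Q80::load_scales() below fetches all four scales with a single vld1_f16 and
// quant_data() returns 128 contiguous int8 quants. Hedged sketch of the repacking
// (illustrative only; the actual repacking is presumably done where the Q8 activations are
// quantized, not in this kernel), given a hypothetical array `in` of four block_q8_0:
//   block_q8_0_x4 out;
//   for (int k = 0; k < 4; ++k) {
//       out.d[k] = in[k].d;
//       std::memcpy(out.qs + k*QK8_0, in[k].qs, QK8_0);
//   }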
template <int nrc> struct Q80 {
constexpr static int nrc_y = nrc;
Q80(const DataInfo& info) {
for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_0 *)info.src1_row(iy);
}
inline const int8_t * quant_data(int iy, int i) const {
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
return y4->qs;
}
inline float16x4_t load_scales(int iy, int i) const {
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
return vld1_f16((const float16_t *)y4->d);
}
template <typename Dequantizer>
inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * /*acc*/) const {
auto qx_scales = deq.new_block(i);
for (int iy = 0; iy < nrc; ++iy) {
auto q8_scales = load_scales(iy, i);
sc16[iy] = vmul_f16(qx_scales, q8_scales);
}
}
template <typename Dequantizer>
inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
deq.prepare1(i);
float d = GGML_FP16_TO_FP32(deq.x[i].d);
for (int iy = 0; iy < nrc; ++iy) {
auto q8b = vld1q_s8_x2(y[iy][i].qs);
auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
}
}
const block_q8_0 * y[nrc_y];
};
typedef struct {
ggml_half d[8];
int8_t qs[4*QK8_1];
} block_q8_1_x4;
static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding");
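// block_q8_1_x4 is the analogous repacking for q8_1: the eight fp16 values are loaded as one
// float16x8_t in Q81::load_scales() below, whose low half multiplies the qx scales and whose
// high half carries the per-block sum/min term added directly to the accumulator in
// process_scales().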
template <int nrc> struct Q81 {
constexpr static int nrc_y = nrc;
Q81(const DataInfo& info) {
for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_1 *)info.src1_row(iy);
}
inline const int8_t * quant_data(int iy, int i) const {
const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
return y4->qs;
}
inline float16x8_t load_scales(int iy, int i) const {
const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
return vld1q_f16((const float16_t *)y4->d);
}
template <typename Dequantizer>
inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * acc) const {
auto qx_scales = deq.new_block(i);
for (int iy = 0; iy < nrc; ++iy) {
auto q8_scales = load_scales(iy, i);
auto m = vmul_f16(vget_high_f16(qx_scales), vget_high_f16(q8_scales));
acc[iy] = vaddq_f32(acc[iy], vcvt_f32_f16(m));
sc16[iy] = vmul_f16(vget_low_f16(qx_scales), vget_low_f16(q8_scales));
}
}
template <typename Dequantizer>
inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
deq.prepare1(i);
float d = GGML_FP16_TO_FP32(deq.x[i].d), m = 0.25f*GGML_FP16_TO_FP32(deq.x[i].m);
for (int iy = 0; iy < nrc; ++iy) {
auto q8b = vld1q_s8_x2(y[iy][i].qs);
auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
acc[iy] = vaddq_f32(acc[iy], vdupq_n_f32(m*GGML_FP16_TO_FP32(y[iy][i].s)));
}
}
const block_q8_1 * y[nrc_y];
};
template <typename block_q>
struct BaseLegacyDequantizer {
BaseLegacyDequantizer(const void * vx, size_t bx) : vx(vx), x(nullptr), bx(bx) {}
inline void new_row(int ix) { x = (const block_q *)((const char *)vx + bx*ix); }
Q4LegacyBits bits;
const void * vx;
const block_q * x;
size_t bx;
};
struct DequantizerQ40 final : public BaseLegacyDequantizer<block_q4_0> {
DequantizerQ40(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}
inline void prepare1(int i, int8x16_t * q) const {
bits.prepare1(x[i].qs, q);
q[0] = vaddq_s8(q[0], m8);
q[1] = vaddq_s8(q[1], m8);
}
inline void prepare1(int i) {
prepare1(i, bits.b);
}
inline float16x4_t new_block(int i) {
ggml_half aux[4];
for (int k = 0; k < 4; ++k) {
aux[k] = x[4*i+k].d;
prepare1(4*i+k, bits.b + 2*k);
}
return vld1_f16((const float16_t *)aux);
}
const int8x16_t m8 = vdupq_n_s8(-8);
//ggml_half aux[4];
};
struct DequantizerQ41 : public BaseLegacyDequantizer<block_q4_1> {
DequantizerQ41(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}
inline void prepare1(int i) {
bits.prepare1(x[i].qs);
}
inline float16x8_t new_block(int i) {
uint32_t aux32[4];
const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
for (int k = 0; k < 4; ++k) {
aux32[k] = *s32; s32 += sizeof(block_q4_1)/4;
bits.prepare1(x[4*i+k].qs, bits.b + 2*k);
}
return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
}
// Leaving this commented-out attempt here as a reminder that I already tried it.
// It has basically the same performance as the version above.
//inline float16x8_t new_block(int i) {
// uint32x4_t scales = {};
// const block_q4_1 * xi = x + 4*i;
// const uint32_t * s32 = (const uint32_t *)&xi->d;
// scales = vsetq_lane_u32(*s32, scales, 0); s32 += sizeof(block_q4_1)/4;
// bits.prepare1(xi[0].qs, bits.b + 0);
// scales = vsetq_lane_u32(*s32, scales, 1); s32 += sizeof(block_q4_1)/4;
// bits.prepare1(xi[1].qs, bits.b + 2);
// scales = vsetq_lane_u32(*s32, scales, 2); s32 += sizeof(block_q4_1)/4;
// bits.prepare1(xi[2].qs, bits.b + 4);
// scales = vsetq_lane_u32(*s32, scales, 3);
// bits.prepare1(xi[3].qs, bits.b + 6);
// return vreinterpretq_f16_u8(vqtbl1q_u8(vreinterpretq_u8_u32(scales), vreinterpretq_u8_u64(shuffle)));
//}
const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};
};
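// Note on DequantizerQ41::new_block() above: aux32[k] holds the interleaved (d, m) fp16 pair
// of block 4*i + k, and the vqtbl1q shuffle regroups the bytes into [d0 d1 d2 d3 | m0 m1 m2 m3]
// (output bytes 0,1 4,5 8,9 12,13 pick the d's; 2,3 6,7 10,11 14,15 pick the m's), matching
// what Q81::process_scales() expects.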
struct HighBit5Legacy {
inline uint8x16_t to_bytes(const uint8_t * qh) const {
uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vreinterpretq_u8_u64(mask));
}
inline uint8x16_t to_negated_bytes(const uint8_t * qh) const {
uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vdupq_n_u8(0));
}
const uint64x2_t mask = vdupq_n_u64(0x8040201008040201);
const uint8x16_t shuffle = vcombine_u8(vdup_n_u8(0), vdup_n_u8(1));
};
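// Scalar sketch of HighBit5Legacy::to_bytes() above (illustrative only, not compiled):
// the 16 high bits at qh are expanded into 16 byte masks,
//   uint16_t h = *(const uint16_t *)qh;
//   for (int j = 0; j < 16; ++j) out[j] = (h >> j) & 1 ? 0xff : 0x00;
// to_negated_bytes() produces the complementary masks (0xff where the bit is clear).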
struct DequantizerQ50 final : public BaseLegacyDequantizer<block_q5_0> {
DequantizerQ50(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}
inline void prepare1(int i, int8x16_t * q) const {
bits.prepare1(x[i].qs, q);
auto qh = x[i].qh;
q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_negated_bytes(qh+0))));
q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_negated_bytes(qh+2))));
}
inline void prepare1(int i) {
prepare1(i, bits.b);
}
inline float16x4_t new_block(int i) {
ggml_half aux[4];
for (int k = 0; k < 4; ++k) {
aux[k] = x[4*i+k].d;
prepare1(4*i+k, bits.b + 2*k);
}
return vld1_f16((const float16_t *)aux);
}
HighBit5Legacy hbits;
const uint8x16_t mh = vdupq_n_u8(0xf0);
};
struct DequantizerQ80 final : public BaseLegacyDequantizer<block_q8_0> {
DequantizerQ80(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}
inline void prepare1(int i) {
bits.b[0] = vld1q_s8(x[i].qs);
bits.b[1] = vld1q_s8(x[i].qs+16);
}
inline float16x4_t new_block(int i) {
ggml_half aux[4];
for (int k = 0; k < 4; ++k) {
aux[k] = x[4*i+k].d;
bits.b[2*k+0] = vld1q_s8(x[4*i+k].qs);
bits.b[2*k+1] = vld1q_s8(x[4*i+k].qs+16);
}
return vld1_f16((const float16_t *)aux);
}
};
struct DequantizerQ51 final : public BaseLegacyDequantizer<block_q5_1> {
DequantizerQ51(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}
inline void prepare1(int i, int8x16_t * q) const {
bits.prepare1(x[i].qs, q);
auto qh = x[i].qh;
q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_bytes(qh+0))));
q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_bytes(qh+2))));
}
inline void prepare1(int i) {
bits.prepare1(x[i].qs, bits.b);
}
inline float16x8_t new_block(int i) {
uint32_t aux32[4];
const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
for (int k = 0; k < 4; ++k) {
aux32[k] = *s32; s32 += sizeof(block_q5_1)/4;
prepare1(4*i+k, bits.b + 2*k);
}
return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
}
HighBit5Legacy hbits;
const uint8x16_t mh = vdupq_n_u8(0x10);
const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};
};
template <typename Dequantizer, typename Q8>
inline void sum_4(int i, Dequantizer& deq, const Q8& q8, const float16x4_t * sc16, float32x4_t * acc) {
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
auto pall = sum_4_blocks(deq.bits.b, q8.quant_data(iy, i));
auto scale = vcvt_f32_f16(sc16[iy]);
acc[iy] = vmlaq_f32(acc[iy], scale, vcvtq_f32_s32(pall));
}
}
template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y(int n, Dequantizer& deq, Q8& q8, const DataInfo& info, int nrc_x) {
const int nb = n / QK4_1;
float16x4_t sc16[Q8::nrc_y];
for (int ix = 0; ix < nrc_x; ++ix) {
deq.new_row(ix);
float32x4_t acc[Q8::nrc_y];
for (int iy = 0; iy < Q8::nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);
for (int i = 0; i < nb/4; ++i) {
q8.process_scales(i, deq, sc16, acc);
sum_4(i, deq, q8, sc16, acc);
}
for (int i = 4*(nb/4); i < nb; ++i) {
q8.process_1_block(i, deq, acc);
}
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
info.store(ix, iy, vaddvq_f32(acc[iy]));
}
}
}
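// Note on mul_mat_qX_Y_q8_Y above: blocks are processed in groups of four to match the
// *_x4 repacking of the Q8 side, with any leftover blocks handled one at a time via
// process_1_block(). QK4_1 is used for the block count since all legacy quants share the
// same block size of 32.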
template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y_1(int n, Dequantizer& deq1, Dequantizer& deq2, Q8& q8, const DataInfo& info, int nrc_x) {
const int nb = n / QK4_1;
float16x4_t sc16[2];
for (int ix = 0; ix < nrc_x; ++ix) {
deq1.new_row(ix);
deq2.new_row(ix);
float32x4_t acc[2] = { vdupq_n_f32(0.f), vdupq_n_f32(0.f) };
for (int i = 0; i < nb/8; ++i) {
q8.process_scales(2*i+0, deq1, sc16+0, acc+0);
q8.process_scales(2*i+1, deq2, sc16+1, acc+1);
sum_4(2*i+0, deq1, q8, sc16+0, acc+0);
sum_4(2*i+1, deq2, q8, sc16+1, acc+1);
}
for (int i = 2*(nb/8); i < nb/4; ++i) {
q8.process_scales(i, deq1, sc16, acc);
sum_4(i, deq1, q8, sc16, acc);
}
for (int i = 4*(nb/4); i < nb; ++i) {
q8.process_1_block(i, deq1, acc);
}
info.store(ix, 0, vaddvq_f32(vaddq_f32(acc[0], acc[1])));
}
}
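// Note on mul_mat_qX_Y_q8_Y_1 above: for a single Q8 column the work is split across two
// dequantizers and two independent accumulators (acc[0], acc[1]) to break the dependency
// chain, and the two partial sums are combined only at the final store.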
template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_1_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
Q81<nrc_y> q8(info);
if constexpr (nrc_y == 1) {
Dequantizer deq1(vx, bx), deq2(vx, bx);
mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
} else {
Dequantizer deq(vx, bx);
mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
}
}
template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_0_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
Q80<nrc_y> q8(info);
if constexpr (nrc_y == 1) {
Dequantizer deq1(vx, bx), deq2(vx, bx);
mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
} else {
Dequantizer deq(vx, bx);
mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
}
}
template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_1_q8_1_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
Dequantizer deq1(vx, bx), deq2(vx, bx);
Q81<1> q8(info);
mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
}
template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_0_q8_0_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
Dequantizer deq1(vx, bx), deq2(vx, bx);
Q80<1> q8(info);
mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
}
template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
if constexpr (std::is_same_v<Dequantizer, DequantizerQ40> || std::is_same_v<Dequantizer, DequantizerQ50> ||
std::is_same_v<Dequantizer, DequantizerQ80>) {
m.funcs[0] = mul_mat_qX_0_q8_0<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_0_q8_0<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_0_q8_0<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_0_q8_0<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_0_q8_0<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_0_q8_0<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_0_q8_0<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_0_q8_0<Dequantizer, 8>;
}
else if constexpr (std::is_same_v<Dequantizer, DequantizerQ41> || std::is_same_v<Dequantizer, DequantizerQ51>) {
m.funcs[0] = mul_mat_qX_1_q8_1<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_1_q8_1<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_1_q8_1<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_1_q8_1<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_1_q8_1<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_1_q8_1<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_1_q8_1<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_1_q8_1<Dequantizer, 8>;
}
else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS> || std::is_same_v<Dequantizer, DequantizerIQ3XXS>) {
m.funcs[0] = mul_mat_qX_K_q8_K_IQXXS<1, Dequantizer>;
m.funcs[1] = mul_mat_qX_K_q8_K_IQXXS<2, Dequantizer>;
m.funcs[2] = mul_mat_qX_K_q8_K_IQXXS<3, Dequantizer>;
m.funcs[3] = mul_mat_qX_K_q8_K_IQXXS<4, Dequantizer>;
m.funcs[4] = mul_mat_qX_K_q8_K_IQXXS<5, Dequantizer>;
m.funcs[5] = mul_mat_qX_K_q8_K_IQXXS<6, Dequantizer>;
m.funcs[6] = mul_mat_qX_K_q8_K_IQXXS<7, Dequantizer>;
m.funcs[7] = mul_mat_qX_K_q8_K_IQXXS<8, Dequantizer>;
}
else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2S> ||
std::is_same_v<Dequantizer, DequantizerIQ3S> ||
std::is_same_v<Dequantizer, DequantizerIQ2XS>) {
m.funcs[0] = mul_mat_qX_K_q8_K_IQ<1, Dequantizer>;
m.funcs[1] = mul_mat_qX_K_q8_K_IQ<2, Dequantizer>;
m.funcs[2] = mul_mat_qX_K_q8_K_IQ<3, Dequantizer>;
m.funcs[3] = mul_mat_qX_K_q8_K_IQ<4, Dequantizer>;
m.funcs[4] = mul_mat_qX_K_q8_K_IQ<5, Dequantizer>;
m.funcs[5] = mul_mat_qX_K_q8_K_IQ<6, Dequantizer>;
m.funcs[6] = mul_mat_qX_K_q8_K_IQ<7, Dequantizer>;
m.funcs[7] = mul_mat_qX_K_q8_K_IQ<8, Dequantizer>;
}
else {
m.funcs[0] = mul_mat_qX_K_q8_K_T<1, Dequantizer>;
m.funcs[1] = mul_mat_qX_K_q8_K_T<2, Dequantizer>;
m.funcs[2] = mul_mat_qX_K_q8_K_T<3, Dequantizer>;
m.funcs[3] = mul_mat_qX_K_q8_K_T<4, Dequantizer>;
m.funcs[4] = mul_mat_qX_K_q8_K_T<5, Dequantizer>;
m.funcs[5] = mul_mat_qX_K_q8_K_T<6, Dequantizer>;
m.funcs[6] = mul_mat_qX_K_q8_K_T<7, Dequantizer>;
m.funcs[7] = mul_mat_qX_K_q8_K_T<8, Dequantizer>;
m.funcs_v2 = mul_mat_qX_K_q8_K_T_v2<Dequantizer>;
}
}
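// m.funcs[k] is the kernel specialized for k+1 columns of the Q8 right-hand side (nrc_y = k+1).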
bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny) {
row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
(void)Ny;
// Uncommenting this would disable iqk_mul_mat for matrix x vector multiplications.
//if (Ny == 1 && (typeA == GGML_TYPE_IQ2_XXS || typeA == GGML_TYPE_IQ2_XS || typeA == GGML_TYPE_IQ2_S ||
// typeA == GGML_TYPE_IQ3_XXS || typeA == GGML_TYPE_IQ3_S)) return false;
switch (typeA) {
case GGML_TYPE_Q2_K:
MulMat::set_functions<DequantizerQ2K>(m);
break;
case GGML_TYPE_Q3_K:
MulMat::set_functions<DequantizerQ3K>(m);
break;
case GGML_TYPE_Q4_K:
MulMat::set_functions<DequantizerQ4K>(m);
break;
case GGML_TYPE_Q5_K:
MulMat::set_functions<DequantizerQ5K>(m);
break;
case GGML_TYPE_Q6_K:
MulMat::set_functions<DequantizerQ6K>(m);
break;
case GGML_TYPE_IQ4_XS:
MulMat::set_functions<DequantizerIQ4XS>(m);
break;
case GGML_TYPE_IQ3_S:
MulMat::set_functions<DequantizerIQ3S>(m);
break;
case GGML_TYPE_IQ3_XXS:
MulMat::set_functions<DequantizerIQ3XXS>(m);
break;
case GGML_TYPE_IQ2_S:
MulMat::set_functions<DequantizerIQ2S>(m);
break;
case GGML_TYPE_IQ2_XS:
MulMat::set_functions<DequantizerIQ2XS>(m);
break;
case GGML_TYPE_IQ2_XXS:
MulMat::set_functions<DequantizerIQ2XXS>(m);
break;
case GGML_TYPE_Q4_0:
MulMat::set_functions<DequantizerQ40>(m);
row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
break;
case GGML_TYPE_Q4_1:
MulMat::set_functions<DequantizerQ41>(m);
row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
break;
case GGML_TYPE_Q5_0:
MulMat::set_functions<DequantizerQ50>(m);
row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
break;
case GGML_TYPE_Q5_1:
MulMat::set_functions<DequantizerQ51>(m);
row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
break;
case GGML_TYPE_Q8_0:
MulMat::set_functions<DequantizerQ80>(m);
row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
break;
default:
return false;
}
return true;
}
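// Hedged usage sketch (illustrative only, not compiled; the actual dispatch is presumably
// handled by the iqk_mul_mat entry points, not here): after a successful set_mul_mat(),
// the kernel for ny right-hand-side columns is funcs[ny-1]:
//   MulMat mm; int row_size_q8;
//   if (MulMat::set_mul_mat(GGML_TYPE_Q4_0, ne00, mm, row_size_q8, ny)) {
//       mm.funcs[ny - 1](ne00, vx, bx, info, nrc_x);
//   }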
}
#endif // __x86_64__ or __aarch64__