mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-14 10:59:41 +00:00
Merge branch 'master' into concedo_experimental
# Conflicts: # .github/workflows/build.yml # .gitignore # CMakeLists.txt # Makefile # README.md # ci/run.sh # flake.lock # flake.nix # ggml-cuda.cu # ggml-cuda.h # scripts/get-wikitext-2.sh # tests/CMakeLists.txt
This commit is contained in:
commit
1cb8a5e955
79 changed files with 6273 additions and 2982 deletions
258
ggml-quants.c
258
ggml-quants.c
|
@ -517,6 +517,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||
quantize_row_q4_0_reference(x, y, k);
|
||||
}
|
||||
|
||||
|
||||
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
|
||||
const int qk = QK4_1;
|
||||
|
||||
|
@ -1275,7 +1276,12 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|||
}
|
||||
float sumlx = 0;
|
||||
float suml2 = 0;
|
||||
#ifdef HAVE_BUGGY_APPLE_LINKER
|
||||
// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
|
||||
for (volatile int i = 0; i < n; ++i) {
|
||||
#else
|
||||
for (int i = 0; i < n; ++i) {
|
||||
#endif
|
||||
int l = nearest_int(iscale * x[i]);
|
||||
l = MAX(-nmax, MIN(nmax-1, l));
|
||||
L[i] = l + nmax;
|
||||
|
@ -1650,7 +1656,12 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
|
|||
float max = x[0];
|
||||
float sum_w = weights ? weights[0] : x[0]*x[0];
|
||||
float sum_x = sum_w * x[0];
|
||||
#ifdef HAVE_BUGGY_APPLE_LINKER
|
||||
// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
|
||||
for (volatile int i = 1; i < n; ++i) {
|
||||
#else
|
||||
for (int i = 1; i < n; ++i) {
|
||||
#endif
|
||||
if (x[i] < min) min = x[i];
|
||||
if (x[i] > max) max = x[i];
|
||||
float w = weights ? weights[i] : x[i]*x[i];
|
||||
|
@ -1661,7 +1672,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
|
|||
min = 0;
|
||||
}
|
||||
if (max <= min) {
|
||||
for (int i = 0; i < n; ++i) L[i] = 0;
|
||||
memset(L, 0, n);
|
||||
*the_min = -min;
|
||||
return 0.f;
|
||||
}
|
||||
|
@ -1863,7 +1874,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|||
|
||||
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
||||
}
|
||||
|
@ -2182,7 +2193,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|||
|
||||
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
||||
}
|
||||
|
@ -2449,7 +2460,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|||
|
||||
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
||||
}
|
||||
|
@ -2772,7 +2783,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|||
|
||||
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
||||
}
|
||||
|
@ -3026,7 +3037,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|||
|
||||
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
||||
}
|
||||
|
@ -3041,6 +3052,197 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
|
|||
return nrow * row_size;
|
||||
}
|
||||
|
||||
// Quantizes one row of n_per_row floats into q4_0 blocks using per-element
// importance weights.
//
//   x             - input row (n_per_row values)
//   y             - output; n_per_row/QK4_0 blocks are written
//   n_per_row     - number of values in the row
//   quant_weights - importance values indexed like x; when NULL the row is
//                   passed to quantize_row_q4_0_reference() unchanged
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK4_0 == 32, "QK4_0 must be 32");

    if (!quant_weights) {
        quantize_row_q4_0_reference(x, y, n_per_row);
        return;
    }

    // mean of x^2 over the whole row; it is added to every per-element
    // x^2 term when the block weights are built below
    float row_sq_sum = 0;
    for (int i = 0; i < n_per_row; ++i) {
        row_sq_sum += x[i]*x[i];
    }
    float mean_sq = row_sq_sum/n_per_row;

    float  w[QK4_0];
    int8_t q[QK4_0];

    const int n_blocks = n_per_row/QK4_0;
    for (int b = 0; b < n_blocks; ++b) {
        const float * xs = x + QK4_0 * b;
        const float * iw = quant_weights + QK4_0 * b;
        for (int i = 0; i < QK4_0; ++i) {
            w[i] = iw[i] * sqrtf(mean_sq + xs[i]*xs[i]);
        }

        float scale = make_qx_quants(QK4_0, 8, xs, q, 1, w);
        y[b].d = GGML_FP32_TO_FP16(scale);

        // element i goes to the low nibble, element i+16 to the high nibble
        for (int i = 0; i < QK4_0/2; ++i) {
            y[b].qs[i] = q[i] | (q[i + QK4_0/2] << 4);
        }
    }
}
|
||||
|
||||
size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
|
||||
src += n_per_row;
|
||||
qrow += row_size;
|
||||
}
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
// Quantizes one row of n_per_row floats into q4_1 blocks (scale + minimum)
// using per-element importance weights.
//
//   x             - input row (n_per_row values)
//   y             - output; n_per_row/QK4_1 blocks are written
//   n_per_row     - number of values in the row
//   quant_weights - importance values indexed like x; when NULL the row is
//                   passed to quantize_row_q4_1_reference() unchanged
static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK4_1 == 32, "QK4_1 must be 32");

    if (!quant_weights) {
        quantize_row_q4_1_reference(x, y, n_per_row);
        return;
    }

    // mean of x^2 over the whole row; it is added to every per-element
    // x^2 term when the block weights are built below
    float row_sq_sum = 0;
    for (int i = 0; i < n_per_row; ++i) {
        row_sq_sum += x[i]*x[i];
    }
    float mean_sq = row_sq_sum/n_per_row;

    float   w[QK4_1];
    uint8_t q[QK4_1], q_scratch[QK4_1];

    const int n_blocks = n_per_row/QK4_1;
    for (int b = 0; b < n_blocks; ++b) {
        const float * xs = x + QK4_1 * b;
        const float * iw = quant_weights + QK4_1 * b;
        for (int i = 0; i < QK4_1; ++i) {
            w[i] = iw[i] * sqrtf(mean_sq + xs[i]*xs[i]);
        }

        float neg_min;  // written by make_qkx3_quants through the_min
        float scale = make_qkx3_quants(QK4_1, 15, xs, w, q, &neg_min, q_scratch, -0.9f, 0.05f, 36, false);
        y[b].d = GGML_FP32_TO_FP16(scale);
        y[b].m = GGML_FP32_TO_FP16(-neg_min);

        // element i goes to the low nibble, element i+16 to the high nibble
        for (int i = 0; i < QK4_1/2; ++i) {
            y[b].qs[i] = q[i] | (q[i + QK4_1/2] << 4);
        }
    }
}
|
||||
|
||||
size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
|
||||
src += n_per_row;
|
||||
qrow += row_size;
|
||||
}
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
// Quantizes one row of n_per_row floats into q5_0 blocks using per-element
// importance weights.
//
//   x             - input row (n_per_row values)
//   y             - output; n_per_row/QK5_0 blocks are written
//   n_per_row     - number of values in the row
//   quant_weights - importance values indexed like x; when NULL the row is
//                   passed to quantize_row_q5_0_reference() unchanged
static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK5_0 == 32, "QK5_0 must be 32");

    if (!quant_weights) {
        quantize_row_q5_0_reference(x, y, n_per_row);
        return;
    }

    // mean of x^2 over the whole row; it is added to every per-element
    // x^2 term when the block weights are built below
    float row_sq_sum = 0;
    for (int i = 0; i < n_per_row; ++i) {
        row_sq_sum += x[i]*x[i];
    }
    float mean_sq = row_sq_sum/n_per_row;

    float  w[QK5_0];
    int8_t q[QK5_0];

    const int n_blocks = n_per_row/QK5_0;
    for (int b = 0; b < n_blocks; ++b) {
        const float * xs = x + QK5_0 * b;
        const float * iw = quant_weights + QK5_0 * b;
        for (int i = 0; i < QK5_0; ++i) {
            w[i] = iw[i] * sqrtf(mean_sq + xs[i]*xs[i]);
        }

        float scale = make_qx_quants(QK5_0, 16, xs, q, 1, w);
        y[b].d = GGML_FP32_TO_FP16(scale);

        uint32_t high_bits = 0;
        for (int i = 0; i < QK5_0/2; ++i) {
            const uint8_t lo = q[i];
            const uint8_t hi = q[i + QK5_0/2];

            // low 4 bits of both elements share one byte of qs
            y[b].qs[i] = (lo & 0x0F) | ((hi & 0x0F) << 4);

            // bit 4 of each element is collected in high_bits:
            // element i at bit i, element i+16 at bit i+16
            high_bits |= ((lo & 0x10u) >> 4) << (i + 0);
            high_bits |= ((hi & 0x10u) >> 4) << (i + QK5_0/2);
        }

        // byte-wise copy into the block's qh field
        memcpy(&y[b].qh, &high_bits, sizeof(high_bits));
    }
}
|
||||
|
||||
size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
|
||||
src += n_per_row;
|
||||
qrow += row_size;
|
||||
}
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
// Quantizes one row of n_per_row floats into q5_1 blocks (scale + minimum)
// using per-element importance weights.
//
//   x             - input row (n_per_row values)
//   y             - output; n_per_row/QK5_1 blocks are written
//   n_per_row     - number of values in the row
//   quant_weights - importance values indexed like x; when NULL the row is
//                   passed to quantize_row_q5_1_reference() unchanged
static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
    static_assert(QK5_1 == 32, "QK5_1 must be 32");

    if (!quant_weights) {
        quantize_row_q5_1_reference(x, y, n_per_row);
        return;
    }

    float weight[QK5_1];
    uint8_t L[QK5_1], Laux[QK5_1];

    // mean of x^2 over the whole row; it is added to every per-element
    // x^2 term when the block weights are built below
    float sum_x2 = 0;
    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
    float sigma2 = sum_x2/n_per_row;

    const int nb = n_per_row/QK5_1;
    for (int ib = 0; ib < nb; ++ib) {
        const float * xb = x + QK5_1 * ib;
        const float * qw = quant_weights + QK5_1 * ib;
        for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);

        float min;  // written by make_qkx3_quants through the_min
        float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
        y[ib].d = GGML_FP32_TO_FP16(d);
        y[ib].m = GGML_FP32_TO_FP16(-min);

        uint32_t qh = 0;
        // The half-block offset is derived from QK5_1, the constant pinned by
        // the static_assert above, rather than from the unrelated QK5_0.
        for (int j = 0; j < QK5_1/2; ++j) {
            const uint8_t xi0 = L[j];
            const uint8_t xi1 = L[j + QK5_1/2];

            // low 4 bits of both elements share one byte of qs
            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);

            // get the 5-th bit and store it in qh at the right position
            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
        }

        // byte-wise copy into the block's qh field
        memcpy(&y[ib].qh, &qh, sizeof(qh));
    }
}
|
||||
|
||||
size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
|
||||
src += n_per_row;
|
||||
qrow += row_size;
|
||||
}
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
// ====================== "True" 2-bit (de)-quantization
|
||||
|
||||
static const uint64_t iq2xxs_grid[256] = {
|
||||
|
@ -8375,7 +8577,7 @@ static int iq2_compare_func(const void * left, const void * right) {
|
|||
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
|
||||
}
|
||||
|
||||
static void q2xs_init_impl(int grid_size) {
|
||||
void iq2xs_init_impl(int grid_size) {
|
||||
const int gindex = iq2_data_index(grid_size);
|
||||
if (iq2_data[gindex].grid) {
|
||||
return;
|
||||
|
@ -8530,19 +8732,7 @@ static void q2xs_init_impl(int grid_size) {
|
|||
free(dist2);
|
||||
}
|
||||
|
||||
void ggml_init_iq2_quantization(enum ggml_type type) {
|
||||
if (type == GGML_TYPE_IQ2_XXS) {
|
||||
q2xs_init_impl(256);
|
||||
}
|
||||
else if (type == GGML_TYPE_IQ2_XS) {
|
||||
q2xs_init_impl(512);
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
||||
}
|
||||
}
|
||||
|
||||
static void q2xs_deinit_impl(int grid_size) {
|
||||
void iq2xs_free_impl(int grid_size) {
|
||||
GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
|
||||
const int gindex = iq2_data_index(grid_size);
|
||||
if (iq2_data[gindex].grid) {
|
||||
|
@ -8552,18 +8742,6 @@ static void q2xs_deinit_impl(int grid_size) {
|
|||
}
|
||||
}
|
||||
|
||||
void ggml_deinit_iq2_quantization(enum ggml_type type) {
|
||||
if (type == GGML_TYPE_IQ2_XXS) {
|
||||
q2xs_deinit_impl(256);
|
||||
}
|
||||
else if (type == GGML_TYPE_IQ2_XS) {
|
||||
q2xs_deinit_impl(512);
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
||||
}
|
||||
}
|
||||
|
||||
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
||||
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
||||
int num_neighbors = neighbours[0];
|
||||
|
@ -8596,10 +8774,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|||
const int * kmap_q2xs = iq2_data[gindex].map;
|
||||
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
||||
|
||||
GGML_ASSERT(quant_weights);
|
||||
GGML_ASSERT(kgrid_q2xs);
|
||||
GGML_ASSERT(kmap_q2xs);
|
||||
GGML_ASSERT(kneighbors_q2xs);
|
||||
GGML_ASSERT(quant_weights && "missing quantization weights");
|
||||
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
|
||||
const int kMaxQ = 3;
|
||||
|
@ -8815,10 +8993,10 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|||
const int * kmap_q2xs = iq2_data[gindex].map;
|
||||
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
||||
|
||||
GGML_ASSERT(quant_weights);
|
||||
GGML_ASSERT(kmap_q2xs);
|
||||
GGML_ASSERT(kgrid_q2xs);
|
||||
GGML_ASSERT(kneighbors_q2xs);
|
||||
GGML_ASSERT(quant_weights && "missing quantization weights");
|
||||
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
|
||||
const int kMaxQ = 3;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue