mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge branch 'master' into concedo_experimental

# Conflicts:
#   Package.swift
commit ae08a49136
6 changed files with 47 additions and 27 deletions

@@ -88,7 +88,8 @@ def main():
     gguf_writer.add_embedding_length(hidden_size)
     gguf_writer.add_block_count(block_count)
     gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
-    gguf_writer.add_rope_dimension_count(hidden_size // head_count)
+    # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
+    gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
     gguf_writer.add_head_count(head_count)
     gguf_writer.add_head_count_kv(head_count_kv)
     gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
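The only functional change in this hunk halves the RoPE dimension count written to the GGUF metadata; per the referenced commit this is presumably a partial-rotary fix, i.e. the model rotates only half of each head's channels, so the stored value must be the rotation width rather than the full head size. A minimal sketch of the arithmetic, using illustrative numbers (hidden_size = 4096, head_count = 64) that are not taken from the diff:

// Illustrative only: why the extra "// 2" changes the stored rope_dimension_count.
#include <cstdio>

int main() {
    const int hidden_size = 4096;                    // assumed example value
    const int head_count  = 64;                      // assumed example value

    const int head_dim  = hidden_size / head_count;  // 64 channels per attention head
    const int n_rot_old = head_dim;                  // old metadata: rotate all 64
    const int n_rot_new = head_dim / 2;              // new metadata: rotate only 32

    std::printf("head_dim=%d old=%d new=%d\n", head_dim, n_rot_old, n_rot_new);
    return 0;
}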
@ -3821,15 +3821,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||||
/* Compute combined scale for the block */
|
/* Compute combined scale for the block */
|
||||||
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
||||||
|
|
||||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
||||||
|
|
||||||
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
||||||
const __m256i off = _mm256_set1_epi8( 8 );
|
const __m256i off = _mm256_set1_epi8( 8 );
|
||||||
bx = _mm256_sub_epi8( bx, off );
|
qx = _mm256_sub_epi8( qx, off );
|
||||||
|
|
||||||
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||||
|
|
||||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
||||||
|
|
||||||
/* Multiply q with scale and accumulate */
|
/* Multiply q with scale and accumulate */
|
||||||
acc = _mm256_fmadd_ps( d, q, acc );
|
acc = _mm256_fmadd_ps( d, q, acc );
|
||||||
|
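This and the following hunks only rename the temporaries (bx/by become qx/qy); the arithmetic is untouched. For orientation, a hedged scalar equivalent of one Q4_0 x Q8_0 block: unpack 32 4-bit quants from 16 bytes, offset them from [0..15] into [-8..7], take the integer dot product with the 32 int8 values of the Q8_0 block, and scale by the product of the two block scales. The struct layouts below are assumptions modeled on the snippet (ggml stores the scales as fp16; plain float is used here for brevity):

#include <cstdint>

struct block_q4_0_ref { float d; uint8_t qs[16]; };   // 32 x 4-bit quants + block scale
struct block_q8_0_ref { float d; int8_t  qs[32]; };   // 32 x 8-bit quants + block scale

// One block of the Q4_0 x Q8_0 dot product, mirroring what the AVX2 path computes.
float dot_q4_0_q8_0_block(const block_q4_0_ref & x, const block_q8_0_ref & y) {
    int32_t sumi = 0;
    for (int j = 0; j < 16; ++j) {
        const int v0 = (x.qs[j] & 0x0F) - 8;          // low nibble, offset into [-8..7]
        const int v1 = (x.qs[j] >>   4) - 8;          // high nibble, offset into [-8..7]
        sumi += v0 * y.qs[j] + v1 * y.qs[j + 16];
    }
    return x.d * y.d * sumi;                          // combined scale for the block
}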

@@ -4198,10 +4198,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );

         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
+        const __m256i qx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );

-        const __m256 xy = mul_sum_us8_pairs_float(bx, by);
+        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);

         // Accumulate d0*d1*x*y
 #if defined(__AVX2__)

@@ -4420,14 +4420,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         /* Compute combined scale for the block */
         const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));

-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
-        bx = _mm256_or_si256(bx, bxhi);
+        qx = _mm256_or_si256(qx, bxhi);

-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);

         /* Multiply q with scale and accumulate */
         acc = _mm256_fmadd_ps(d, q, acc);
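Beyond the rename, the distinctive part of the Q5_0 kernel is how the fifth bit is merged back in: bytes_from_bits_32 expands the 32 bits of qh into one byte each (0x00 or 0xFF), the andnot with 0xF0 turns that into 0xF0 where the high bit is clear and 0x00 where it is set, and OR-ing the result into the low nibble yields the recentred value q - 16 directly in int8 arithmetic. A hedged scalar sketch of the same computation; the bit-to-element mapping is assumed from the usual Q5_0 convention and is not visible in the diff:

#include <cstdint>
#include <cstring>

struct block_q5_0_ref { float d; uint8_t qh[4]; uint8_t qs[16]; };  // 32 x 5-bit quants
struct block_q8_0_ref { float d; int8_t  qs[32]; };

float dot_q5_0_q8_0_block(const block_q5_0_ref & x, const block_q8_0_ref & y) {
    uint32_t qh;                                       // one high bit per quant
    std::memcpy(&qh, x.qh, sizeof(qh));

    int32_t sumi = 0;
    for (int j = 0; j < 16; ++j) {
        const int hi0 = (qh >> (j +  0)) & 1;          // assumed: bit j    -> element j
        const int hi1 = (qh >> (j + 16)) & 1;          // assumed: bit j+16 -> element j+16
        const int v0 = ((x.qs[j] & 0x0F) | (hi0 << 4)) - 16;   // 5-bit value recentred to [-16..15]
        const int v1 = ((x.qs[j] >>   4) | (hi1 << 4)) - 16;
        sumi += v0 * y.qs[j] + v1 * y.qs[j + 16];
    }
    return x.d * y.d * sumi;
}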

@@ -4724,15 +4724,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

         summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;

-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
-        bx = _mm256_or_si256(bx, bxhi);
+        qx = _mm256_or_si256(qx, bxhi);

         const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
+        const __m256 q = mul_sum_us8_pairs_float(qx, qy);

         acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
     }
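Q5_1 (like Q4_1) keeps its quants unsigned (0..31) and stores a per-block minimum m instead of recentring; that is why the AND here keeps 0x10 rather than using the Q5_0 andnot/0xF0 trick, why the pair sum uses the unsigned-by-signed mul_sum_us8_pairs_float, and why the loop also accumulates summs += m * y.s, where y.s is the Q8_1 block's precomputed d_y * sum(q_y). A hedged scalar sketch under the same layout assumptions as the earlier examples:

#include <cstdint>
#include <cstring>

struct block_q5_1_ref { float d, m; uint8_t qh[4]; uint8_t qs[16]; };  // scale + minimum
struct block_q8_1_ref { float d, s; int8_t qs[32]; };                  // s = d * sum(qs)

float dot_q5_1_q8_1_block(const block_q5_1_ref & x, const block_q8_1_ref & y) {
    uint32_t qh;
    std::memcpy(&qh, x.qh, sizeof(qh));

    int32_t sumi = 0;
    for (int j = 0; j < 16; ++j) {
        const int v0 = (x.qs[j] & 0x0F) | (((qh >> (j +  0)) & 1) << 4);  // 0..31, unsigned
        const int v1 = (x.qs[j] >>   4) | (((qh >> (j + 16)) & 1) << 4);
        sumi += v0 * y.qs[j] + v1 * y.qs[j + 16];
    }
    // x dequantizes to d*q + m, so each block contributes d_x*d_y*sum(q_x*q_y) + m_x*s_y
    return x.d * y.d * sumi + x.m * y.s;
}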

@@ -4975,10 +4975,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     for (int i = 0; i < nb; ++i) {
         // Compute combined scale for the block
         const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);

         // Multiply q with scale and accumulate
 #if defined(__AVX2__)
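The Q8_0 x Q8_0 path is the simplest of the renamed kernels: both operands are already signed 8-bit, so each block is a plain int8 dot product scaled by the two block scales. A hedged scalar sketch, reusing the assumed block_q8_0 layout from above:

#include <cstdint>

struct block_q8_0_ref { float d; int8_t qs[32]; };

float dot_q8_0_q8_0_block(const block_q8_0_ref & x, const block_q8_0_ref & y) {
    int32_t sumi = 0;
    for (int j = 0; j < 32; ++j) {
        sumi += x.qs[j] * y.qs[j];     // no unpacking needed
    }
    return x.d * y.d * sumi;           // combined scale for the block
}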

llama.cpp (32 changed lines)

@@ -796,22 +796,37 @@ struct LLM_TN {
     llm_arch arch;

     std::string operator()(llm_tensor tensor) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return LLM_TENSOR_NAMES[arch].at(tensor);
     }

     std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
     }

     std::string operator()(llm_tensor tensor, int bid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
     }

     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
     }

     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
     }
 };
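The added guards make every LLM_TN overload total: asking for a tensor that an architecture's name table does not define now yields the "__missing__" sentinel instead of letting std::map::at throw, so callers can treat such tensors as optional. A self-contained sketch of the pattern with stand-in types; the enum and map below are illustrative, not the real llama.cpp tables:

#include <cstdio>
#include <map>
#include <string>

enum llm_tensor_stub { TENSOR_OUTPUT, TENSOR_ROPE_FREQS };

int main() {
    const std::map<llm_tensor_stub, std::string> names = {
        { TENSOR_OUTPUT, "output" },
        // TENSOR_ROPE_FREQS intentionally absent, like a tensor an arch does not define
    };

    auto name_of = [&](llm_tensor_stub t, const std::string & suffix) -> std::string {
        if (names.find(t) == names.end()) {
            return "__missing__";                     // sentinel, as in the diff
        }
        return names.at(t) + "." + suffix;
    };

    std::printf("%s\n", name_of(TENSOR_OUTPUT,     "weight").c_str());  // output.weight
    std::printf("%s\n", name_of(TENSOR_ROPE_FREQS, "weight").c_str());  // __missing__
    return 0;
}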

@@ -10550,6 +10565,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         ++qs.i_ffn_up;
     }

     // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
     //}
     // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
@ -10609,19 +10625,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
|
|
||||||
// K-quants
|
// K-quants
|
||||||
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
||||||
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
|
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
|
||||||
|
|
||||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||||
}
|
}
|
||||||
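These cases sit inside llama_model_quantize_internal, which is reached through the public llama_model_quantize entry point: the caller picks a LLAMA_FTYPE_* target and the switch maps it to the ggml element type used for most tensors. A minimal usage sketch, assuming the llama.h C API of roughly this vintage (llama_model_quantize_default_params and llama_model_quantize); the file paths are placeholders:

#include <cstdio>
#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;   // lands on the Q4_K case above
    params.nthread = 8;                           // <= 0 would mean "use all hardware threads"

    // Placeholder paths; llama_model_quantize returns 0 on success.
    if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) != 0) {
        std::fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}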

spm-headers/ggml-alloc.h (symbolic link, 1 line)

@@ -0,0 +1 @@
+../ggml-alloc.h

spm-headers/ggml-backend.h (symbolic link, 1 line)

@@ -0,0 +1 @@
+../ggml-backend.h

spm-headers/ggml.h (symbolic link, 1 line)

@@ -0,0 +1 @@
+../ggml.h