From dbd8828eb03b9aa8d0af7e4c533d3c2f5b38aba6 Mon Sep 17 00:00:00 2001
From: Lee <44310445+lx200916@users.noreply.github.com>
Date: Tue, 13 Feb 2024 01:29:57 +0800
Subject: [PATCH 1/4] py : fix persimmon `n_rot` conversion (#5460)

* convert : fix persimmon offical weight conversion to write correct n_rot.

* Update convert-persimmon-to-gguf.py

---------

Co-authored-by: Georgi Gerganov
---
 convert-persimmon-to-gguf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py
index d2be805d1..def210531 100755
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -88,7 +88,8 @@ def main():
     gguf_writer.add_embedding_length(hidden_size)
     gguf_writer.add_block_count(block_count)
     gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
-    gguf_writer.add_rope_dimension_count(hidden_size // head_count)
+    # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
+    gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
     gguf_writer.add_head_count(head_count)
     gguf_writer.add_head_count_kv(head_count_kv)
     gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)

From df334a11251b81fd0b6a0e51e7146e0ba9e973f2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 12 Feb 2024 19:54:29 +0200
Subject: [PATCH 2/4] swift : package no longer use ggml dependency (#5465)

* Revert "swift : update Package.swift to use ggml as dependency (#4691)"

This reverts commit ece9a45e8ffb73ad461c792720c2fec28b0137bc.

* spm : add ggml headers
---
 Package.swift              | 24 +++++++++++++++++++-----
 spm-headers/ggml-alloc.h   |  1 +
 spm-headers/ggml-backend.h |  1 +
 spm-headers/ggml.h         |  1 +
 4 files changed, 22 insertions(+), 5 deletions(-)
 create mode 120000 spm-headers/ggml-alloc.h
 create mode 120000 spm-headers/ggml-backend.h
 create mode 120000 spm-headers/ggml.h

diff --git a/Package.swift b/Package.swift
index 37524edee..b24c9204a 100644
--- a/Package.swift
+++ b/Package.swift
@@ -13,17 +13,31 @@ let package = Package(
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],
-    dependencies: [
-        .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
-    ],
     targets: [
         .target(
             name: "llama",
-            dependencies: ["ggml"],
             path: ".",
-            exclude: ["ggml-metal.metal"],
+            exclude: [
+                "cmake",
+                "examples",
+                "scripts",
+                "models",
+                "tests",
+                "CMakeLists.txt",
+                "ggml-cuda.cu",
+                "ggml-cuda.h",
+                "Makefile"
+            ],
             sources: [
+                "ggml.c",
                 "llama.cpp",
+                "ggml-alloc.c",
+                "ggml-backend.c",
+                "ggml-quants.c",
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
             ],
             publicHeadersPath: "spm-headers",
             cSettings: [
diff --git a/spm-headers/ggml-alloc.h b/spm-headers/ggml-alloc.h
new file mode 120000
index 000000000..a49d385a1
--- /dev/null
+++ b/spm-headers/ggml-alloc.h
@@ -0,0 +1 @@
+../ggml-alloc.h
\ No newline at end of file
diff --git a/spm-headers/ggml-backend.h b/spm-headers/ggml-backend.h
new file mode 120000
index 000000000..17c2cf14f
--- /dev/null
+++ b/spm-headers/ggml-backend.h
@@ -0,0 +1 @@
+../ggml-backend.h
\ No newline at end of file
diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h
new file mode 120000
index 000000000..39215298f
--- /dev/null
+++ b/spm-headers/ggml.h
@@ -0,0 +1 @@
+../ggml.h
\ No newline at end of file

From 099afc6274c859ca67146e725839f2d97a5ef313 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 12 Feb 2024 20:14:39 +0200
Subject: [PATCH 3/4] llama : fix quantization when tensors are missing
 (#5423)

---
 llama.cpp | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index a5b873a7b..d316d067b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -772,22 +772,37 @@ struct LLM_TN {
     llm_arch arch;
 
     std::string operator()(llm_tensor tensor) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return LLM_TENSOR_NAMES[arch].at(tensor);
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
     }
 
     std::string operator()(llm_tensor tensor, int bid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
     }
 };
@@ -10227,6 +10242,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         ++qs.i_ffn_up;
     }
+
     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
     //}
     // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
@@ -10286,19 +10302,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    quantized_type = GGML_TYPE_Q2_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  quantized_type = GGML_TYPE_Q3_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  quantized_type = GGML_TYPE_Q4_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K;    break;
-        case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K;    break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS;  break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  quantized_type = GGML_TYPE_Q5_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    quantized_type = GGML_TYPE_Q6_K;    break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }

From 895407f31b358e3d9335e847d13f033491ec8a5b Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Tue, 13 Feb 2024 09:07:57 +0200
Subject: [PATCH 4/4] ggml-quants : fix compiler warnings
 (shadow variable) (#5472)

Co-authored-by: Iwan Kawrakow
---
 ggml-quants.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index b2a309bf8..f44377f45 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -3819,15 +3819,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         /* Compute combined scale for the block */
         const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
 
         // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
         const __m256i off = _mm256_set1_epi8( 8 );
-        bx = _mm256_sub_epi8( bx, off );
+        qx = _mm256_sub_epi8( qx, off );
 
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         /* Multiply q with scale and accumulate */
         acc = _mm256_fmadd_ps( d, q, acc );
@@ -4196,10 +4196,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
 
         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
+        const __m256i qx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );
 
-        const __m256 xy = mul_sum_us8_pairs_float(bx, by);
+        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
 
         // Accumulate d0*d1*x*y
 #if defined(__AVX2__)
@@ -4418,14 +4418,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         /* Compute combined scale for the block */
         const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
-        bx = _mm256_or_si256(bx, bxhi);
+        qx = _mm256_or_si256(qx, bxhi);
 
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         /* Multiply q with scale and accumulate */
         acc = _mm256_fmadd_ps(d, q, acc);
@@ -4722,15 +4722,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 
         summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
-        bx = _mm256_or_si256(bx, bxhi);
+        qx = _mm256_or_si256(qx, bxhi);
 
         const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
+        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
 
         acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
     }
@@ -4973,10 +4973,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     for (int i = 0; i < nb; ++i) {
         // Compute combined scale for the block
         const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         // Multiply q with scale and accumulate
 #if defined(__AVX2__)