From 9422c5e34bbd302493b77a8f6d546154a1f4fe82 Mon Sep 17 00:00:00 2001
From: nickp27
Date: Sun, 2 Jun 2024 19:13:54 +1000
Subject: [PATCH 1/6] [SYCL] Update rpc-server.cpp to include SYCL backend
 (#7682)

* Update rpc-server.cpp to include SYCL backend

Draft PR to address inclusion of SYCL backend for RPC server

* Update rpc-server.cpp

---
 examples/rpc/rpc-server.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 7c15d2aa4..62d828250 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -6,6 +6,10 @@
 #include "ggml-metal.h"
 #endif
 
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@@ -79,6 +83,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
     }
+#elif GGML_USE_SYCL
+    fprintf(stderr, "%s: using SYCL backend\n", __func__);
+    backend = ggml_backend_sycl_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+    }
 #endif
 
     // if there aren't GPU Backends fallback to CPU backend
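The patch above hard-codes SYCL device 0. A minimal sketch of choosing the device
at run time instead — assuming ggml_backend_sycl_get_device_count() is exported by
ggml-sycl.h (as the diff's ggml_backend_sycl_init() is); the GGML_SYCL_DEVICE
environment variable is hypothetical, not an existing ggml convention:

// Sketch only: pick the SYCL device from a hypothetical GGML_SYCL_DEVICE
// environment variable, falling back to device 0 on a bad or missing value.
#include <cstdio>
#include <cstdlib>
#include "ggml-sycl.h"

static ggml_backend_t create_sycl_backend_from_env(void) {
    int device = 0;
    if (const char * env = std::getenv("GGML_SYCL_DEVICE")) {
        device = std::atoi(env);
    }
    const int n_devices = ggml_backend_sycl_get_device_count(); // assumed API
    if (device < 0 || device >= n_devices) {
        fprintf(stderr, "invalid SYCL device %d (%d available), using device 0\n",
                device, n_devices);
        device = 0;
    }
    return ggml_backend_sycl_init(device);
}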
"sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9", + "rev": "2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1716509168, - "narHash": "sha256-4zSIhSRRIoEBwjbPm3YiGtbd8HDWzFxJjw5DYSDy1n8=", + "lastModified": 1716948383, + "narHash": "sha256-SzDKxseEcHR5KzPXLwsemyTR/kaM9whxeiJohbL04rs=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "bfb7a882678e518398ce9a31a881538679f6f092", + "rev": "ad57eef4ef0659193044870c731987a6df5cf56b", "type": "github" }, "original": { @@ -36,14 +36,14 @@ }, "nixpkgs-lib": { "locked": { - "lastModified": 1714640452, - "narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=", + "lastModified": 1717284937, + "narHash": "sha256-lIbdfCsf8LMFloheeE6N31+BMIeixqyQWbSr2vk79EQ=", "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz" }, "original": { "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz" } }, "root": { From 3413ae2193d0693f14bead02e5018f442cbf579b Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 3 Jun 2024 07:59:54 +1000 Subject: [PATCH 4/6] fix bug introduced in using calloc (#7701) compilade pointed this out on the previous MR --- ggml-alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 1fbd376ed..0146946eb 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t)); GGML_ASSERT(galloc->bufts != NULL); - galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs); + galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)); GGML_ASSERT(galloc->buffers != NULL); galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); From 9e405b6e2ecb888e860f7b92720b4809e21b3915 Mon Sep 17 00:00:00 2001 From: woachk <24752637+woachk@users.noreply.github.com> Date: Mon, 3 Jun 2024 07:32:16 +0200 Subject: [PATCH 5/6] kompute : implement op_getrows_f32 (#6403) op_getrows_f32 is required since https://github.com/ggerganov/llama.cpp/pull/6122 for the Vulkan w/ Kompute backend to be functional. As such, implement this op to make this backend functional again. 
From 9e405b6e2ecb888e860f7b92720b4809e21b3915 Mon Sep 17 00:00:00 2001
From: woachk <24752637+woachk@users.noreply.github.com>
Date: Mon, 3 Jun 2024 07:32:16 +0200
Subject: [PATCH 5/6] kompute : implement op_getrows_f32 (#6403)

op_getrows_f32 is required since
https://github.com/ggerganov/llama.cpp/pull/6122 for the Vulkan w/ Kompute
backend to be functional. As such, implement this op to make this backend
functional again.
---
 CMakeLists.txt                      |  2 ++
 ggml-kompute.cpp                    | 14 ++++++++++++-
 kompute-shaders/op_getrows_f32.comp | 31 +++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 kompute-shaders/op_getrows_f32.comp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52b392a13..a9b33eaa1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -777,6 +777,7 @@ if (LLAMA_KOMPUTE)
             kompute-shaders/op_mul_mat_q4_0.comp
             kompute-shaders/op_mul_mat_q4_1.comp
             kompute-shaders/op_mul_mat_q6_k.comp
+            kompute-shaders/op_getrows_f32.comp
             kompute-shaders/op_getrows_f16.comp
             kompute-shaders/op_getrows_q4_0.comp
             kompute-shaders/op_getrows_q4_1.comp
@@ -809,6 +810,7 @@ if (LLAMA_KOMPUTE)
             shaderop_mul_mat_q4_0.h
             shaderop_mul_mat_q4_1.h
             shaderop_mul_mat_q6_k.h
+            shaderop_getrows_f32.h
             shaderop_getrows_f16.h
             shaderop_getrows_q4_0.h
             shaderop_getrows_q4_1.h
diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 0c51c322f..eabd70d5e 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -22,6 +22,7 @@
 #include "shaderop_mul_mat_q4_1.h"
 #include "shaderop_mul_mat_q6_k.h"
 #include "shaderop_mul_mat_mat_f32.h"
+#include "shaderop_getrows_f32.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
@@ -1146,6 +1147,14 @@ static void ggml_vk_get_rows(
     seq.record(s_algo);
 }
 
+template <typename... Args>
+static void ggml_vk_get_rows_f32(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
+        kp::shader_data::op_getrows_f32_comp_spv_len);
+
+    ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
+}
+
 template <typename... Args>
 static void ggml_vk_get_rows_f16(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
@@ -1371,6 +1380,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
             return op->ne[3] == 1;
         case GGML_OP_GET_ROWS:
             switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
@@ -1661,7 +1671,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                     } break;
                 case GGML_OP_GET_ROWS:
                     {
-                        if (src0t == GGML_TYPE_F16) {
+                        if (src0t == GGML_TYPE_F32) {
+                            ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                        } else if (src0t == GGML_TYPE_F16) {
                             ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                         } else if (src0t == GGML_TYPE_Q4_0) {
                             ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
diff --git a/kompute-shaders/op_getrows_f32.comp b/kompute-shaders/op_getrows_f32.comp
new file mode 100644
index 000000000..9d7acdaf8
--- /dev/null
+++ b/kompute-shaders/op_getrows_f32.comp
@@ -0,0 +1,31 @@
+#version 450
+
+#include "common.comp"
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { float inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb01;
+    int nb1;
+} pcs;
+
+void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
+    for (int j = 0; j < k; j++) {
+        out_[y + j] = inA[x + j];
+    }
+}
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+    const int r = inB[i + pcs.inBOff];
+
+    dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
+}
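The new shader implements GGML_OP_GET_ROWS as a row gather: invocation i reads the
row index inB[i] and copies ne00 floats from that row of inA into row i of out_;
the byte strides pcs.nb01 and pcs.nb1 are divided by 4 to convert them to float
offsets. A plain-C sketch of the same semantics, for reference only (names are
illustrative, not ggml API):

/* Reference sketch of an f32 row gather, mirroring op_getrows_f32.comp. */
#include <stdio.h>

/* Copy n_rows rows selected by `rows` from `src` (row stride src_stride floats)
 * into `dst` (row stride dst_stride floats), ne00 floats per row. */
static void get_rows_f32(const float * src, int src_stride,
                         const int * rows, int n_rows,
                         float * dst, int dst_stride, int ne00) {
    for (int i = 0; i < n_rows; i++) {
        const float * src_row = src + rows[i] * src_stride;
        for (int j = 0; j < ne00; j++) {
            dst[i * dst_stride + j] = src_row[j];
        }
    }
}

int main(void) {
    const float src[4][2] = {{0, 1}, {10, 11}, {20, 21}, {30, 31}};
    const int rows[3] = {2, 0, 2};   // gather row 2, row 0, row 2 again
    float dst[3][2];
    get_rows_f32(&src[0][0], 2, rows, 3, &dst[0][0], 2, 2);
    for (int i = 0; i < 3; i++) {
        printf("%g %g\n", dst[i][0], dst[i][1]);  // 20 21 / 0 1 / 20 21
    }
    return 0;
}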
From 549279d8049d78620a2b081e26edb654f83c3bbd Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 3 Jun 2024 08:34:43 +0300
Subject: [PATCH 6/6] llama : avoid double token-to-piece cache (#7654)

ggml-ci
---
 llama.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 841be1de7..e90da793c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2164,8 +2164,7 @@ struct llama_vocab {
     std::vector<token_data> id_to_token;
 
     std::vector<id>          cache_special_tokens;
-    std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = false);
-    std::vector<std::string> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+    std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
 
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
@@ -4845,23 +4844,19 @@ static void llm_load_vocab(
         LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
     }
 
-    // build token to piece caches
+    // build token to piece cache
     {
         size_t size_cache = 0;
 
-        std::vector<std::string> cache_token_to_piece (n_vocab);
-        std::vector<std::string> cache_token_to_piece_special(n_vocab);
+        std::vector<std::string> cache_token_to_piece(n_vocab);
 
         for (uint32_t id = 0; id < n_vocab; ++id) {
-            cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
-            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+            cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
 
             size_cache += cache_token_to_piece[id].size();
-            size_cache += cache_token_to_piece_special[id].size();
         }
 
-        std::swap(vocab.cache_token_to_piece,         cache_token_to_piece);
-        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+        std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
 
         LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
     }
@@ -18318,9 +18313,14 @@ static std::string llama_decode_text(const std::string & text) {
 
 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    if (!special && llama_is_control_token(model->vocab, token)) {
+        return 0;
+    }
+
     // if we have a cache - use it
     {
-        const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+        const auto & cache = model->vocab.cache_token_to_piece;
 
         if (!cache.empty()) {
            const auto & res = cache.at(token);