diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index 7c15d2aa4..62d828250 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -6,6 +6,10 @@
 #include "ggml-metal.h"
 #endif
 
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@@ -79,6 +83,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
     }
+#elif GGML_USE_SYCL
+    fprintf(stderr, "%s: using SYCL backend\n", __func__);
+    backend = ggml_backend_sycl_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+    }
 #endif
 
     // if there aren't GPU Backends fallback to CPU backend
diff --git a/ggml-alloc.c b/ggml-alloc.c
index 1fbd376ed..0146946eb 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
     GGML_ASSERT(galloc->buffers != NULL);
 
     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index 0c51c322f..eabd70d5e 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -22,6 +22,7 @@
 #include "shaderop_mul_mat_q4_1.h"
 #include "shaderop_mul_mat_q6_k.h"
 #include "shaderop_mul_mat_mat_f32.h"
+#include "shaderop_getrows_f32.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
@@ -1146,6 +1147,14 @@ static void ggml_vk_get_rows(
     seq.record(s_algo);
 }
 
+template <typename... Args>
+static void ggml_vk_get_rows_f32(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
+                                             kp::shader_data::op_getrows_f32_comp_spv_len);
+
+    ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
+}
+
 template <typename... Args>
 static void ggml_vk_get_rows_f16(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
@@ -1371,6 +1380,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
             return op->ne[3] == 1;
         case GGML_OP_GET_ROWS:
             switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
@@ -1661,7 +1671,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 } break;
             case GGML_OP_GET_ROWS:
                 {
-                    if (src0t == GGML_TYPE_F16) {
+                    if (src0t == GGML_TYPE_F32) {
+                        ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                    } else if (src0t == GGML_TYPE_F16) {
                         ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                     } else if (src0t == GGML_TYPE_Q4_0) {
                         ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
diff --git a/llama.cpp b/llama.cpp
index ff2e43113..7c47879ac 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2190,8 +2190,7 @@ struct llama_vocab {
     std::vector<token_data>       id_to_token;
 
     std::vector<id>    cache_special_tokens;
-    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
-    std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
 
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
@@ -4908,23 +4907,19 @@ static void llm_load_vocab(
         LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
     }
 
-    // build token to piece caches
+    // build token to piece cache
     {
         size_t size_cache = 0;
 
-        std::vector<llama_vocab::token> cache_token_to_piece        (n_vocab);
-        std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
+        std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
 
         for (uint32_t id = 0; id < n_vocab; ++id) {
-            cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
-            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+            cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
 
             size_cache += cache_token_to_piece[id].size();
-            size_cache += cache_token_to_piece_special[id].size();
         }
 
-        std::swap(vocab.cache_token_to_piece,         cache_token_to_piece);
-        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+        std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
 
         LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
     }
 
@@ -18638,9 +18633,14 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
         return llama_token_to_piece_old(model, token, buf, length);
     }
 
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    if (!special && llama_is_control_token(model->vocab, token)) {
+        return 0;
+    }
+
     // if we have a cache - use it
     {
-        const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+        const auto & cache = model->vocab.cache_token_to_piece;
 
         if (!cache.empty()) {
             const auto & res = cache.at(token);
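
Note on the ggml-alloc.c hunk: calloc(count, size) already multiplies its two arguments, so passing sizeof(ggml_backend_buffer_t) * n_bufs as the element size requested roughly n_bufs * n_bufs pointer slots for galloc->buffers instead of n_bufs (the neighbouring bufts and buf_tallocs allocations were already sized correctly). Below is a minimal standalone sketch of the before/after request sizes, not part of the patch, using void * as a stand-in for ggml_backend_buffer_t:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    const size_t n_bufs = 8;

    // old line: the element size was scaled by n_bufs a second time, so the
    // request grows quadratically with the number of buffers
    const size_t bytes_before = n_bufs * (sizeof(void *) * n_bufs);

    // fixed line: calloc performs the count * size multiplication itself
    const size_t bytes_after  = n_bufs * sizeof(void *);

    printf("requested before: %zu bytes, after: %zu bytes\n", bytes_before, bytes_after);

    // the corrected allocation: n_bufs zero-initialized pointer slots
    void ** buffers = calloc(n_bufs, sizeof(void *));
    if (buffers == NULL) {
        return 1;
    }
    free(buffers);
    return 0;
}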