diff --git a/common/minja/minja.hpp b/common/minja/minja.hpp index 1a30499e7..c4f40b17e 100644 --- a/common/minja/minja.hpp +++ b/common/minja/minja.hpp @@ -2606,14 +2606,18 @@ inline std::shared_ptr Context::builtins() { auto & text = args.at("text"); return text.is_null() ? text : Value(strip(text.get())); })); - globals.set("lower", simple_function("lower", { "text" }, [](const std::shared_ptr &, Value & args) { - auto text = args.at("text"); - if (text.is_null()) return text; - std::string res; - auto str = text.get(); - std::transform(str.begin(), str.end(), std::back_inserter(res), ::tolower); - return Value(res); - })); + auto char_transform_function = [](const std::string & name, const std::function & fn) { + return simple_function(name, { "text" }, [=](const std::shared_ptr &, Value & args) { + auto text = args.at("text"); + if (text.is_null()) return text; + std::string res; + auto str = text.get(); + std::transform(str.begin(), str.end(), std::back_inserter(res), fn); + return Value(res); + }); + }; + globals.set("lower", char_transform_function("lower", ::tolower)); + globals.set("upper", char_transform_function("upper", ::toupper)); globals.set("default", Value::callable([=](const std::shared_ptr &, ArgumentsValue & args) { args.expectArgs("default", {2, 3}, {0, 1}); auto & value = args.args[0]; diff --git a/docs/backend/OPENCL.md b/docs/backend/OPENCL.md index 2a946dc8d..07146f710 100644 --- a/docs/backend/OPENCL.md +++ b/docs/backend/OPENCL.md @@ -145,8 +145,13 @@ A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the followi * Clang 19 * Ninja * Visual Studio 2022 +* Powershell 7 -Powershell is used for the following instructions. +Visual Studio provides necessary headers and libraries although it is not directly used for building. +Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio. + +Powershell 7 is used for the following commands. +If an older version of Powershell is used, these commands may not work as they are. ### I. Setup Environment @@ -196,10 +201,9 @@ ninja ## Known Issues -- Qwen2.5 0.5B model produces gibberish output with Adreno kernels. +- Currently OpenCL backend does not work on Adreno 6xx GPUs. ## TODO -- Fix Qwen2.5 0.5B - Optimization for Q6_K - Support and optimization for Q4_K diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 8edc12649..7a2d1e453 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -299,7 +299,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16; const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion; - const bool can_use_vector_kernel = (Q->ne[0] % (2*warp_size) == 0) && (prec == GGML_PREC_DEFAULT || Q->ne[0] <= 128); + const bool can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0; if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) { if (prec == GGML_PREC_DEFAULT) { ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 7a03788f6..829cf9952 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -33,6 +33,28 @@ #include #include +#if defined(_MSC_VER) +# define NOMINMAX 1 +# include +# define YIELD() YieldProcessor() +#elif defined(__clang__) || defined(__GNUC__) +# if defined(__x86_64__) ||defined(__i386__) +# include +# define YIELD() _mm_pause() +# elif defined(__arm__) || defined(__aarch64__) +# if defined(__clang__) +# include +# define YIELD() __yield() +# else +# define YIELD() asm volatile("yield") +# endif +# endif +#endif + +#if !defined(YIELD) +#define YIELD() +#endif + #include "ggml-impl.h" #include "ggml-backend-impl.h" @@ -796,7 +818,8 @@ struct ggml_backend_vk_context { ggml_vk_garbage_collector gc; size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k; vk_buffer prealloc_x, prealloc_y, prealloc_split_k; - vk::Fence fence; + vk::Fence fence, almost_ready_fence; + bool almost_ready_fence_pending {}; vk_buffer buffer_pool[MAX_VK_BUFFERS]; @@ -887,6 +910,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx static void ggml_backend_vk_free(ggml_backend_t backend); +// Wait for ctx->fence to be signaled. +static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) { + // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep + // during this wait. + if (ctx->almost_ready_fence_pending) { + VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence"); + ctx->device->device.resetFences({ ctx->almost_ready_fence }); + ctx->almost_ready_fence_pending = false; + } + + // Spin (w/pause) waiting for the graph to finish executing. + vk::Result result; + while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) { + if (result != vk::Result::eNotReady) { + fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__); + exit(1); + } + for (uint32_t i = 0; i < 100; ++i) { + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + YIELD(); + } + } + ctx->device->device.resetFences({ ctx->fence }); +} + // variables to track number of compiles in progress static uint32_t compile_count = 0; static std::mutex compile_count_mutex; @@ -3372,6 +3428,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { ctx->prealloc_size_split_k = 0; ctx->fence = ctx->device->device.createFence({}); + ctx->almost_ready_fence = ctx->device->device.createFence({}); #ifdef GGML_VULKAN_CHECK_RESULTS const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); @@ -7976,11 +8033,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } } -static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence); +static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready); // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. -static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){ +static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){ if (ggml_is_empty(node) || !node->buffer) { return false; } @@ -8352,7 +8409,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod ctx->compute_ctx.reset(); - bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false); + bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready); if (!ok) { if (node->op == GGML_OP_UNARY) { std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << std::endl; @@ -8366,7 +8423,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod return true; } -static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) { ggml_backend_buffer * buf = nullptr; switch (tensor->op) { @@ -8469,12 +8526,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{}); + if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) { + ggml_vk_submit(subctx, ctx->almost_ready_fence); + ctx->almost_ready_fence_pending = true; + } else { + ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{}); + } if (use_fence) { - VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - - ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_wait_for_fence(ctx); } #ifdef GGML_VULKAN_CHECK_RESULTS ggml_vk_check_results_1(tensor); @@ -8560,6 +8620,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->gc.events.clear(); ctx->device->device.destroyFence(ctx->fence); + ctx->device->device.destroyFence(ctx->almost_ready_fence); } static int ggml_vk_get_device_count() { @@ -8906,8 +8967,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) { } ggml_vk_submit(transfer_ctx, ctx->fence); - VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences"); - ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_wait_for_fence(ctx); for (auto& cpy : transfer_ctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); @@ -8926,7 +8986,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg uint64_t total_mat_mul_bytes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false); + ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false); if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); } @@ -8968,11 +9028,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); } + // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining) + bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5; bool submit = (submitted_nodes >= nodes_per_submit) || (mul_mat_bytes >= mul_mat_bytes_per_submit) || - (i == last_node); + (i == last_node) || + (almost_ready && !ctx->almost_ready_fence_pending); - bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit); + bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit); if (enqueued) { ++submitted_nodes; @@ -8984,7 +9047,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg #endif } - if (submit) { + if (submit && enqueued) { first_node_in_batch = true; submitted_nodes = 0; mul_mat_bytes = 0; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 27bdcdf2b..4cd27a58f 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2461,14 +2461,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list length - if (match + text.length() > raw_text_base_offset + raw_text_base_length) break; - #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); #endif