From b81510a7b78f7c5a6f069c4f3d0e569b9d287913 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Sat, 5 Jul 2025 12:10:53 +0800 Subject: [PATCH 1/7] test-backend-ops: add support for specifying output format (#14368) * test-backend-ops: add support for specifying output format Signed-off-by: Xiaodong Ye * Address review comments Signed-off-by: Xiaodong Ye * Add build_commit and build_number in test_result Signed-off-by: Xiaodong Ye * Address review comments Signed-off-by: Xiaodong Ye * refactor Signed-off-by: Xiaodong Ye * Get build commit from ggml_commit() Signed-off-by: Xiaodong Ye * Merge errors into test_operation_info && address review comments Signed-off-by: Xiaodong Ye * Address review comments Signed-off-by: Xiaodong Ye * Address review comments Signed-off-by: Xiaodong Ye * remove visitor nonsense * remove visitor comment Signed-off-by: Xiaodong Ye * Address review comments Signed-off-by: Xiaodong Ye --------- Signed-off-by: Xiaodong Ye Co-authored-by: slaren --- tests/test-backend-ops.cpp | 778 +++++++++++++++++++++++++++++++------ 1 file changed, 667 insertions(+), 111 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index c76635793..0dc9c09e2 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -24,10 +24,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -317,6 +319,538 @@ enum test_mode { MODE_GRAD, }; +// Output format support similar to llama-bench +enum output_formats { CONSOLE, SQL }; + +static const char * output_format_str(output_formats format) { + switch (format) { + case CONSOLE: + return "console"; + case SQL: + return "sql"; + default: + GGML_ABORT("invalid output format"); + } +} + +static bool output_format_from_str(const std::string & s, output_formats & format) { + if (s == "console") { + format = CONSOLE; + } else if (s == "sql") { + format = SQL; + } else { + return false; + } + return true; +} + +// Test result structure for SQL output +struct test_result { + std::string test_time; + std::string build_commit; + std::string backend_name; + std::string op_name; + std::string op_params; + std::string test_mode; + bool supported; + bool passed; + std::string error_message; + double time_us; + double flops; + double bandwidth_gb_s; + size_t memory_kb; + int n_runs; + + test_result() { + // Initialize with default values + time_us = 0.0; + flops = 0.0; + bandwidth_gb_s = 0.0; + memory_kb = 0; + n_runs = 0; + supported = false; + passed = false; + + // Set test time + time_t t = time(NULL); + char buf[32]; + std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); + test_time = buf; + + // Set build info + build_commit = ggml_commit(); + } + + test_result(const std::string & backend_name, const std::string & op_name, const std::string & op_params, + const std::string & test_mode, bool supported, bool passed, const std::string & error_message = "", + double time_us = 0.0, double flops = 0.0, double bandwidth_gb_s = 0.0, size_t memory_kb = 0, + int n_runs = 0) : + backend_name(backend_name), + op_name(op_name), + op_params(op_params), + test_mode(test_mode), + supported(supported), + passed(passed), + error_message(error_message), + time_us(time_us), + flops(flops), + bandwidth_gb_s(bandwidth_gb_s), + memory_kb(memory_kb), + n_runs(n_runs) { + // Set test time + time_t t = time(NULL); + char buf[32]; + std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); + test_time = buf; + + // Set build info + build_commit = ggml_commit(); + } + + static const std::vector & get_fields() { + static const std::vector fields = { + "test_time", "build_commit", "backend_name", "op_name", "op_params", "test_mode", "supported", + "passed", "error_message", "time_us", "flops", "bandwidth_gb_s", "memory_kb", "n_runs" + }; + return fields; + } + + enum field_type { STRING, BOOL, INT, FLOAT }; + + static field_type get_field_type(const std::string & field) { + if (field == "supported" || field == "passed") { + return BOOL; + } + if (field == "memory_kb" || field == "n_runs") { + return INT; + } + if (field == "time_us" || field == "flops" || field == "bandwidth_gb_s") { + return FLOAT; + } + return STRING; + } + + std::vector get_values() const { + return { test_time, + build_commit, + backend_name, + op_name, + op_params, + test_mode, + std::to_string(supported), + std::to_string(passed), + error_message, + std::to_string(time_us), + std::to_string(flops), + std::to_string(bandwidth_gb_s), + std::to_string(memory_kb), + std::to_string(n_runs) }; + } +}; + +// Printer classes for different output formats +enum class test_status_t { NOT_SUPPORTED, OK, FAIL }; + +struct test_operation_info { + std::string op_name; + std::string op_params; + std::string backend_name; + test_status_t status = test_status_t::OK; + std::string failure_reason; + + // Additional information fields that were previously in separate structs + std::string error_component; + std::string error_details; + + // Gradient info + int64_t gradient_index = -1; + std::string gradient_param_name; + float gradient_value = 0.0f; + + // MAA error info + double maa_error = 0.0; + double maa_threshold = 0.0; + + // Flags for different types of information + bool has_error = false; + bool has_gradient_info = false; + bool has_maa_error = false; + bool is_compare_failure = false; + bool is_large_tensor_skip = false; + + test_operation_info() = default; + + test_operation_info(const std::string & op_name, const std::string & op_params, const std::string & backend_name, + test_status_t status = test_status_t::OK, const std::string & failure_reason = "") : + op_name(op_name), + op_params(op_params), + backend_name(backend_name), + status(status), + failure_reason(failure_reason) {} + + // Set error information + void set_error(const std::string & component, const std::string & details) { + has_error = true; + error_component = component; + error_details = details; + if (status == test_status_t::OK) { + status = test_status_t::FAIL; + } + } + + // Set gradient information + void set_gradient_info(int64_t index, const std::string & param_name, float value) { + has_gradient_info = true; + gradient_index = index; + gradient_param_name = param_name; + gradient_value = value; + if (status == test_status_t::OK) { + status = test_status_t::FAIL; + } + } + + // Set MAA error information + void set_maa_error(double error, double threshold) { + has_maa_error = true; + maa_error = error; + maa_threshold = threshold; + if (status == test_status_t::OK) { + status = test_status_t::FAIL; + } + } + + // Set compare failure + void set_compare_failure() { + is_compare_failure = true; + if (status == test_status_t::OK) { + status = test_status_t::FAIL; + } + } + + // Set large tensor skip + void set_large_tensor_skip() { is_large_tensor_skip = true; } +}; + +struct test_summary_info { + size_t tests_passed; + size_t tests_total; + bool is_backend_summary = false; // true for backend summary, false for test summary + + test_summary_info() = default; + + test_summary_info(size_t tests_passed, size_t tests_total, bool is_backend_summary = false) : + tests_passed(tests_passed), + tests_total(tests_total), + is_backend_summary(is_backend_summary) {} +}; + +struct testing_start_info { + size_t device_count; + + testing_start_info() = default; + + testing_start_info(size_t device_count) : device_count(device_count) {} +}; + +struct backend_init_info { + size_t device_index; + size_t total_devices; + std::string device_name; + bool skipped = false; + std::string skip_reason; + std::string description; + size_t memory_total_mb = 0; + size_t memory_free_mb = 0; + bool has_memory_info = false; + + backend_init_info() = default; + + backend_init_info(size_t device_index, size_t total_devices, const std::string & device_name, bool skipped = false, + const std::string & skip_reason = "", const std::string & description = "", + size_t memory_total_mb = 0, size_t memory_free_mb = 0, bool has_memory_info = false) : + device_index(device_index), + total_devices(total_devices), + device_name(device_name), + skipped(skipped), + skip_reason(skip_reason), + description(description), + memory_total_mb(memory_total_mb), + memory_free_mb(memory_free_mb), + has_memory_info(has_memory_info) {} +}; + +struct backend_status_info { + std::string backend_name; + test_status_t status; + + backend_status_info() = default; + + backend_status_info(const std::string & backend_name, test_status_t status) : + backend_name(backend_name), + status(status) {} +}; + +struct overall_summary_info { + size_t backends_passed; + size_t backends_total; + bool all_passed; + + overall_summary_info() = default; + + overall_summary_info(size_t backends_passed, size_t backends_total, bool all_passed) : + backends_passed(backends_passed), + backends_total(backends_total), + all_passed(all_passed) {} +}; + +struct printer { + virtual ~printer() {} + + FILE * fout = stdout; + + virtual void print_header() {} + + virtual void print_test_result(const test_result & result) = 0; + + virtual void print_footer() {} + + virtual void print_operation(const test_operation_info & info) { (void) info; } + + virtual void print_summary(const test_summary_info & info) { (void) info; } + + virtual void print_testing_start(const testing_start_info & info) { (void) info; } + + virtual void print_backend_init(const backend_init_info & info) { (void) info; } + + virtual void print_backend_status(const backend_status_info & info) { (void) info; } + + virtual void print_overall_summary(const overall_summary_info & info) { (void) info; } +}; + +struct console_printer : public printer { + void print_test_result(const test_result & result) override { + if (result.test_mode == "test") { + print_test_console(result); + } else if (result.test_mode == "perf") { + print_perf_console(result); + } + } + + void print_operation(const test_operation_info & info) override { + printf(" %s(%s): ", info.op_name.c_str(), info.op_params.c_str()); + fflush(stdout); + + // Handle large tensor skip first + if (info.is_large_tensor_skip) { + printf("skipping large tensors for speed \n"); + return; + } + + // Handle not supported status + if (info.status == test_status_t::NOT_SUPPORTED) { + if (!info.failure_reason.empty()) { + printf("not supported [%s]\n", info.failure_reason.c_str()); + } else { + printf("not supported [%s]\n", info.backend_name.c_str()); + } + return; + } + + // Handle errors and additional information + if (info.has_error) { + if (info.error_component == "allocation") { + fprintf(stderr, "failed to allocate tensors [%s] ", info.backend_name.c_str()); + } else if (info.error_component == "backend") { + fprintf(stderr, " Failed to initialize %s backend\n", info.backend_name.c_str()); + } else { + fprintf(stderr, "Error in %s: %s\n", info.error_component.c_str(), info.error_details.c_str()); + } + } + + // Handle gradient info + if (info.has_gradient_info) { + printf("[%s] nonfinite gradient at index %" PRId64 " (%s=%f) ", info.op_name.c_str(), info.gradient_index, + info.gradient_param_name.c_str(), info.gradient_value); + } + + // Handle MAA error + if (info.has_maa_error) { + printf("[%s] MAA = %.9f > %.9f ", info.op_name.c_str(), info.maa_error, info.maa_threshold); + } + + // Handle compare failure + if (info.is_compare_failure) { + printf("compare failed "); + } + + // Print final status + if (info.status == test_status_t::OK) { + printf("\033[1;32mOK\033[0m\n"); + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + } + + void print_summary(const test_summary_info & info) override { + if (info.is_backend_summary) { + printf("%zu/%zu backends passed\n", info.tests_passed, info.tests_total); + } else { + printf(" %zu/%zu tests passed\n", info.tests_passed, info.tests_total); + } + } + + void print_backend_status(const backend_status_info & info) override { + printf(" Backend %s: ", info.backend_name.c_str()); + if (info.status == test_status_t::OK) { + printf("\033[1;32mOK\033[0m\n"); + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + } + + void print_testing_start(const testing_start_info & info) override { + printf("Testing %zu devices\n\n", info.device_count); + } + + void print_backend_init(const backend_init_info & info) override { + printf("Backend %zu/%zu: %s\n", info.device_index + 1, info.total_devices, info.device_name.c_str()); + + if (info.skipped) { + printf(" %s\n", info.skip_reason.c_str()); + return; + } + + if (!info.description.empty()) { + printf(" Device description: %s\n", info.description.c_str()); + } + + if (info.has_memory_info) { + printf(" Device memory: %zu MB (%zu MB free)\n", info.memory_total_mb, info.memory_free_mb); + } + + printf("\n"); + } + + void print_overall_summary(const overall_summary_info & info) override { + printf("%zu/%zu backends passed\n", info.backends_passed, info.backends_total); + if (info.all_passed) { + printf("\033[1;32mOK\033[0m\n"); + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + } + + private: + void print_test_console(const test_result & result) { + printf(" %s(%s): ", result.op_name.c_str(), result.op_params.c_str()); + fflush(stdout); + + if (!result.supported) { + printf("not supported [%s] ", result.backend_name.c_str()); + printf("\n"); + return; + } + + if (result.passed) { + printf("\033[1;32mOK\033[0m\n"); + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + } + + void print_perf_console(const test_result & result) { + int len = printf(" %s(%s): ", result.op_name.c_str(), result.op_params.c_str()); + fflush(stdout); + + if (!result.supported) { + printf("not supported\n"); + return; + } + + // align while also leaving some margin for variations in parameters + int align = 8; + int last = (len + align - 1) / align * align; + if (last - len < 5) { + last += align; + } + printf("%*s", last - len, ""); + + printf(" %8d runs - %8.2f us/run - ", result.n_runs, result.time_us); + + if (result.flops > 0) { + auto format_flops = [](double flops) -> std::string { + char buf[256]; + if (flops >= 1e12) { + snprintf(buf, sizeof(buf), "%6.2f TFLOP", flops / 1e12); + } else if (flops >= 1e9) { + snprintf(buf, sizeof(buf), "%6.2f GFLOP", flops / 1e9); + } else if (flops >= 1e6) { + snprintf(buf, sizeof(buf), "%6.2f MFLOP", flops / 1e6); + } else { + snprintf(buf, sizeof(buf), "%6.2f kFLOP", flops / 1e3); + } + return buf; + }; + uint64_t op_flops_per_run = result.flops * result.time_us / 1e6; + printf("%s/run - \033[1;34m%sS\033[0m", format_flops(op_flops_per_run).c_str(), + format_flops(result.flops).c_str()); + } else { + printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m", result.memory_kb, result.bandwidth_gb_s); + } + printf("\n"); + } +}; + +struct sql_printer : public printer { + static std::string get_sql_field_type(const std::string & field) { + switch (test_result::get_field_type(field)) { + case test_result::STRING: + return "TEXT"; + case test_result::BOOL: + case test_result::INT: + return "INTEGER"; + case test_result::FLOAT: + return "REAL"; + default: + GGML_ABORT("invalid field type"); + } + } + + void print_header() override { + std::vector fields = test_result::get_fields(); + fprintf(fout, "CREATE TABLE IF NOT EXISTS test_backend_ops (\n"); + for (size_t i = 0; i < fields.size(); i++) { + fprintf(fout, " %s %s%s\n", fields[i].c_str(), get_sql_field_type(fields[i]).c_str(), + i < fields.size() - 1 ? "," : ""); + } + fprintf(fout, ");\n\n"); + } + + void print_test_result(const test_result & result) override { + fprintf(fout, "INSERT INTO test_backend_ops ("); + std::vector fields = test_result::get_fields(); + for (size_t i = 0; i < fields.size(); i++) { + fprintf(fout, "%s%s", fields[i].c_str(), i < fields.size() - 1 ? ", " : ""); + } + fprintf(fout, ") VALUES ("); + std::vector values = result.get_values(); + for (size_t i = 0; i < values.size(); i++) { + fprintf(fout, "'%s'%s", values[i].c_str(), i < values.size() - 1 ? ", " : ""); + } + fprintf(fout, ");\n"); + } +}; + +static std::unique_ptr create_printer(output_formats format) { + switch (format) { + case CONSOLE: + return std::make_unique(); + case SQL: + return std::make_unique(); + } + GGML_ABORT("invalid output format"); +} + struct test_case { virtual ~test_case() {} @@ -434,7 +968,7 @@ struct test_case { return t; } - bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) { + bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name, printer * output_printer) { mode = MODE_TEST; ggml_init_params params = { @@ -451,29 +985,33 @@ struct test_case { add_sentinel(ctx); ggml_tensor * out = build_graph(ctx); - - if (op_name != nullptr && op_desc(out) != op_name) { + std::string current_op_name = op_desc(out); + if (op_name != nullptr && current_op_name != op_name) { //printf(" %s: skipping\n", op_desc(out).c_str()); ggml_free(ctx); return true; } - printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str()); - fflush(stdout); - // check if the backends support the ops bool supported = true; for (ggml_backend_t backend : {backend1, backend2}) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { if (!ggml_backend_supports_op(backend, t)) { - printf("not supported [%s] ", ggml_backend_name(backend)); supported = false; break; } } } + if (!supported) { - printf("\n"); + // Create test result for unsupported operation + test_result result(ggml_backend_name(backend1), current_op_name, vars(), "test", + false, false, "not supported"); + + if (output_printer) { + output_printer->print_test_result(result); + } + ggml_free(ctx); return true; } @@ -578,24 +1116,24 @@ struct test_case { const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, run_whole_graph() ? out : nullptr); - if (!cmp_ok) { - printf("compare failed "); - } - ggml_backend_buffer_free(buf); ggml_free(ctx); - if (ud.ok && cmp_ok) { - printf("\033[1;32mOK\033[0m\n"); - return true; + // Create test result + bool test_passed = ud.ok && cmp_ok; + std::string error_msg = test_passed ? "" : (!cmp_ok ? "compare failed" : "test failed"); + test_result result(ggml_backend_name(backend1), current_op_name, vars(), "test", supported, test_passed, + error_msg); + + if (output_printer) { + output_printer->print_test_result(result); } - printf("\033[1;31mFAIL\033[0m\n"); - return false; + return test_passed; } - bool eval_perf(ggml_backend_t backend, const char * op_name) { + bool eval_perf(ggml_backend_t backend, const char * op_name, printer * output_printer) { mode = MODE_PERF; static const size_t graph_nodes = 8192; @@ -608,30 +1146,26 @@ struct test_case { ggml_context_ptr ctx(ggml_init(params)); // smart ptr GGML_ASSERT(ctx); - ggml_tensor * out = build_graph(ctx.get()); - - if (op_name != nullptr && op_desc(out) != op_name) { + ggml_tensor * out = build_graph(ctx.get()); + std::string current_op_name = op_desc(out); + if (op_name != nullptr && current_op_name != op_name) { //printf(" %s: skipping\n", op_desc(out).c_str()); return true; } - int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str()); - fflush(stdout); - // check if backends support op if (!ggml_backend_supports_op(backend, out)) { - printf("not supported\n"); + // Create test result for unsupported performance test + test_result result(ggml_backend_name(backend), current_op_name, vars(), "perf", false, false, + "not supported"); + + if (output_printer) { + output_printer->print_test_result(result); + } + return true; } - // align while also leaving some margin for variations in parameters - int align = 8; - int last = (len + align - 1) / align * align; - if (last - len < 5) { - last += align; - } - printf("%*s", last - len, ""); - // allocate ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr @@ -715,40 +1249,24 @@ struct test_case { total_runs += n_runs; } while (total_time_us < 1000*1000); // run for at least 1 second - printf(" %8d runs - %8.2f us/run - ", - total_runs, - (double)total_time_us / total_runs); + // Create test result + double avg_time_us = (double) total_time_us / total_runs; + double calculated_flops = (op_flops(out) > 0) ? (op_flops(out) * total_runs) / (total_time_us / 1e6) : 0.0; + double calculated_bandwidth = + (op_flops(out) == 0) ? total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0 : 0.0; + size_t calculated_memory_kb = op_size(out) / 1024; - if (op_flops(out) > 0) { - double flops_per_sec = (op_flops(out) * total_runs) / (total_time_us / 1e6); - auto format_flops = [](double flops) -> std::string { - char buf[256]; - if (flops >= 1e12) { - snprintf(buf, sizeof(buf), "%6.2f TFLOP", flops / 1e12); - } else if (flops >= 1e9) { - snprintf(buf, sizeof(buf), "%6.2f GFLOP", flops / 1e9); - } else if (flops >= 1e6) { - snprintf(buf, sizeof(buf), "%6.2f MFLOP", flops / 1e6); - } else { - snprintf(buf, sizeof(buf), "%6.2f KFLOP", flops / 1e3); - } - return buf; - }; - printf("%s/run - \033[1;34m%sS\033[0m", - format_flops(op_flops(out)).c_str(), - format_flops(flops_per_sec).c_str()); + test_result result(ggml_backend_name(backend), current_op_name, vars(), "perf", true, true, "", avg_time_us, + calculated_flops, calculated_bandwidth, calculated_memory_kb, total_runs); - } else { - printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m", - op_size(out) / 1024, - total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0); + if (output_printer) { + output_printer->print_test_result(result); } - printf("\n"); return true; } - bool eval_grad(ggml_backend_t backend, const char * op_name) { + bool eval_grad(ggml_backend_t backend, const char * op_name, printer * output_printer) { mode = MODE_GRAD; const std::vector expect = grad_expect(); @@ -766,42 +1284,47 @@ struct test_case { ggml_tensor * out = build_graph(ctx.get()); if ((op_name != nullptr && op_desc(out) != op_name) || out->op == GGML_OP_OPT_STEP_ADAMW) { - //printf(" %s: skipping\n", op_desc(out).c_str()); return true; } - printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str()); - fflush(stdout); - if (out->type != GGML_TYPE_F32) { - printf("not supported [%s->type != FP32]\n", out->name); + output_printer->print_operation(test_operation_info(op_desc(out), vars(), ggml_backend_name(backend), + test_status_t::NOT_SUPPORTED, + out->name + std::string("->type != FP32"))); return true; } + // Print operation info first + output_printer->print_operation(test_operation_info(op_desc(out), vars(), ggml_backend_name(backend))); + // check if the backend supports the ops - bool supported = true; - bool any_params = false; + bool supported = true; + bool any_params = false; + std::string failure_reason; + for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { if (!ggml_backend_supports_op(backend, t)) { - printf("not supported [%s] ", ggml_backend_name(backend)); - supported = false; + supported = false; + failure_reason = ggml_backend_name(backend); break; } if ((t->flags & GGML_TENSOR_FLAG_PARAM)) { any_params = true; if (t->type != GGML_TYPE_F32) { - printf("not supported [%s->type != FP32] ", t->name); - supported = false; + supported = false; + failure_reason = std::string(t->name) + "->type != FP32"; break; } } } if (!any_params) { - printf("not supported [%s] \n", op_desc(out).c_str()); - supported = false; + supported = false; + failure_reason = op_desc(out); } + if (!supported) { - printf("\n"); + output_printer->print_operation(test_operation_info(op_desc(out), vars(), ggml_backend_name(backend), + test_status_t::NOT_SUPPORTED, failure_reason)); return true; } @@ -812,7 +1335,9 @@ struct test_case { } } if (ngrads > grad_nmax()) { - printf("skipping large tensors for speed \n"); + test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend)); + info.set_large_tensor_skip(); + output_printer->print_operation(info); return true; } @@ -835,25 +1360,30 @@ struct test_case { for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { if (!ggml_backend_supports_op(backend, t)) { - printf("not supported [%s] ", ggml_backend_name(backend)); + output_printer->print_operation(test_operation_info(op_desc(out), vars(), ggml_backend_name(backend), + test_status_t::NOT_SUPPORTED, + ggml_backend_name(backend))); supported = false; break; } if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) { - printf("not supported [%s->type != FP32] ", t->name); + output_printer->print_operation(test_operation_info(op_desc(out), vars(), ggml_backend_name(backend), + test_status_t::NOT_SUPPORTED, + std::string(t->name) + "->type != FP32")); supported = false; break; } } if (!supported) { - printf("\n"); return true; } // allocate ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr if (buf == NULL) { - printf("failed to allocate tensors [%s] ", ggml_backend_name(backend)); + test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend)); + info.set_error("allocation", ""); + output_printer->print_operation(info); return false; } @@ -891,7 +1421,9 @@ struct test_case { for (int64_t i = 0; i < ne; ++i) { // gradient algebraic // check for nans if (!std::isfinite(ga[i])) { - printf("[%s] nonfinite gradient at index %" PRId64 " (%s=%f) ", ggml_op_desc(t), i, bn, ga[i]); + test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend)); + info.set_gradient_info(i, bn, ga[i]); + output_printer->print_operation(info); ok = false; break; } @@ -959,7 +1491,9 @@ struct test_case { const double err = mean_abs_asymm(gn.data(), ga.data(), gn.size(), expect); if (err > max_maa_err()) { - printf("[%s] MAA = %.9f > %.9f ", ggml_op_desc(t), err, max_maa_err()); + test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend)); + info.set_maa_error(err, max_maa_err()); + output_printer->print_operation(info); ok = false; break; } @@ -968,16 +1502,18 @@ struct test_case { } } + // Create final test result + test_operation_info final_info(op_desc(out), vars(), ggml_backend_name(backend)); if (!ok) { - printf("compare failed "); + final_info.set_compare_failure(); } + final_info.status = ok ? test_status_t::OK : test_status_t::FAIL; + output_printer->print_operation(final_info); if (ok) { - printf("\033[1;32mOK\033[0m\n"); return true; } - printf("\033[1;31mFAIL\033[0m\n"); return false; } }; @@ -4989,7 +5525,8 @@ static std::vector> make_test_cases_perf() { return test_cases; } -static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name, const char * params_filter) { +static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name, const char * params_filter, + printer * output_printer) { auto filter_test_cases = [](std::vector> & test_cases, const char * params_filter) { if (params_filter == nullptr) { return; @@ -5012,17 +5549,19 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op filter_test_cases(test_cases, params_filter); ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); if (backend_cpu == NULL) { - printf(" Failed to initialize CPU backend\n"); + test_operation_info info("", "", "CPU"); + info.set_error("backend", "Failed to initialize CPU backend"); + output_printer->print_operation(info); return false; } size_t n_ok = 0; for (auto & test : test_cases) { - if (test->eval(backend, backend_cpu, op_name)) { + if (test->eval(backend, backend_cpu, op_name, output_printer)) { n_ok++; } } - printf(" %zu/%zu tests passed\n", n_ok, test_cases.size()); + output_printer->print_summary(test_summary_info(n_ok, test_cases.size(), false)); ggml_backend_free(backend_cpu); @@ -5034,11 +5573,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op filter_test_cases(test_cases, params_filter); size_t n_ok = 0; for (auto & test : test_cases) { - if (test->eval_grad(backend, op_name)) { + if (test->eval_grad(backend, op_name, output_printer)) { n_ok++; } } - printf(" %zu/%zu tests passed\n", n_ok, test_cases.size()); + output_printer->print_summary(test_summary_info(n_ok, test_cases.size(), false)); return n_ok == test_cases.size(); } @@ -5047,7 +5586,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op auto test_cases = make_test_cases_perf(); filter_test_cases(test_cases, params_filter); for (auto & test : test_cases) { - test->eval_perf(backend, op_name); + test->eval_perf(backend, op_name, output_printer); } return true; } @@ -5056,16 +5595,18 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } static void usage(char ** argv) { - printf("Usage: %s [mode] [-o ] [-b ] [-p ]\n", argv[0]); + printf("Usage: %s [mode] [-o ] [-b ] [-p ] [--output ]\n", argv[0]); printf(" valid modes:\n"); printf(" - test (default, compare with CPU backend for correctness)\n"); printf(" - grad (compare gradients from backpropagation with method of finite differences)\n"); printf(" - perf (performance evaluation)\n"); printf(" op names for -o are as given by ggml_op_desc() (e.g. ADD, MUL_MAT, etc)\n"); + printf(" --output specifies output format (default: console)\n"); } int main(int argc, char ** argv) { test_mode mode = MODE_TEST; + output_formats output_format = CONSOLE; const char * op_name_filter = nullptr; const char * backend_filter = nullptr; const char * params_filter = nullptr; @@ -5098,6 +5639,16 @@ int main(int argc, char ** argv) { usage(argv); return 1; } + } else if (strcmp(argv[i], "--output") == 0) { + if (i + 1 < argc) { + if (!output_format_from_str(argv[++i], output_format)) { + usage(argv); + return 1; + } + } else { + usage(argv); + return 1; + } } else { usage(argv); return 1; @@ -5107,23 +5658,29 @@ int main(int argc, char ** argv) { // load and enumerate backends ggml_backend_load_all(); - printf("Testing %zu devices\n\n", ggml_backend_dev_count()); + // Create printer for output format + std::unique_ptr output_printer = create_printer(output_format); + if (output_printer) { + output_printer->print_header(); + } + + output_printer->print_testing_start(testing_start_info(ggml_backend_dev_count())); size_t n_ok = 0; for (size_t i = 0; i < ggml_backend_dev_count(); i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); - printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), ggml_backend_dev_name(dev)); - if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) { - printf(" Skipping\n"); + output_printer->print_backend_init( + backend_init_info(i, ggml_backend_dev_count(), ggml_backend_dev_name(dev), true, "Skipping")); n_ok++; continue; } if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) { - printf(" Skipping CPU backend\n"); + output_printer->print_backend_init(backend_init_info( + i, ggml_backend_dev_count(), ggml_backend_dev_name(dev), true, "Skipping CPU backend")); n_ok++; continue; } @@ -5138,36 +5695,35 @@ int main(int argc, char ** argv) { ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency()); } - printf(" Device description: %s\n", ggml_backend_dev_description(dev)); - size_t free, total; // NOLINT + size_t free, total; // NOLINT ggml_backend_dev_memory(dev, &free, &total); - printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); - printf("\n"); + output_printer->print_backend_init(backend_init_info(i, ggml_backend_dev_count(), ggml_backend_dev_name(dev), + false, "", ggml_backend_dev_description(dev), + total / 1024 / 1024, free / 1024 / 1024, true)); - bool ok = test_backend(backend, mode, op_name_filter, params_filter); + bool ok = test_backend(backend, mode, op_name_filter, params_filter, output_printer.get()); - printf(" Backend %s: ", ggml_backend_name(backend)); if (ok) { - printf("\033[1;32mOK\033[0m\n"); n_ok++; - } else { - printf("\033[1;31mFAIL\033[0m\n"); } - - printf("\n"); + output_printer->print_backend_status( + backend_status_info(ggml_backend_name(backend), ok ? test_status_t::OK : test_status_t::FAIL)); ggml_backend_free(backend); } ggml_quantize_free(); - printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count()); + if (output_printer) { + output_printer->print_footer(); + } + + output_printer->print_overall_summary( + overall_summary_info(n_ok, ggml_backend_dev_count(), n_ok == ggml_backend_dev_count())); if (n_ok != ggml_backend_dev_count()) { - printf("\033[1;31mFAIL\033[0m\n"); return 1; } - printf("\033[1;32mOK\033[0m\n"); return 0; } From bac8bed248d15419137c5bc7f834582397baaebc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 5 Jul 2025 07:18:09 +0300 Subject: [PATCH 2/7] eval-callback : check for empty input (#14539) --- examples/eval-callback/eval-callback.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index bbbec6a01..4afd80eb4 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -136,6 +136,11 @@ static bool run(llama_context * ctx, const common_params & params) { std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); + if (tokens.empty()) { + LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__); + return false; + } + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { LOG_ERR("%s : failed to eval\n", __func__); return false; From 668168814601d2aef6161ce49ab186bc74177ae7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 5 Jul 2025 08:24:56 +0200 Subject: [PATCH 3/7] opencl: add GELU_ERF (#14476) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 48 ++++++++++++++++++++++++++++ ggml/src/ggml-opencl/kernels/gelu.cl | 27 ++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 970dd3f67..a9fc03903 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -398,6 +398,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_scale; cl_kernel kernel_silu, kernel_silu_4; cl_kernel kernel_gelu, kernel_gelu_4; + cl_kernel kernel_gelu_erf, kernel_gelu_erf_4; cl_kernel kernel_gelu_quick, kernel_gelu_quick_4; cl_kernel kernel_relu; cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16; @@ -736,6 +737,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err)); CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err)); + CL_CHECK((backend_ctx->kernel_gelu_erf = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf", &err), err)); + CL_CHECK((backend_ctx->kernel_gelu_erf_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf_4", &err), err)); CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err)); CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err)); GGML_LOG_CONT("."); @@ -2266,6 +2269,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; case GGML_UNARY_OP_SIGMOID: @@ -3870,6 +3874,44 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } +static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + UNUSED(src1); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + cl_kernel kernel; + + int n = ggml_nelements(dst); + + if (n % 4 == 0) { + kernel = backend_ctx->kernel_gelu_erf_4; + n /= 4; + } else { + kernel = backend_ctx->kernel_gelu_erf; + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); + + size_t global_work_size[] = {(size_t)n, 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} + static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -6388,6 +6430,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_gelu; break; + case GGML_UNARY_OP_GELU_ERF: + if (!any_on_device) { + return false; + } + func = ggml_cl_gelu_erf; + break; case GGML_UNARY_OP_GELU_QUICK: if (!any_on_device) { return false; diff --git a/ggml/src/ggml-opencl/kernels/gelu.cl b/ggml/src/ggml-opencl/kernels/gelu.cl index 71c310cc9..1ab426c77 100644 --- a/ggml/src/ggml-opencl/kernels/gelu.cl +++ b/ggml/src/ggml-opencl/kernels/gelu.cl @@ -6,6 +6,7 @@ #define GELU_COEF_A 0.044715f #define GELU_QUICK_COEF -1.702f #define SQRT_2_OVER_PI 0.79788456080286535587989211986876f +#define SQRT_2_INV 0.70710678118654752440084436210484f kernel void kernel_gelu( global float * src0, @@ -35,6 +36,32 @@ kernel void kernel_gelu_4( dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); } +kernel void kernel_gelu_erf( + global float * src0, + ulong offset0, + global float * dst, + ulong offsetd +) { + src0 = (global float*)((global char*)src0 + offset0); + dst = (global float*)((global char*)dst + offsetd); + + float x = src0[get_global_id(0)]; + dst[get_global_id(0)] = 0.5f*x*(1.0f + erf(x*SQRT_2_INV)); +} + +kernel void kernel_gelu_erf_4( + global float4 * src0, + ulong offset0, + global float4 * dst, + ulong offsetd +) { + src0 = (global float4*)((global char*)src0 + offset0); + dst = (global float4*)((global char*)dst + offsetd); + + float4 x = src0[get_global_id(0)]; + dst[get_global_id(0)] = 0.5f*x*(1.0f + erf(x*SQRT_2_INV)); +} + kernel void kernel_gelu_quick( global float * src0, ulong offset0, From ddef99522d1ba74193b7394e803fab8db5c78bae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 5 Jul 2025 09:17:14 +0200 Subject: [PATCH 4/7] server : fix assistant prefilling when content is an array (#14360) --- .../server/tests/unit/test_chat_completion.py | 23 +++++++++++++++++++ tools/server/utils.hpp | 8 ++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 1b5205f79..7ee9a1651 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -132,6 +132,28 @@ def test_chat_template(): assert res.body["__verbose"]["prompt"] == " <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +@pytest.mark.parametrize("prefill,re_prefill", [ + ("Whill", "Whill"), + ([{"type": "text", "text": "Wh"}, {"type": "text", "text": "ill"}], "Whill"), +]) +def test_chat_template_assistant_prefill(prefill, re_prefill): + global server + server.chat_template = "llama3" + server.debug = True # to get the "__verbose" object in the response + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": 8, + "messages": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + {"role": "assistant", "content": prefill}, + ] + }) + assert res.status_code == 200 + assert "__verbose" in res.body + assert res.body["__verbose"]["prompt"] == f" <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}" + + def test_apply_chat_template(): global server server.chat_template = "command-r" @@ -228,6 +250,7 @@ def test_completion_with_grammar(jinja: bool, grammar: str, n_predicted: int, re [{"role": "system", "content": 123}], # [{"content": "hello"}], # TODO: should not be a valid case [{"role": "system", "content": "test"}, {}], + [{"role": "user", "content": "test"}, {"role": "assistant", "content": "test"}, {"role": "assistant", "content": "test"}], ]) def test_invalid_chat_completion_req(messages): global server diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 2ef9a1645..6c2e91359 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -792,7 +792,13 @@ static json oaicompat_chat_params_parse( /* Append assistant prefilled message */ if (prefill_assistant_message) { - chat_params.prompt += last_message.content; + if (!last_message.content_parts.empty()) { + for (auto & p : last_message.content_parts) { + chat_params.prompt += p.text; + } + } else { + chat_params.prompt += last_message.content; + } } llama_params["chat_format"] = static_cast(chat_params.format); From a0374a67e2924f2e845cdc59dd67d9a44065a89c Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 5 Jul 2025 02:26:04 -0500 Subject: [PATCH 5/7] vulkan: Handle updated FA dim2/3 definition (#14518) * vulkan: Handle updated FA dim2/3 definition Pack mask boolean and n_head_log2 into a single dword to keep the push constant block under the 128B limit. * handle null mask for gqa * allow gqa with dim3>1 --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 19 ++++++++----------- .../vulkan-shaders/flash_attn.comp | 6 +++--- .../vulkan-shaders/flash_attn_base.comp | 13 +++++++++---- .../vulkan-shaders/flash_attn_cm1.comp | 6 +++--- .../vulkan-shaders/flash_attn_cm2.comp | 6 +++--- 5 files changed, 26 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 22a34a433..e8df00d41 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -636,6 +636,7 @@ struct vk_flash_attn_push_constants { uint32_t nev3; uint32_t nem1; uint32_t nem2; + uint32_t nem3; uint32_t nb01; uint32_t nb02; @@ -651,8 +652,7 @@ struct vk_flash_attn_push_constants { float max_bias; float logit_softcap; - uint32_t mask; - uint32_t n_head_log2; + uint32_t mask_n_head_log2; float m0; float m1; @@ -6111,6 +6111,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const uint32_t nem1 = mask ? mask->ne[1] : 0; const uint32_t nem2 = mask ? mask->ne[2] : 0; + const uint32_t nem3 = mask ? mask->ne[3] : 0; const uint32_t HSK = nek0; const uint32_t HSV = nev0; @@ -6178,7 +6179,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx } if (N == 1 && qk_ratio > 1 && qk_ratio <= max_gqa && - qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) { + qk_ratio * nek2 == neq2 && nek2 == nev2 && nem2 <= 1) { // grouped query attention - make the N dimension equal to gqa_ratio, reduce // workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1 // and change addressing calculations to index Q's dimension 2. @@ -6348,17 +6349,19 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx } } + uint32_t mask_n_head_log2 = ((mask != nullptr) << 16) | n_head_log2; + const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, - nem1, nem2, + nem1, nem2, nem3, q_stride, (uint32_t)nbq2, (uint32_t)nbq3, k_stride, (uint32_t)nbk2, (uint32_t)nbk3, v_stride, (uint32_t)nbv2, (uint32_t)nbv3, scale, max_bias, logit_softcap, - mask != nullptr, n_head_log2, m0, m1, + mask_n_head_log2, m0, m1, gqa_ratio, split_kv, split_k }; ggml_vk_sync_buffers(subctx); @@ -10303,12 +10306,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) { return false; } - // TODO: support broadcast - // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14449, but - // the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505 - if (op->src[0]->ne[3] != 1 || (op->src[3] && op->src[3]->ne[2] != 1)) { - return false; - } // It's straightforward to support different K/V dequant, but would // significantly increase the number of pipelines if (op->src[1]->type != op->src[2]->type) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 788a5e065..45c6e7736 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -101,8 +101,8 @@ void main() { uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2; #endif uint32_t m_offset = 0; - if (p.nem2 != 1) { - m_offset = (iq3 % p.nem2) * p.nem1 * KV; + if (p.nem2 != 1 || p.nem3 != 1) { + m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV; } [[dont_unroll]] @@ -149,7 +149,7 @@ void main() { } } - if (p.mask != 0) { + if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) { uint32_t c = (idx + tid) % Bc; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp index 6609f0bad..7defe72b4 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp @@ -25,6 +25,7 @@ layout (push_constant) uniform parameter { uint32_t nev3; uint32_t nem1; uint32_t nem2; + uint32_t nem3; uint32_t nb01; uint32_t nb02; @@ -40,8 +41,7 @@ layout (push_constant) uniform parameter { float max_bias; float logit_softcap; - uint32_t mask; - uint32_t n_head_log2; + uint32_t mask_n_head_log2; float m0; float m1; @@ -50,6 +50,9 @@ layout (push_constant) uniform parameter { uint32_t k_num; } p; +#define MASK_ENABLE_BIT (1<<16) +#define N_LOG2_MASK 0xFFFF + layout (binding = 4) writeonly buffer O {D_TYPE data_o[];}; #if defined(A_TYPE_PACKED16) @@ -100,8 +103,10 @@ ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const i { const uint32_t h = iq2 + (r % p.gqa_ratio); - const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); - const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); + uint32_t n_head_log2 = p.mask_n_head_log2 & N_LOG2_MASK; + + const ACC_TYPE base = ACC_TYPE(h < n_head_log2 ? p.m0 : p.m1); + const int exph = int(h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1); return ACC_TYPE(pow(base, ACC_TYPE(exph))); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index e74e2fa93..486735fe8 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -126,8 +126,8 @@ void main() { uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2; #endif uint32_t m_offset = 0; - if (p.nem2 != 1) { - m_offset = (iq3 % p.nem2) * p.nem1 * KV; + if (p.nem2 != 1 || p.nem3 != 1) { + m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV; } [[dont_unroll]] @@ -182,7 +182,7 @@ void main() { barrier(); } - if (p.mask != 0) { + if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) { uint32_t c = (idx + tid) % Bc; uint32_t r = (idx + tid) / Bc; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index 8792d5195..274f48fca 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -131,8 +131,8 @@ void main() { } uint32_t m_offset = 0; - if (p.nem2 != 1) { - m_offset = (iq3 % p.nem2) * p.nem1 * KV * 2 /*sizeof(float16_t)*/; + if (p.nem2 != 1 || p.nem3 != 1) { + m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV * 2 /*sizeof(float16_t)*/; } [[dont_unroll]] @@ -153,7 +153,7 @@ void main() { } } - if (p.mask != 0) { + if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp); tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV); tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1); From e592be15756ae546ba87bb5a5250b71248121971 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 6 Jul 2025 03:08:16 -0500 Subject: [PATCH 6/7] vulkan: fix rms_norm+mul fusion (#14545) The fused operation was grabbing the epsilon value from the wrong place. Add an env var to disable fusion. Add some missing checks for supported shapes/types. Handle fused rms_norm+mul in check_results. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 106 ++++++++++++++++++++++----- tests/test-backend-ops.cpp | 6 +- 2 files changed, 88 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index e8df00d41..7b8faf178 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -501,6 +501,8 @@ struct vk_device_struct { ggml_backend_buffer_type buffer_type; + bool disable_fusion; + #ifdef GGML_VULKAN_MEMORY_DEBUG std::unique_ptr memory_logger; #endif @@ -1091,8 +1093,8 @@ static size_t vk_skip_checks; static size_t vk_output_tensor; static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); -static void ggml_vk_check_results_0(ggml_tensor * tensor); -static void ggml_vk_check_results_1(ggml_tensor * tensor); +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx); +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx); #endif typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); @@ -3507,6 +3509,8 @@ static vk_device ggml_vk_get_device(size_t idx) { device->idx = idx; + device->disable_fusion = getenv("GGML_VK_DISABLE_FUSION") != nullptr; + return device; } @@ -7654,8 +7658,7 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun); } -static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { - float * op_params = (float *)dst->op_params; +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, float * op_params, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -8885,7 +8888,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } } -static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready); +static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready); // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. @@ -9146,9 +9149,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr // fused rms_norm + mul ggml_tensor *mul = cgraph->nodes[node_idx + 1]; ggml_tensor *other_src = mul->src[0] == node ? mul->src[1] : mul->src[0]; - ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, dryrun); + ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, (float *)node->op_params, dryrun); } else { - ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, dryrun); + ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, (float *)node->op_params, dryrun); } break; case GGML_OP_RMS_NORM_BACK: @@ -9308,7 +9311,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr ctx->compute_ctx.reset(); - bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready); + bool ok = ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, false, almost_ready); if (!ok) { if (node->op == GGML_OP_UNARY) { std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << std::endl; @@ -9323,7 +9326,8 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr return true; } -static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) { +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) { + GGML_UNUSED(cgraph); ggml_backend_buffer * buf = nullptr; switch (tensor->op) { @@ -9433,7 +9437,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * // Only run if ctx hasn't been submitted yet if (!subctx->seqs.empty()) { #ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(tensor); + ggml_vk_check_results_0(ctx, cgraph, tensor_idx); use_fence = true; #endif @@ -9453,7 +9457,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * ggml_vk_wait_for_fence(ctx); } #ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_1(tensor); + ggml_vk_check_results_1(ctx, cgraph, tensor_idx); #endif } @@ -9900,6 +9904,37 @@ static bool ggml_vk_is_empty(ggml_tensor * node) { return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; } +static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { + if (!ggml_can_fuse(cgraph, node_idx, ops)) { + return false; + } + + if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) { + // additional constraints specific to this fusion + const ggml_tensor *rms_norm = cgraph->nodes[node_idx]; + const ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + + GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(rms_norm->type == GGML_TYPE_F32); + // rms_norm only supports f32 + if (mul->src[0]->type != GGML_TYPE_F32 || + mul->src[1]->type != GGML_TYPE_F32 || + mul->type != GGML_TYPE_F32) { + return false; + } + // if rms_norm is the B operand, then we don't handle broadcast + if (rms_norm == mul->src[1] && + mul->src[0]->ne[1] != rms_norm->ne[1]) { + return false; + } + // rms_norm shader assumes contiguous rows + if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) { + return false; + } + } + return true; +} + static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; @@ -9913,7 +9948,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg uint64_t total_mat_mul_bytes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - if (ggml_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + if (!ctx->device->disable_fusion && ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { ctx->num_additional_fused_ops = 1; } ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false); @@ -9983,7 +10018,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); } - if (ggml_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + if (!ctx->device->disable_fusion && ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { ctx->num_additional_fused_ops = 1; } @@ -10760,11 +10795,21 @@ void * comp_result; size_t comp_size; size_t comp_nb[GGML_MAX_DIMS]; size_t check_counter = 0; -static void ggml_vk_check_results_0(ggml_tensor * tensor) { +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) { + ggml_tensor * tensor = cgraph->nodes[tensor_idx]; if (tensor->op == GGML_OP_TRANSPOSE) { return; } + bool fused_rms_norm_mul = false; + int rms_norm_idx = -1; + if (ctx->num_additional_fused_ops == 1 && + tensor->op == GGML_OP_RMS_NORM && + cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) { + fused_rms_norm_mul = true; + tensor = cgraph->nodes[tensor_idx + 1]; + } + check_counter++; if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { return; @@ -10792,6 +10837,15 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { for (int i = 0; i < 6; i++) { ggml_tensor * srci = tensor->src[i]; + if (fused_rms_norm_mul) { + rms_norm_idx = tensor->src[0]->op == GGML_OP_RMS_NORM ? 0 : 1; + ggml_tensor *rms_norm = tensor->src[rms_norm_idx]; + switch (i) { + case 0: srci = rms_norm->src[0]; break; + case 1: srci = tensor->src[1 - rms_norm_idx]; break; + default: continue; + } + } if (srci == nullptr) { continue; } @@ -10849,7 +10903,12 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { } else if (tensor->op == GGML_OP_SUB) { tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]); } else if (tensor->op == GGML_OP_MUL) { - tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]); + if (fused_rms_norm_mul) { + tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->src[rms_norm_idx]->op_params); + tensor_clone = ggml_mul(ggml_ctx, tensor_clone, src_clone[1 - rms_norm_idx]); + } else { + tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]); + } } else if (tensor->op == GGML_OP_DIV) { tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]); } else if (tensor->op == GGML_OP_CONCAT) { @@ -11040,10 +11099,10 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { GGML_ABORT("fatal error"); } - ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); - ggml_build_forward_expand(cgraph, tensor_clone); + ggml_cgraph * cgraph_cpu = ggml_new_graph(ggml_ctx); + ggml_build_forward_expand(cgraph_cpu, tensor_clone); - ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8); + ggml_graph_compute_with_ctx(ggml_ctx, cgraph_cpu, 8); if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { ggml_vk_print_tensor(tensor_clone, "tensor_clone"); @@ -11066,10 +11125,19 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")"); } -static void ggml_vk_check_results_1(ggml_tensor * tensor) { +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) { + ggml_tensor * tensor = cgraph->nodes[tensor_idx]; if (tensor->op == GGML_OP_TRANSPOSE) { return; } + bool fused_rms_norm_mul = false; + if (ctx->num_additional_fused_ops == 1 && + tensor->op == GGML_OP_RMS_NORM && + cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) { + fused_rms_norm_mul = true; + tensor = cgraph->nodes[tensor_idx + 1]; + } + if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { return; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 0dc9c09e2..652856a35 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2583,10 +2583,6 @@ struct test_rms_norm_mul : public test_case { } } - double max_nmse_err() override { - return 1e-6; - } - float grad_eps() override { return 1.0f; } @@ -5058,7 +5054,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps)); } - for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) { + for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) { test_cases.emplace_back(new test_rms_norm_mul(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); } From 6491d6e4f1caf0ad2221865b4249ae6938a6308c Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Sun, 6 Jul 2025 10:29:36 +0000 Subject: [PATCH 7/7] vulkan: increase LOAD_VEC_A to 8 (IQ1/IQ2) or 4 (IQ3) (#14485) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit taken from remyoudompheng's PR https://github.com/ggml-org/llama.cpp/pull/12260 Co-authored-by: Rémy Oudompheng --- .../ggml-vulkan/vulkan-shaders/mul_mm.comp | 144 ++++++++++-------- .../vulkan-shaders/vulkan-shaders-gen.cpp | 4 +- 2 files changed, 79 insertions(+), 69 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 26163b167..888ce79f6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -500,10 +500,9 @@ void main() { const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint ib8 = (idx % 128) / 4; - const int i8 = 2 * int(idx % 4); + const uint ib = idx / 32; // 8 values per idx + const uint ib32 = (idx % 32) / 4; // 0..7 + const uint ib8 = idx % 32; const float d = float(data_a[ib].d); const uint qh = data_a[ib].qh[ib32]; @@ -512,22 +511,16 @@ void main() { const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA; const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]); - const ivec2 gvec = ivec2( - bitfieldExtract(grid, 2 * (i8), 2), - bitfieldExtract(grid, 2 * (i8 + 1), 2) - ); - const vec2 v = dl * (vec2(gvec) + delta); - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + [[unroll]] for (int k = 0; k < 8; ++k) { + buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta)); + } #elif defined(DATA_A_IQ1_M) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint ib8 = (idx % 128) / 4; + const uint ib = idx / 32; // 8 values per idx + const uint ib8 = idx % 32; const uint ib16 = ib8 / 2; - const int i8 = 2 * int(idx % 4); const uint16_t[4] scales = data_a[ib].scales; const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12; @@ -538,21 +531,17 @@ void main() { const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1); const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA; const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]); - const ivec2 gvec = ivec2( - bitfieldExtract(grid, 2 * (i8), 2), - bitfieldExtract(grid, 2 * (i8 + 1), 2) - ); - const vec2 v = dl * (vec2(gvec) + delta); - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + [[unroll]] for (int k = 0; k < 8; ++k) { + buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta)); + } #elif defined(DATA_A_IQ2_XXS) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint ib8 = (idx / 4) % 4; + const uint ib = idx / 32; // 8 values per idx + const uint ib32 = (idx % 32) / 4; // 0..7 + const uint ib8 = idx % 4; const float d = float(data_a[ib].d); const uint qs = data_a[ib].qs[8 * ib32 + ib8]; @@ -562,63 +551,81 @@ void main() { data_a[ib].qs[8*ib32 + 6], data_a[ib].qs[8*ib32 + 7] )); - const float db = d * 0.25 * (0.5 + (signs >> 28)); + const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + (signs >> 28))); const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7); - const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint grid = iq2xxs_grid[qs][(idx % 4) / 2] >> (16 * (idx & 1)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 + const uint sign = sign7 | (bitCount(sign7) << 7); + const uvec2 grid = iq2xxs_grid[qs]; + const vec4 grid0 = vec4(unpack8(grid.x)); + const vec4 grid1 = vec4(unpack8(grid.y)); - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); + buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); + buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); + buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); + buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); + buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); + buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); + buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w); #elif defined(DATA_A_IQ2_XS) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint ib8 = (idx / 4) % 4; // 0..3 + const uint ib = idx / 32; // 8 values per idx + const uint ib32 = (idx % 32) / 4; // 0..7 + const uint ib8 = idx % 4; // 0..3 const float d = float(data_a[ib].d); const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; - const float db = d * 0.25 * (0.5 + scale); + const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale)); const uint qs = data_a[ib].qs[4 * ib32 + ib8]; const uint sign7 = qs >> 9; - const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint grid = iq2xs_grid[qs & 511][(idx % 4) / 2] >> (16 * (idx & 1)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 + const uint sign = sign7 | (bitCount(sign7) << 7); + const uvec2 grid = iq2xs_grid[qs & 511]; + const vec4 grid0 = vec4(unpack8(grid.x)); + const vec4 grid1 = vec4(unpack8(grid.y)); - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); + buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); + buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); + buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); + buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); + buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); + buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); + buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w); #elif defined(DATA_A_IQ2_S) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint ib8 = (idx % 128) / 4; // 0..31 - const uint ib32 = ib8 / 4; // 0..7 + const uint ib = idx / 32; // 8 values per idx + const uint ib8 = idx % 32; // 0..31 + const uint ib32 = ib8 / 4; // 0..7 const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; const uint qs = data_a[ib].qs[ib8]; const uint qh = data_a[ib].qh[ib32]; const uint qhshift = 2 * (ib8 % 4); - const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8] >> (2 * (idx % 4)); + const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8]; const float d = float(data_a[ib].d); - const float db = d * 0.25 * (0.5 + scale); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint16_t grid = unpack16(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 2) >> 1])[idx & 1]; - const vec2 v = db * vec2(sign01) * vec2(unpack8(uint32_t(grid)).xy); // vec4 used due to #12147 + const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale)); + const uvec2 grid = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)]; + const vec4 grid0 = vec4(unpack8(grid.x)); + const vec4 grid1 = vec4(unpack8(grid.y)); - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + buf_a[buf_idx ] = db * FLOAT_TYPE((sign & 1) != 0 ? -grid0.x : grid0.x); + buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign & 2) != 0 ? -grid0.y : grid0.y); + buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign & 4) != 0 ? -grid0.z : grid0.z); + buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign & 8) != 0 ? -grid0.w : grid0.w); + buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign & 16) != 0 ? -grid1.x : grid1.x); + buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign & 32) != 0 ? -grid1.y : grid1.y); + buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign & 64) != 0 ? -grid1.z : grid1.z); + buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w); #elif defined(DATA_A_IQ3_XXS) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint iqs = (idx % 128) / 2; // 0..63 + const uint ib = idx / 64; // 4 values per idx + const uint iqs = idx % 64; // 0..63 const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values const float d = float(data_a[ib].d); @@ -631,33 +638,36 @@ void main() { )); const float db = d * 0.5 * (0.5 + (signs >> 28)); const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7); - const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint grid = iq3xxs_grid[qs] >> (16 * (idx & 1)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 + const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (4 * (idx % 2)); + const uint grid = iq3xxs_grid[qs]; + const vec4 v = db * vec4(unpack8(grid)); - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x); + buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y); + buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z); + buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w); #elif defined(DATA_A_IQ3_S) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - const uint ib = idx / 128; // 2 values per idx - const uint iqs = (idx % 128) / 2; // 0..63 + const uint ib = idx / 64; // 4 values per idx + const uint iqs = idx % 64; // 0..63 const uint iqh = iqs / 8; const float d = float(data_a[ib].d); const uint qs = data_a[ib].qs[iqs]; const uint qh = data_a[ib].qh[iqh]; - const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (2 * (idx % 4))); + const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (4 * (idx % 2))); const uint scale = data_a[ib].scales[iqs / 16]; const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign))); const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf)); - const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 + const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)]; + const vec4 v = db * vec4(unpack8(grid)); - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); + buf_a[buf_idx ] = FLOAT_TYPE((sign & 1) != 0 ? -v.x : v.x); + buf_a[buf_idx + 1] = FLOAT_TYPE((sign & 2) != 0 ? -v.y : v.y); + buf_a[buf_idx + 2] = FLOAT_TYPE((sign & 4) != 0 ? -v.z : v.z); + buf_a[buf_idx + 3] = FLOAT_TYPE((sign & 8) != 0 ? -v.w : v.w); #elif defined(DATA_A_IQ4_XS) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 2698522ed..30f78fabb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -360,9 +360,9 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool for (const auto& tname : type_names) { std::string load_vec_quant = "2"; - if ((tname == "q4_0") || (tname == "q4_1")) + if ((tname == "q4_0") || (tname == "q4_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s")) load_vec_quant = "8"; - else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq4_nl")) + else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl")) load_vec_quant = "4"; if (tname == "bf16") {