diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 3af7aff70..48ded82e8 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -39,7 +39,7 @@ #include "ggml-hexagon.h" #include "ggml-impl.h" #include "ggml-quants.h" -#include "op-desc.h" +#include "htp-opnode.h" #include "htp-ops.h" #include "htp_iface.h" #include "htp-drv.h" @@ -102,23 +102,23 @@ static const char * status_to_str(uint32_t status) { // ** debug helpers -static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) { +static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) { if (!opt_verbose) return; - op_desc desc(op); + htp_opformat fmt(node); GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(), - ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags); + node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags); } static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) { if (!opt_verbose) return; - op_desc desc(op); + htp_opformat fmt(htp_opformat(htp_opnode{const_cast(op), {}, HTP_OP_INVALID})); GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(), - ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no"); + ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no"); } -static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op, +static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node, uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) { if (!opt_profile) return; @@ -129,15 +129,16 @@ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_t pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]); } - op_desc desc(op); + htp_opformat fmt(node); GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(), - ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str); + node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str); } // ** backend sessions struct ggml_hexagon_opbatch; struct ggml_hexagon_opqueue; +struct htp_opnode; struct ggml_hexagon_session { std::string name; @@ -167,7 +168,7 @@ struct ggml_hexagon_session { void allocate(int dev_id) noexcept(false); void release() noexcept(true); - void enqueue_op(htp_op_code opcode, const ggml_tensor *op); + void enqueue_op(const htp_opnode & node); void flush(bool all = true); void flush_pending(bool all = false); @@ -1782,12 +1783,10 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf /* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host, }; -// Backend session implementation - struct ggml_hexagon_opbatch { ggml_hexagon_session* sess; - std::vector ops; // pointers to original ops + std::vector ops; // htp_opnode of ops std::vector h_bufs; // htp buffer descriptors std::vector h_tens; // htp tensor descriptors @@ -1919,7 +1918,7 @@ struct ggml_hexagon_opbatch { return ti; } - bool fit_op(const struct ggml_tensor *t) const { + bool fit_op(const htp_opnode & node) const { if (n_ops >= n_ops_max ) return false; // check how much extras we will need @@ -1939,10 +1938,10 @@ struct ggml_hexagon_opbatch { } }; - for (unsigned int i=0; i < HTP_OP_MAX_INPUTS && t->src[i]; i++) { - fit_tensor(t->src[i]); + for (const auto * src : node.get_inputs()) { + fit_tensor(src); } - fit_tensor(t); + fit_tensor(node.dst()); if ((extra_bufs + n_bufs) > n_bufs_max) return false; if ((extra_tens + n_tens) > n_tens_max) return false; @@ -1952,29 +1951,30 @@ struct ggml_hexagon_opbatch { } // assumes that fit_op() was called first and returned true - void add_op(htp_op_code opcode, const struct ggml_tensor * t) { + void add_op(const htp_opnode & node) { // Add new op unsigned int n = n_ops++; GGML_ASSERT(n_ops <= n_ops_max); - ops[n] = t; + ops[n] = node; htp_op_desc &o = h_ops[n]; - memcpy(&o.params, &t->op_params, sizeof(t->op_params)); - o.opcode = opcode; + memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params)); + o.opcode = node.opcode; o.flags = 0; if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) { o.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } - ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags); + ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags); + auto inputs = node.get_inputs(); for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) { - o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff; + o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff; } - o.dst = add_tensor(t); + o.dst = add_tensor(node.dst()); } }; @@ -1983,7 +1983,7 @@ struct ggml_hexagon_opqueue { ggml_hexagon_shared_buffer *shm_buf; size_t shm_blk_size; - using opvec = std::vector; + using opvec = std::vector; std::queue done; // completed batch ids std::vector op_cache; // per batch op cache @@ -2182,11 +2182,11 @@ void ggml_hexagon_session::flush_batch() { } } -void ggml_hexagon_session::enqueue_op(htp_op_code opcode, const ggml_tensor *op) { - if (!op_batch->fit_op(op)) { +void ggml_hexagon_session::enqueue_op(const htp_opnode & node) { + if (!op_batch->fit_op(node)) { flush_batch(); } - op_batch->add_op(opcode, op); + op_batch->add_op(node); } // Flush HTP response queue i.e wait for all outstanding requests to complete @@ -3179,10 +3179,43 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes); + std::vector nodes; + nodes.reserve(graph->n_nodes); + + // Fusion for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * n = graph->nodes[i]; - if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) { - sess->enqueue_op(op_remap_to_htp(n), n); + if (!op_is_compute(n)) { + continue; + } + + ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr; + + htp_opnode node = { + /*.node =*/ n, + /*.fused =*/ {}, + /*.opcode =*/ HTP_OP_INVALID + }; + + if (n->op == GGML_OP_RMS_NORM && next_node) { + if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + node.add_fused(next_node); + node.opcode = HTP_OP_RMS_NORM_MUL; + i++; // skip the fused MUL node + } + } + + if (node.opcode == HTP_OP_INVALID) { + node.opcode = op_remap_to_htp(n); + } + + nodes.push_back(std::move(node)); + } + + // Queue and execute + if (opt_opstage & HTP_OPSTAGE_QUEUE) { + for (const auto & node : nodes) { + sess->enqueue_op(node); } } @@ -3201,51 +3234,7 @@ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) { sess->flush(); } -struct node_info { - ggml_tensor * node; - - std::vector fused; - - ggml_op op() const { - return node->op; - } - - const ggml_tensor * dst() const { - return fused.empty() ? node : fused.back(); - } - - const ggml_tensor * src0() const { - return node->src[0]; - } - - const ggml_tensor * src1() const { - return node->src[1]; - } - - bool is_empty() const { - return ggml_op_is_empty(node->op); - } - - void add_fused(ggml_tensor * t) { - fused.push_back(t); - } - - bool stackable() const { - switch (this->op()) { - case GGML_OP_MUL_MAT: - case GGML_OP_MUL_MAT_ID: - return ggml_is_quantized(this->src0()->type); - default: - return false; - } - } - - bool same_input(const node_info& n) const { - return n.src1() == this->src1(); - } -}; - -static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector & nodes) { +static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector & nodes) { const int n = nodes.size(); std::vector res; @@ -3299,14 +3288,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr enum ggml_op ops[MAX_FUSE]; - std::vector nodes; + std::vector nodes; nodes.reserve(gf->n_nodes); // fuse nodes: // we don't want to make reorders that break fusing, so we first pack all fusable tensors // and perform the reorder over the fused nodes. after the reorder is done, we unfuse for (int i = 0; i < n; i++) { - node_info node = { + htp_opnode node = { /*.node =*/gf->nodes[i], /*.fused =*/{}, }; diff --git a/ggml/src/ggml-hexagon/htp-opnode.h b/ggml/src/ggml-hexagon/htp-opnode.h new file mode 100644 index 000000000..14b232240 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp-opnode.h @@ -0,0 +1,241 @@ +#ifndef HTP_OPNODE_H +#define HTP_OPNODE_H + +#define GGML_COMMON_IMPL_CPP +#include "ggml-backend-impl.h" +#include "ggml-common.h" + +#include +#include +#include +#include "htp-ops.h" + +struct htp_opnode { + ggml_tensor * node = nullptr; + + std::vector fused; + + htp_op_code opcode = HTP_OP_INVALID; + + ggml_op op() const { + return node->op; + } + + const ggml_tensor * dst() const { + return fused.empty() ? node : fused.back(); + } + + const ggml_tensor * src0() const { + return node->src[0]; + } + + const ggml_tensor * src1() const { + return node->src[1]; + } + + bool is_empty() const { + return ggml_op_is_empty(node->op); + } + + void add_fused(ggml_tensor * t) { + fused.push_back(t); + } + + bool stackable() const { + switch (this->op()) { + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + return ggml_is_quantized(this->src0()->type); + default: + return false; + } + } + + bool same_input(const htp_opnode& n) const { + return n.src1() == this->src1(); + } + + std::vector get_inputs() const { + std::vector inputs; + std::vector outputs; + outputs.push_back(node); + for (const auto * f : fused) { + outputs.push_back(f); + } + + auto contains = [&](const std::vector & vec, const ggml_tensor * t) { + for (const auto * x : vec) { + if (x == t) return true; + } + return false; + }; + + auto add_input = [&](const ggml_tensor * t) { + if (t && !contains(outputs, t) && !contains(inputs, t)) { + inputs.push_back(t); + } + }; + + for (int i = 0; i < GGML_MAX_SRC && node->src[i]; i++) { + add_input(node->src[i]); + } + for (const auto * f : fused) { + for (int i = 0; i < GGML_MAX_SRC && f->src[i]; i++) { + add_input(f->src[i]); + } + } + return inputs; + } + + std::string op_name() const { + if (fused.empty()) { + return ggml_op_desc(node); + } + std::string name = ggml_op_desc(node); + for (const auto * f : fused) { + name += "+"; + name += ggml_op_desc(f); + } + return name; + } +}; + +struct htp_opformat { + char strides[64 * GGML_MAX_SRC]; + char dims[64 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + int format_tensor_dims(char * str, const struct ggml_tensor * t) { + if (t->ne[2] == 1 && t->ne[3] == 1) { + return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]); + } else { + return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + } + } + + void format_op_dims(char * str, const htp_opnode & node) { + char * p = str; + auto inputs = node.get_inputs(); + + if (!inputs.empty()) { + p += format_tensor_dims(p, inputs[0]); + + for (size_t i = 1; i < inputs.size(); i++) { + p += sprintf(p, " x "); + p += format_tensor_dims(p, inputs[i]); + } + + p += sprintf(p, " -> "); + } + + char self[64]; + format_tensor_dims(self, node.dst()); + p += sprintf(p, "%s", self); + } + + int format_tensor_strides(char * str, const struct ggml_tensor * t) { + const char * c = ggml_is_contiguous(t) ? "" : "!"; + + if (t->ne[2] == 1 && t->ne[3] == 1) { + return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c); + } else { + return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c); + } + } + + void format_op_strides(char * str, const htp_opnode & node) { + char * p = str; + auto inputs = node.get_inputs(); + + if (!inputs.empty()) { + p += format_tensor_strides(p, inputs[0]); + + for (size_t i = 1; i < inputs.size(); i++) { + p += sprintf(p, " x "); + p += format_tensor_strides(p, inputs[i]); + } + + p += sprintf(p, " -> "); + } + + char self[64]; + format_tensor_strides(self, node.dst()); + p += sprintf(p, "%s", self); + } + + void format_op_types(char * str, const htp_opnode & node) { + char * p = str; + auto inputs = node.get_inputs(); + + if (!inputs.empty()) { + p += sprintf(p, "%s", ggml_type_name(inputs[0]->type)); + + for (size_t i = 1; i < inputs.size(); i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", ggml_type_name(inputs[i]->type)); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", ggml_type_name(node.dst()->type)); + } + + const char * tensor_buff_name(const struct ggml_tensor * t) { + if (t->buffer) { + return ggml_backend_buffer_name(t->buffer); + } + return "NONE"; + } + + void format_op_buffs(char * str, const htp_opnode & node) { + char * p = str; + auto inputs = node.get_inputs(); + + if (!inputs.empty()) { + p += sprintf(p, "%s", tensor_buff_name(inputs[0])); + + for (size_t i = 1; i < inputs.size(); i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", tensor_buff_name(inputs[i])); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", tensor_buff_name(node.dst())); + } + + void format_op_names(char * str, const htp_opnode & node) { + char * p = str; + auto inputs = node.get_inputs(); + + if (!inputs.empty()) { + p += sprintf(p, "%s", inputs[0]->name); + + for (size_t i = 1; i < inputs.size(); i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", inputs[i]->name); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", node.dst()->name); + } + + void format(const htp_opnode & node) { + format_op_dims(dims, node); + format_op_strides(strides, node); + format_op_types(types, node); + format_op_buffs(buffs, node); + format_op_names(names, node); + } + + htp_opformat() {} + htp_opformat(const htp_opnode & node) { format(node); } +}; + +#endif // HTP_OPNODE_H diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index aadc77235..fa85bf4ca 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -58,6 +58,7 @@ enum htp_op_code { HTP_OP_MUL_MAT, HTP_OP_MUL_MAT_ID, HTP_OP_RMS_NORM, + HTP_OP_RMS_NORM_MUL, HTP_OP_UNARY_SILU, HTP_OP_UNARY_GELU, HTP_OP_UNARY_SIGMOID, diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 7dd90ac7d..623008be4 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -537,6 +537,7 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_NORM: case HTP_OP_RMS_NORM: + case HTP_OP_RMS_NORM_MUL: case HTP_OP_SCALE: case HTP_OP_SQR: case HTP_OP_SQRT: diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index 7d0431d8b..770a66732 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -23,21 +23,26 @@ struct htp_unary_context { // Precomputed values const uint8_t * data_src0; + const uint8_t * data_src1; // weight/scale tensor for RMS_NORM_MUL uint8_t * data_dst; size_t src0_data_row_size; // actual data bytes per row + size_t src1_data_row_size; size_t dst_data_row_size; // actual data bytes per row size_t src0_row_size_aligned; + size_t src1_row_size_aligned; size_t dst_row_size_aligned; size_t src0_spad_half_size; + size_t src1_spad_half_size; size_t dst_spad_half_size; uint32_t block; uint32_t src0_nrows; uint32_t src0_nrows_per_thread; uint32_t nc; + bool broadcast_weight; }; // Convert flat row index to DDR byte offset using the tensor's actual strides. @@ -158,6 +163,71 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, } } +static void hvx_fast_rms_norm_mul_f32(const uint8_t * restrict src, + const uint8_t * restrict weight, + uint8_t * restrict dst, + const int num_elems, + float epsilon) { + const HVX_Vector * restrict v_src = (const HVX_Vector *) src; + const HVX_Vector * restrict v_weight = (const HVX_Vector *) weight; + HVX_Vector * restrict v_dst = (HVX_Vector *) dst; + + const int nvec = num_elems / VLEN_FP32; // number of full vectors + const int nloe = num_elems % VLEN_FP32; // leftover elements + + // Compute sum of squares for full vectors + HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000); + HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon); + + #pragma unroll(4) + for (int i = 0; i < nvec; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); + } + + // Reduce HVX sum + sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v)); + + HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems); + HVX_Vector denom_v = hvx_vec_inverse_f32(t_v); + HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v); + HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v); + + // Scale and multiply + HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_epsilon_v)); + + #pragma unroll(4) + for (int i = 0; i < nvec; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v); + HVX_Vector v3 = Q6_Vsf_equals_Vqf32(v2); + HVX_Vector result = Q6_Vqf32_vmpy_VsfVsf(v3, v_weight[i]); + v_dst[i] = Q6_Vsf_equals_Vqf32(result); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v); + HVX_Vector v3 = Q6_Vsf_equals_Vqf32(v2); + HVX_Vector result = Q6_Vqf32_vmpy_VsfVsf(v3, v_weight[nvec]); + HVX_Vector res_v = Q6_Vsf_equals_Vqf32(result); + + // Store with masking to avoid overwriting memory beyond the tensor + hvx_vec_store_a(&v_dst[nvec], nloe * 4, res_v); + } +} + static void hvx_fast_norm_f32(const uint8_t * restrict src, uint8_t * restrict dst, uint8_t * restrict pad, @@ -269,6 +339,27 @@ static void rms_norm_f32(const float * restrict src, } } +static void rms_norm_mul_f32(const float * restrict src, + const float * restrict weight, + float * restrict dst, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + const size_t weight_row_size, + int32_t * op_params, + bool broadcast_weight) { + float epsilon = 0.f; + memcpy(&epsilon, op_params, sizeof(float)); + + for (uint32_t ir = 0; ir < num_rows; ir++) { + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + const uint8_t * restrict w_local = (const uint8_t *)weight + (broadcast_weight ? 0 : ir * weight_row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); + + hvx_fast_rms_norm_mul_f32(src_local, w_local, dst_local, row_elems, epsilon); + } +} + static void norm_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad, @@ -598,12 +689,15 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * t1 = HAP_perf_get_qtimer_count(); const uint8_t * restrict data_src = uctx->data_src0; + const uint8_t * restrict data_src1 = uctx->data_src1; uint8_t * restrict data_dst = uctx->data_dst; uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); + uint8_t * src1_spad_data = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread); uint8_t * dst_spad_data = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread); size_t src0_spad_half_size = uctx->src0_spad_half_size; + size_t src1_spad_half_size = uctx->src1_spad_half_size; size_t dst_spad_half_size = uctx->dst_spad_half_size; // Non-contiguous tensors have gaps at dim-2/3 boundaries that a single-stride @@ -624,6 +718,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * dma_queue * dma_queue = octx->ctx->dma[ith]; + // If weight is broadcasted, load it once per thread at the beginning of execution + if (htp_op == HTP_OP_RMS_NORM_MUL && uctx->broadcast_weight) { + dma_queue_push(dma_queue, dma_make_ptr(src1_spad_data, data_src1), uctx->src1_row_size_aligned, 0, uctx->src1_data_row_size, 1); + dma_queue_flush(dma_queue); + } + for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; spad_idx++) { const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1); @@ -636,6 +736,14 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * dma_queue_push(dma_queue, dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + src0_off), src0_row_size_aligned, nb01, src0_data_row_size, block_size); + + if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) { + const size_t src1_off = unary_row_offset(ir, ne01, ne02, nb01, nb02, nb03); + dma_queue_push(dma_queue, + dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + src1_off), + uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, block_size); + } + ir += block_size; } @@ -644,6 +752,10 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * float * dst_spad = (float *) dma_queue_pop(dma_queue).src; float * src0_spad = (float *) dma_queue_pop(dma_queue).dst; + float * src1_spad = NULL; + if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) { + src1_spad = (float *) dma_queue_pop(dma_queue).dst; + } // Process block in VTCM switch (htp_op) { @@ -653,6 +765,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * case HTP_OP_RMS_NORM: rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); break; + case HTP_OP_RMS_NORM_MUL: + { + const float * w_ptr = uctx->broadcast_weight ? (const float *) src1_spad_data : src1_spad; + rms_norm_mul_f32(src0_spad, w_ptr, dst_spad, block_size, ne0, src0_row_size_aligned, uctx->src1_row_size_aligned, op_params, uctx->broadcast_weight); + } + break; case HTP_OP_SCALE: scale_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); break; @@ -700,9 +818,16 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * if (pref_ir < src0_end_row) { const uint32_t pref_block_size = unary_block_size(pref_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1); const size_t src0_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03); - dma_queue_push(dma_queue, - dma_make_ptr(src0_spad, data_src + src0_pref_off), - src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size); + dma_queue_push(dma_queue, + dma_make_ptr(src0_spad, data_src + src0_pref_off), + src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size); + + if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) { + const size_t src1_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03); + dma_queue_push(dma_queue, + dma_make_ptr(src1_spad, data_src1 + src1_pref_off), + uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, pref_block_size); + } } } ir += block_size; @@ -732,6 +857,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { case HTP_OP_RMS_NORM: op_type = "rmsnorm-f32"; break; + case HTP_OP_RMS_NORM_MUL: + op_type = "rmsnorm-mul-f32"; + break; case HTP_OP_SCALE: op_type = "scale-f32"; break; @@ -777,12 +905,44 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { const size_t src0_row_size_aligned = hex_round_up(src0_data_row_size, VLEN); const size_t dst_row_size_aligned = hex_round_up(dst_data_row_size, VLEN); + size_t src1_data_row_size = 0; + size_t src1_row_size_aligned = 0; + bool broadcast_weight = false; + const struct htp_tensor * src1 = NULL; + + if (octx->op == HTP_OP_RMS_NORM_MUL) { + src1 = octx->src[1]; + src1_data_row_size = src1->ne[0] * sizeof(float); + src1_row_size_aligned = hex_round_up(src1_data_row_size, VLEN); + broadcast_weight = (src1->ne[1] * src1->ne[2] * src1->ne[3] == 1); + } + // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size // Double buffering requires 2x size per buffer - size_t spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned); - size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row); + size_t spad_size_per_row = 0; + size_t vtcm_row_per_thread = 0; + + if (octx->op == HTP_OP_RMS_NORM_MUL) { + if (broadcast_weight) { + size_t available_vtcm = octx->ctx->vtcm_size; + size_t src1_spad_total = n_threads * src1_row_size_aligned; + if (available_vtcm > src1_spad_total) { + available_vtcm -= src1_spad_total; + } else { + available_vtcm = 0; + } + spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned); + vtcm_row_per_thread = available_vtcm / (n_threads * spad_size_per_row); + } else { + spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned + src1_row_size_aligned); + vtcm_row_per_thread = (octx->ctx->vtcm_size) / (n_threads * spad_size_per_row); + } + } else { + spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned); + vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row); + } // Make sure the reserved vtcm size is sufficient if (vtcm_row_per_thread == 0) { @@ -797,8 +957,25 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread; + if (octx->op == HTP_OP_RMS_NORM_MUL) { + if (broadcast_weight) { + octx->src1_spad.size_per_thread = src1_row_size_aligned; + } else { + octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread * 2; + } + octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread; + } else { + octx->src1_spad.size = 0; + octx->src1_spad.size_per_thread = 0; + } + octx->src0_spad.data = octx->ctx->vtcm_base; - octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + if (octx->op == HTP_OP_RMS_NORM_MUL) { + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + } else { + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + } FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], @@ -811,19 +988,24 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { .src0_nrows = src0_nrows, .data_src0 = (const uint8_t *)src0->data, + .data_src1 = (octx->op == HTP_OP_RMS_NORM_MUL) ? (const uint8_t *)src1->data : NULL, .data_dst = (uint8_t *)dst->data, .src0_data_row_size = src0_data_row_size, + .src1_data_row_size = src1_data_row_size, .dst_data_row_size = dst_data_row_size, .src0_row_size_aligned = src0_row_size_aligned, + .src1_row_size_aligned = src1_row_size_aligned, .dst_row_size_aligned = dst_row_size_aligned, .src0_spad_half_size = octx->src0_spad.size_per_thread / 2, + .src1_spad_half_size = (octx->op == HTP_OP_RMS_NORM_MUL) ? (octx->src1_spad.size_per_thread / (broadcast_weight ? 1 : 2)) : 0, .dst_spad_half_size = octx->dst_spad.size_per_thread / 2, .block = (octx->src0_spad.size_per_thread / 2) / src0_row_size_aligned, .nc = src0->ne[0], + .broadcast_weight = broadcast_weight, }; worker_pool_run_func(octx->ctx->worker_pool, unary_job_f32_per_thread, &uctx, n_threads); diff --git a/ggml/src/ggml-hexagon/op-desc.h b/ggml/src/ggml-hexagon/op-desc.h deleted file mode 100644 index a1e8ddd8b..000000000 --- a/ggml/src/ggml-hexagon/op-desc.h +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef OP_DESC_H -#define OP_DESC_H - -#define GGML_COMMON_IMPL_CPP -#include "ggml-backend-impl.h" -#include "ggml-common.h" - -#include -#include - -struct op_desc { - char strides[64 * GGML_MAX_SRC]; - char dims[64 * GGML_MAX_SRC]; - char types[16 * GGML_MAX_SRC]; - char buffs[64 * GGML_MAX_SRC]; - char names[64 * GGML_MAX_SRC]; - - int format_tensor_dims(char * str, const struct ggml_tensor * t) { - if (t->ne[2] == 1 && t->ne[3] == 1) { - return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]); - } else { - return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); - } - } - - void format_op_dims(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += format_tensor_dims(p, t->src[0]); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += format_tensor_dims(p, t->src[i]); - } - - p += sprintf(p, " -> "); - } - - // format self dims separately for better visual alignment - char self[64]; - format_tensor_dims(self, t); - - p += sprintf(p, "%s", self); - } - - int format_tensor_strides(char * str, const struct ggml_tensor * t) { - const char * c = ggml_is_contiguous(t) ? "" : "!"; - - if (t->ne[2] == 1 && t->ne[3] == 1) { - return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c); - } else { - return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c); - } - } - - void format_op_strides(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += format_tensor_strides(p, t->src[0]); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += format_tensor_strides(p, t->src[i]); - } - - p += sprintf(p, " -> "); - } - - // format self dims separately for better visual alignment - char self[64]; - format_tensor_strides(self, t); - - p += sprintf(p, "%s", self); - } - - void format_op_types(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += sprintf(p, "%s", ggml_type_name(t->src[0]->type)); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", ggml_type_name(t->src[i]->type)); - } - - p += sprintf(p, " -> "); - } - - p += sprintf(p, "%s", ggml_type_name(t->type)); - } - - const char * tensor_buff_name(const struct ggml_tensor * t) { - if (t->buffer) { - return ggml_backend_buffer_name(t->buffer); - } - return "NONE"; - } - - void format_op_buffs(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += sprintf(p, "%s", tensor_buff_name(t->src[0])); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", tensor_buff_name(t->src[i])); - } - - p += sprintf(p, " -> "); - } - - p += sprintf(p, "%s", tensor_buff_name(t)); - } - - void format_op_names(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += sprintf(p, "%s", t->src[0]->name); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", t->src[i]->name); - } - - p += sprintf(p, " -> "); - } - - p += sprintf(p, "%s", t->name); - } - - void format(const ggml_tensor * op) { - format_op_dims(dims, op); - format_op_strides(strides, op); - format_op_types(types, op); - format_op_buffs(buffs, op); - format_op_names(names, op); - } - - op_desc() {} - op_desc(const ggml_tensor * op) { format(op); } -}; - -#endif // OP_DESC_H diff --git a/scripts/snapdragon/ggml-hexagon-profile.py b/scripts/snapdragon/ggml-hexagon-profile.py index 3edaacd27..aa1f20dcc 100755 --- a/scripts/snapdragon/ggml-hexagon-profile.py +++ b/scripts/snapdragon/ggml-hexagon-profile.py @@ -24,7 +24,7 @@ COL_MAP = { } op_pattern = re.compile( - r"profile-op\s+(?P[A-Z_0-9]+):\s+.*?\s+:\s+(?P[\d:x\s\->!]+)\s+:\s+(?P[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P\d+)\s+cycles\s+(?P\d+)(?:\s+pmu\s+\[(?P[\d,\s]+)\])?" + r"profile-op\s+(?P[A-Z_0-9+]+):\s+.*?\s+:\s+(?P[\d:x\s\->!]+)\s+:\s+(?P[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P\d+)\s+cycles\s+(?P\d+)(?:\s+pmu\s+\[(?P[\d,\s]+)\])?" ) logger = logging.getLogger("ggml-hexagon-profile")