hexagon: basic/generic op fusion support and RMS_NORM+MUL fusion (#23835)

Update the infrastructure to enable op fusion, using RMS_NORM+MUL as the use case.
This commit is contained in:
Max Krasnyansky 2026-05-28 14:05:54 -07:00 committed by GitHub
parent 751ebd17a5
commit 19e92c33ef
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 498 additions and 237 deletions

View file

@ -39,7 +39,7 @@
#include "ggml-hexagon.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "op-desc.h"
#include "htp-opnode.h"
#include "htp-ops.h"
#include "htp_iface.h"
#include "htp-drv.h"
@ -102,23 +102,23 @@ static const char * status_to_str(uint32_t status) {
// ** debug helpers
static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) {
static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) {
if (!opt_verbose) return;
op_desc desc(op);
htp_opformat fmt(node);
GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags);
node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags);
}
static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
if (!opt_verbose) return;
op_desc desc(op);
htp_opformat fmt(htp_opformat(htp_opnode{const_cast<ggml_tensor*>(op), {}, HTP_OP_INVALID}));
GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no");
ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
}
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
if (!opt_profile) return;
@ -129,15 +129,16 @@ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_t
pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
}
op_desc desc(op);
htp_opformat fmt(node);
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
}
// ** backend sessions
struct ggml_hexagon_opbatch;
struct ggml_hexagon_opqueue;
struct htp_opnode;
struct ggml_hexagon_session {
std::string name;
@ -167,7 +168,7 @@ struct ggml_hexagon_session {
void allocate(int dev_id) noexcept(false);
void release() noexcept(true);
void enqueue_op(htp_op_code opcode, const ggml_tensor *op);
void enqueue_op(const htp_opnode & node);
void flush(bool all = true);
void flush_pending(bool all = false);
@ -1782,12 +1783,10 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
/* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host,
};
// Backend session implementation
struct ggml_hexagon_opbatch {
ggml_hexagon_session* sess;
std::vector<const ggml_tensor*> ops; // pointers to original ops
std::vector<htp_opnode> ops; // htp_opnode of ops
std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
std::vector<htp_tensor> h_tens; // htp tensor descriptors
@ -1919,7 +1918,7 @@ struct ggml_hexagon_opbatch {
return ti;
}
bool fit_op(const struct ggml_tensor *t) const {
bool fit_op(const htp_opnode & node) const {
if (n_ops >= n_ops_max ) return false;
// check how much extras we will need
@ -1939,10 +1938,10 @@ struct ggml_hexagon_opbatch {
}
};
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS && t->src[i]; i++) {
fit_tensor(t->src[i]);
for (const auto * src : node.get_inputs()) {
fit_tensor(src);
}
fit_tensor(t);
fit_tensor(node.dst());
if ((extra_bufs + n_bufs) > n_bufs_max) return false;
if ((extra_tens + n_tens) > n_tens_max) return false;
@ -1952,29 +1951,30 @@ struct ggml_hexagon_opbatch {
}
// assumes that fit_op() was called first and returned true
void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
void add_op(const htp_opnode & node) {
// Add new op
unsigned int n = n_ops++;
GGML_ASSERT(n_ops <= n_ops_max);
ops[n] = t;
ops[n] = node;
htp_op_desc &o = h_ops[n];
memcpy(&o.params, &t->op_params, sizeof(t->op_params));
o.opcode = opcode;
memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params));
o.opcode = node.opcode;
o.flags = 0;
if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
}
ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags);
auto inputs = node.get_inputs();
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff;
}
o.dst = add_tensor(t);
o.dst = add_tensor(node.dst());
}
};
@ -1983,7 +1983,7 @@ struct ggml_hexagon_opqueue {
ggml_hexagon_shared_buffer *shm_buf;
size_t shm_blk_size;
using opvec = std::vector<const ggml_tensor*>;
using opvec = std::vector<htp_opnode>;
std::queue<unsigned int> done; // completed batch ids
std::vector<opvec> op_cache; // per batch op cache
@ -2182,11 +2182,11 @@ void ggml_hexagon_session::flush_batch() {
}
}
void ggml_hexagon_session::enqueue_op(htp_op_code opcode, const ggml_tensor *op) {
if (!op_batch->fit_op(op)) {
void ggml_hexagon_session::enqueue_op(const htp_opnode & node) {
if (!op_batch->fit_op(node)) {
flush_batch();
}
op_batch->add_op(opcode, op);
op_batch->add_op(node);
}
// Flush HTP response queue i.e wait for all outstanding requests to complete
@ -3179,10 +3179,43 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);
std::vector<htp_opnode> nodes;
nodes.reserve(graph->n_nodes);
// Fusion
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * n = graph->nodes[i];
if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
sess->enqueue_op(op_remap_to_htp(n), n);
if (!op_is_compute(n)) {
continue;
}
ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr;
htp_opnode node = {
/*.node =*/ n,
/*.fused =*/ {},
/*.opcode =*/ HTP_OP_INVALID
};
if (n->op == GGML_OP_RMS_NORM && next_node) {
if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
node.add_fused(next_node);
node.opcode = HTP_OP_RMS_NORM_MUL;
i++; // skip the fused MUL node
}
}
if (node.opcode == HTP_OP_INVALID) {
node.opcode = op_remap_to_htp(n);
}
nodes.push_back(std::move(node));
}
// Queue and execute
if (opt_opstage & HTP_OPSTAGE_QUEUE) {
for (const auto & node : nodes) {
sess->enqueue_op(node);
}
}
@ -3201,51 +3234,7 @@ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
sess->flush();
}
struct node_info {
ggml_tensor * node;
std::vector<ggml_tensor *> fused;
ggml_op op() const {
return node->op;
}
const ggml_tensor * dst() const {
return fused.empty() ? node : fused.back();
}
const ggml_tensor * src0() const {
return node->src[0];
}
const ggml_tensor * src1() const {
return node->src[1];
}
bool is_empty() const {
return ggml_op_is_empty(node->op);
}
void add_fused(ggml_tensor * t) {
fused.push_back(t);
}
bool stackable() const {
switch (this->op()) {
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
return ggml_is_quantized(this->src0()->type);
default:
return false;
}
}
bool same_input(const node_info& n) const {
return n.src1() == this->src1();
}
};
static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<htp_opnode> & nodes) {
const int n = nodes.size();
std::vector<int> res;
@ -3299,14 +3288,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
enum ggml_op ops[MAX_FUSE];
std::vector<node_info> nodes;
std::vector<htp_opnode> nodes;
nodes.reserve(gf->n_nodes);
// fuse nodes:
// we don't want to make reorders that break fusing, so we first pack all fusable tensors
// and perform the reorder over the fused nodes. after the reorder is done, we unfuse
for (int i = 0; i < n; i++) {
node_info node = {
htp_opnode node = {
/*.node =*/gf->nodes[i],
/*.fused =*/{},
};

View file

@ -0,0 +1,241 @@
#ifndef HTP_OPNODE_H
#define HTP_OPNODE_H
#define GGML_COMMON_IMPL_CPP
#include "ggml-backend-impl.h"
#include "ggml-common.h"
#include <string>
#include <vector>
#include <stdio.h>
#include "htp-ops.h"
struct htp_opnode {
ggml_tensor * node = nullptr;
std::vector<ggml_tensor *> fused;
htp_op_code opcode = HTP_OP_INVALID;
ggml_op op() const {
return node->op;
}
const ggml_tensor * dst() const {
return fused.empty() ? node : fused.back();
}
const ggml_tensor * src0() const {
return node->src[0];
}
const ggml_tensor * src1() const {
return node->src[1];
}
bool is_empty() const {
return ggml_op_is_empty(node->op);
}
void add_fused(ggml_tensor * t) {
fused.push_back(t);
}
bool stackable() const {
switch (this->op()) {
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
return ggml_is_quantized(this->src0()->type);
default:
return false;
}
}
bool same_input(const htp_opnode& n) const {
return n.src1() == this->src1();
}
std::vector<const ggml_tensor *> get_inputs() const {
std::vector<const ggml_tensor *> inputs;
std::vector<const ggml_tensor *> outputs;
outputs.push_back(node);
for (const auto * f : fused) {
outputs.push_back(f);
}
auto contains = [&](const std::vector<const ggml_tensor *> & vec, const ggml_tensor * t) {
for (const auto * x : vec) {
if (x == t) return true;
}
return false;
};
auto add_input = [&](const ggml_tensor * t) {
if (t && !contains(outputs, t) && !contains(inputs, t)) {
inputs.push_back(t);
}
};
for (int i = 0; i < GGML_MAX_SRC && node->src[i]; i++) {
add_input(node->src[i]);
}
for (const auto * f : fused) {
for (int i = 0; i < GGML_MAX_SRC && f->src[i]; i++) {
add_input(f->src[i]);
}
}
return inputs;
}
std::string op_name() const {
if (fused.empty()) {
return ggml_op_desc(node);
}
std::string name = ggml_op_desc(node);
for (const auto * f : fused) {
name += "+";
name += ggml_op_desc(f);
}
return name;
}
};
// Printable description of an htp_opnode for debug and profile logs.
// Every field has the form "in0 x in1 x ... -> out"; a node without inputs prints only "out".
//
// All formatting is bounded by the destination size and truncates instead of overflowing:
// a fused node can expose more inputs than a single ggml op, so the fixed-size fields below
// are not guaranteed to be large enough for every node.
struct htp_opformat {
    char strides[64 * GGML_MAX_SRC];
    char dims[64 * GGML_MAX_SRC];
    char types[16 * GGML_MAX_SRC];
    char buffs[64 * GGML_MAX_SRC];
    char names[64 * GGML_MAX_SRC];

    // Converts an snprintf() result into the number of characters actually stored in a
    // buffer of `size` bytes (snprintf reports the untruncated length, or < 0 on error).
    static int stored(int n, size_t size) {
        if (n < 0 || size == 0) {
            return 0;
        }
        return (size_t) n < size ? n : (int) (size - 1);
    }

    // Copies `s` into str[0..size), truncating if needed; returns the characters stored.
    static int append(char * str, size_t size, const char * s) {
        return stored(snprintf(str, size, "%s", s), size);
    }

    // Shared layout for all fields: formats each input with `put`, separated by " x ",
    // then " -> " and the output tensor. `put(dst, dst_size, tensor)` must write at most
    // dst_size bytes (including the terminator) and return the characters stored.
    template <typename F>
    static void format_op(char * str, size_t size, const htp_opnode & node, F put) {
        size_t     pos    = 0;  // invariant: pos < size whenever size > 0
        const auto inputs = node.get_inputs();

        for (size_t i = 0; i < inputs.size(); i++) {
            if (i > 0) {
                pos += (size_t) append(str + pos, size - pos, " x ");
            }
            pos += (size_t) put(str + pos, size - pos, inputs[i]);
        }
        if (!inputs.empty()) {
            pos += (size_t) append(str + pos, size - pos, " -> ");
        }
        put(str + pos, size - pos, node.dst());
    }

    // Writes "ne0:ne1" (or all four dims when ne2/ne3 are not 1) into str.
    // size: capacity of str in bytes; returns the number of characters stored.
    int format_tensor_dims(char * str, const struct ggml_tensor * t, size_t size = 64) {
        int n;
        if (t->ne[2] == 1 && t->ne[3] == 1) {
            n = snprintf(str, size, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
        } else {
            n = snprintf(str, size, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
        }
        return stored(n, size);
    }

    // Dims of all inputs and the output of `node`; size is the capacity of str in bytes.
    void format_op_dims(char * str, const htp_opnode & node, size_t size = 64 * GGML_MAX_SRC) {
        format_op(str, size, node, [this](char * s, size_t n, const ggml_tensor * t) {
            return format_tensor_dims(s, t, n);
        });
    }

    // Writes the byte strides "nb0:nb1" (or all four) into str, followed by '!' when the
    // tensor is not contiguous. size: capacity of str; returns the characters stored.
    int format_tensor_strides(char * str, const struct ggml_tensor * t, size_t size = 64) {
        const char * c = ggml_is_contiguous(t) ? "" : "!";

        int n;
        if (t->ne[2] == 1 && t->ne[3] == 1) {
            n = snprintf(str, size, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
        } else {
            n = snprintf(str, size, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
        }
        return stored(n, size);
    }

    // Strides of all inputs and the output of `node`; size is the capacity of str in bytes.
    void format_op_strides(char * str, const htp_opnode & node, size_t size = 64 * GGML_MAX_SRC) {
        format_op(str, size, node, [this](char * s, size_t n, const ggml_tensor * t) {
            return format_tensor_strides(s, t, n);
        });
    }

    // Type names of all inputs and the output of `node`; size is the capacity of str in bytes.
    void format_op_types(char * str, const htp_opnode & node, size_t size = 16 * GGML_MAX_SRC) {
        format_op(str, size, node, [](char * s, size_t n, const ggml_tensor * t) {
            return append(s, n, ggml_type_name(t->type));
        });
    }

    // Name of the backend buffer holding the tensor, or "NONE" when it has no buffer.
    const char * tensor_buff_name(const struct ggml_tensor * t) {
        if (t->buffer) {
            return ggml_backend_buffer_name(t->buffer);
        }
        return "NONE";
    }

    // Buffer names of all inputs and the output of `node`; size is the capacity of str in bytes.
    void format_op_buffs(char * str, const htp_opnode & node, size_t size = 64 * GGML_MAX_SRC) {
        format_op(str, size, node, [this](char * s, size_t n, const ggml_tensor * t) {
            return append(s, n, tensor_buff_name(t));
        });
    }

    // Tensor names of all inputs and the output of `node`; size is the capacity of str in bytes.
    void format_op_names(char * str, const htp_opnode & node, size_t size = 64 * GGML_MAX_SRC) {
        format_op(str, size, node, [](char * s, size_t n, const ggml_tensor * t) {
            return append(s, n, t->name);
        });
    }

    // Fills every field for `node`, each bounded by its own capacity.
    void format(const htp_opnode & node) {
        format_op_dims(dims, node, sizeof(dims));
        format_op_strides(strides, node, sizeof(strides));
        format_op_types(types, node, sizeof(types));
        format_op_buffs(buffs, node, sizeof(buffs));
        format_op_names(names, node, sizeof(names));
    }

    // Leaves every field as a valid empty string until format() is called.
    htp_opformat() {
        strides[0] = dims[0] = types[0] = buffs[0] = names[0] = '\0';
    }

    htp_opformat(const htp_opnode & node) { format(node); }
};
#endif // HTP_OPNODE_H

View file

@ -58,6 +58,7 @@ enum htp_op_code {
HTP_OP_MUL_MAT,
HTP_OP_MUL_MAT_ID,
HTP_OP_RMS_NORM,
HTP_OP_RMS_NORM_MUL,
HTP_OP_UNARY_SILU,
HTP_OP_UNARY_GELU,
HTP_OP_UNARY_SIGMOID,

View file

@ -537,6 +537,7 @@ static int execute_op(struct htp_ops_context * octx) {
case HTP_OP_NORM:
case HTP_OP_RMS_NORM:
case HTP_OP_RMS_NORM_MUL:
case HTP_OP_SCALE:
case HTP_OP_SQR:
case HTP_OP_SQRT:

View file

@ -23,21 +23,26 @@ struct htp_unary_context {
// Precomputed values
const uint8_t * data_src0;
const uint8_t * data_src1; // weight/scale tensor for RMS_NORM_MUL
uint8_t * data_dst;
size_t src0_data_row_size; // actual data bytes per row
size_t src1_data_row_size;
size_t dst_data_row_size; // actual data bytes per row
size_t src0_row_size_aligned;
size_t src1_row_size_aligned;
size_t dst_row_size_aligned;
size_t src0_spad_half_size;
size_t src1_spad_half_size;
size_t dst_spad_half_size;
uint32_t block;
uint32_t src0_nrows;
uint32_t src0_nrows_per_thread;
uint32_t nc;
bool broadcast_weight;
};
// Convert flat row index to DDR byte offset using the tensor's actual strides.
@ -158,6 +163,71 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
}
}
// Fused RMS-norm + multiply over one row of f32 values using HVX vectors:
//   dst[i] = src[i] * rsqrt(sum(src[j]^2) / num_elems + epsilon) * weight[i]
//
// src       - input row
// weight    - per-element multiplier row
// dst       - output row
// num_elems - number of f32 elements in the row
// epsilon   - value added to the mean of squares before the reciprocal square root
//
// NOTE(review): src, weight and dst are indexed as HVX_Vector arrays, and the tail path loads
// a whole vector at index nvec from both src and weight. This presumably requires
// vector-aligned pointers and rows that are readable up to the next vector boundary -
// confirm against the callers.
static void hvx_fast_rms_norm_mul_f32(const uint8_t * restrict src,
const uint8_t * restrict weight,
uint8_t * restrict dst,
const int num_elems,
float epsilon) {
const HVX_Vector * restrict v_src = (const HVX_Vector *) src;
const HVX_Vector * restrict v_weight = (const HVX_Vector *) weight;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
const int nvec = num_elems / VLEN_FP32; // number of full vectors
const int nloe = num_elems % VLEN_FP32; // leftover elements
// Pass 1: accumulate the sum of squares (in qf32) over the full vectors
HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000);
HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon);
#pragma unroll(4)
for (int i = 0; i < nvec; i++) {
HVX_Vector v1 = v_src[i];
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
}
// Handle tail elements using vectorized ops with masking
// (the mask spans nloe * 4 bytes, i.e. the leftover f32 values; presumably the remaining
// lanes are cleared so they do not contribute to the sum)
if (nloe > 0) {
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
}
// Reduce HVX sum, then form mean = sum * (1 / num_elems) and add epsilon
sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems);
HVX_Vector denom_v = hvx_vec_inverse_f32(t_v);
HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v);
HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v);
// Pass 2: scale = rsqrt(mean + epsilon); dst = (src * scale) * weight
// Each product is converted from qf32 back to sf before it is used again.
HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_epsilon_v));
#pragma unroll(4)
for (int i = 0; i < nvec; i++) {
HVX_Vector v1 = v_src[i];
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
HVX_Vector v3 = Q6_Vsf_equals_Vqf32(v2);
HVX_Vector result = Q6_Vqf32_vmpy_VsfVsf(v3, v_weight[i]);
v_dst[i] = Q6_Vsf_equals_Vqf32(result);
}
// Handle tail elements using vectorized ops with masking
if (nloe > 0) {
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
HVX_Vector v3 = Q6_Vsf_equals_Vqf32(v2);
HVX_Vector result = Q6_Vqf32_vmpy_VsfVsf(v3, v_weight[nvec]);
HVX_Vector res_v = Q6_Vsf_equals_Vqf32(result);
// Store with masking to avoid overwriting memory beyond the tensor
// (only nloe * 4 bytes are passed to the store helper)
hvx_vec_store_a(&v_dst[nvec], nloe * 4, res_v);
}
}
static void hvx_fast_norm_f32(const uint8_t * restrict src,
uint8_t * restrict dst,
uint8_t * restrict pad,
@ -269,6 +339,27 @@ static void rms_norm_f32(const float * restrict src,
}
}
// Runs the fused RMS_NORM + MUL row kernel over num_rows consecutive rows.
//
// src / dst        - first input / output row; consecutive rows are row_size bytes apart
// weight           - first weight row; rows are weight_row_size bytes apart unless broadcast
// row_elems        - number of f32 elements per row
// op_params        - op parameters; the first sizeof(float) bytes are read as epsilon
// broadcast_weight - when true the same (first) weight row is used for every row
static void rms_norm_mul_f32(const float * restrict src,
                             const float * restrict weight,
                             float * restrict       dst,
                             const uint32_t         num_rows,
                             const uint32_t         row_elems,
                             const size_t           row_size,
                             const size_t           weight_row_size,
                             int32_t *              op_params,
                             bool                   broadcast_weight) {
    // epsilon is copied out bytewise rather than read through a cast pointer
    float eps = 0.f;
    memcpy(&eps, op_params, sizeof(float));

    const uint8_t * src_base = (const uint8_t *) src;
    const uint8_t * w_base   = (const uint8_t *) weight;
    uint8_t *       dst_base = (uint8_t *) dst;

    // a zero stride keeps every row pointing at the first weight row
    const size_t w_stride = broadcast_weight ? 0 : weight_row_size;

    for (uint32_t row = 0; row < num_rows; row++) {
        const size_t row_off = row * row_size;

        hvx_fast_rms_norm_mul_f32(src_base + row_off, w_base + (row * w_stride), dst_base + row_off, row_elems, eps);
    }
}
static void norm_f32(const float * restrict src,
float * restrict dst,
uint8_t * restrict spad,
@ -598,12 +689,15 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
t1 = HAP_perf_get_qtimer_count();
const uint8_t * restrict data_src = uctx->data_src0;
const uint8_t * restrict data_src1 = uctx->data_src1;
uint8_t * restrict data_dst = uctx->data_dst;
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * src1_spad_data = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
size_t src0_spad_half_size = uctx->src0_spad_half_size;
size_t src1_spad_half_size = uctx->src1_spad_half_size;
size_t dst_spad_half_size = uctx->dst_spad_half_size;
// Non-contiguous tensors have gaps at dim-2/3 boundaries that a single-stride
@ -624,6 +718,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
dma_queue * dma_queue = octx->ctx->dma[ith];
// If weight is broadcasted, load it once per thread at the beginning of execution
if (htp_op == HTP_OP_RMS_NORM_MUL && uctx->broadcast_weight) {
dma_queue_push(dma_queue, dma_make_ptr(src1_spad_data, data_src1), uctx->src1_row_size_aligned, 0, uctx->src1_data_row_size, 1);
dma_queue_flush(dma_queue);
}
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; spad_idx++) {
const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
@ -636,6 +736,14 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
dma_queue_push(dma_queue,
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + src0_off),
src0_row_size_aligned, nb01, src0_data_row_size, block_size);
if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
const size_t src1_off = unary_row_offset(ir, ne01, ne02, nb01, nb02, nb03);
dma_queue_push(dma_queue,
dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + src1_off),
uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, block_size);
}
ir += block_size;
}
@ -644,6 +752,10 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
float * dst_spad = (float *) dma_queue_pop(dma_queue).src;
float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
float * src1_spad = NULL;
if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
src1_spad = (float *) dma_queue_pop(dma_queue).dst;
}
// Process block in VTCM
switch (htp_op) {
@ -653,6 +765,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
case HTP_OP_RMS_NORM:
rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
break;
case HTP_OP_RMS_NORM_MUL:
{
const float * w_ptr = uctx->broadcast_weight ? (const float *) src1_spad_data : src1_spad;
rms_norm_mul_f32(src0_spad, w_ptr, dst_spad, block_size, ne0, src0_row_size_aligned, uctx->src1_row_size_aligned, op_params, uctx->broadcast_weight);
}
break;
case HTP_OP_SCALE:
scale_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
break;
@ -700,9 +818,16 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
if (pref_ir < src0_end_row) {
const uint32_t pref_block_size = unary_block_size(pref_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
const size_t src0_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);
dma_queue_push(dma_queue,
dma_make_ptr(src0_spad, data_src + src0_pref_off),
src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
dma_queue_push(dma_queue,
dma_make_ptr(src0_spad, data_src + src0_pref_off),
src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
const size_t src1_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);
dma_queue_push(dma_queue,
dma_make_ptr(src1_spad, data_src1 + src1_pref_off),
uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, pref_block_size);
}
}
}
ir += block_size;
@ -732,6 +857,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
case HTP_OP_RMS_NORM:
op_type = "rmsnorm-f32";
break;
case HTP_OP_RMS_NORM_MUL:
op_type = "rmsnorm-mul-f32";
break;
case HTP_OP_SCALE:
op_type = "scale-f32";
break;
@ -777,12 +905,44 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
const size_t src0_row_size_aligned = hex_round_up(src0_data_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_data_row_size, VLEN);
size_t src1_data_row_size = 0;
size_t src1_row_size_aligned = 0;
bool broadcast_weight = false;
const struct htp_tensor * src1 = NULL;
if (octx->op == HTP_OP_RMS_NORM_MUL) {
src1 = octx->src[1];
src1_data_row_size = src1->ne[0] * sizeof(float);
src1_row_size_aligned = hex_round_up(src1_data_row_size, VLEN);
broadcast_weight = (src1->ne[1] * src1->ne[2] * src1->ne[3] == 1);
}
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
// Double buffering requires 2x size per buffer
size_t spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned);
size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row);
size_t spad_size_per_row = 0;
size_t vtcm_row_per_thread = 0;
if (octx->op == HTP_OP_RMS_NORM_MUL) {
if (broadcast_weight) {
size_t available_vtcm = octx->ctx->vtcm_size;
size_t src1_spad_total = n_threads * src1_row_size_aligned;
if (available_vtcm > src1_spad_total) {
available_vtcm -= src1_spad_total;
} else {
available_vtcm = 0;
}
spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned);
vtcm_row_per_thread = available_vtcm / (n_threads * spad_size_per_row);
} else {
spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned + src1_row_size_aligned);
vtcm_row_per_thread = (octx->ctx->vtcm_size) / (n_threads * spad_size_per_row);
}
} else {
spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned);
vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row);
}
// Make sure the reserved vtcm size is sufficient
if (vtcm_row_per_thread == 0) {
@ -797,8 +957,25 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
if (octx->op == HTP_OP_RMS_NORM_MUL) {
if (broadcast_weight) {
octx->src1_spad.size_per_thread = src1_row_size_aligned;
} else {
octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread * 2;
}
octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
} else {
octx->src1_spad.size = 0;
octx->src1_spad.size_per_thread = 0;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
if (octx->op == HTP_OP_RMS_NORM_MUL) {
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
} else {
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
}
FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
@ -811,19 +988,24 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
.src0_nrows = src0_nrows,
.data_src0 = (const uint8_t *)src0->data,
.data_src1 = (octx->op == HTP_OP_RMS_NORM_MUL) ? (const uint8_t *)src1->data : NULL,
.data_dst = (uint8_t *)dst->data,
.src0_data_row_size = src0_data_row_size,
.src1_data_row_size = src1_data_row_size,
.dst_data_row_size = dst_data_row_size,
.src0_row_size_aligned = src0_row_size_aligned,
.src1_row_size_aligned = src1_row_size_aligned,
.dst_row_size_aligned = dst_row_size_aligned,
.src0_spad_half_size = octx->src0_spad.size_per_thread / 2,
.src1_spad_half_size = (octx->op == HTP_OP_RMS_NORM_MUL) ? (octx->src1_spad.size_per_thread / (broadcast_weight ? 1 : 2)) : 0,
.dst_spad_half_size = octx->dst_spad.size_per_thread / 2,
.block = (octx->src0_spad.size_per_thread / 2) / src0_row_size_aligned,
.nc = src0->ne[0],
.broadcast_weight = broadcast_weight,
};
worker_pool_run_func(octx->ctx->worker_pool, unary_job_f32_per_thread, &uctx, n_threads);

View file

@ -1,153 +0,0 @@
#ifndef OP_DESC_H
#define OP_DESC_H
#define GGML_COMMON_IMPL_CPP
#include "ggml-backend-impl.h"
#include "ggml-common.h"
#include <string>
#include <stdio.h>
// Printable description of a ggml op for logging: dims, strides, types, buffers and names.
// Every field has the form "src0 x src1 x ... -> dst"; an op without src[0] prints only "dst".
// The format_* helpers write with sprintf and expect the caller to provide a large enough buffer.
struct op_desc {
    char strides[64 * GGML_MAX_SRC];
    char dims[64 * GGML_MAX_SRC];
    char types[16 * GGML_MAX_SRC];
    char buffs[64 * GGML_MAX_SRC];
    char names[64 * GGML_MAX_SRC];

    // Writes "ne0:ne1", or all four dims when ne2/ne3 are not 1; returns the characters written
    int format_tensor_dims(char * str, const struct ggml_tensor * t) {
        const bool two_dims = (t->ne[2] == 1 && t->ne[3] == 1);
        if (two_dims) {
            return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
        }
        return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
    }

    // Dims of every source (up to the first empty slot) followed by the dims of the op itself
    void format_op_dims(char * str, const struct ggml_tensor * t) {
        char * cur = str;

        for (int i = 0; i < GGML_MAX_SRC && t->src[i]; i++) {
            if (i > 0) {
                cur += sprintf(cur, " x ");
            }
            cur += format_tensor_dims(cur, t->src[i]);
        }
        if (t->src[0]) {
            cur += sprintf(cur, " -> ");
        }

        // the op's own dims go through a scratch buffer before being appended
        char self[64];
        format_tensor_dims(self, t);
        cur += sprintf(cur, "%s", self);
    }

    // Writes the byte strides "nb0:nb1" (or all four), with a trailing '!' for
    // non-contiguous tensors; returns the characters written
    int format_tensor_strides(char * str, const struct ggml_tensor * t) {
        const char * mark     = ggml_is_contiguous(t) ? "" : "!";
        const bool   two_dims = (t->ne[2] == 1 && t->ne[3] == 1);
        if (two_dims) {
            return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], mark);
        }
        return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], mark);
    }

    // Strides of every source followed by the strides of the op itself
    void format_op_strides(char * str, const struct ggml_tensor * t) {
        char * cur = str;

        for (int i = 0; i < GGML_MAX_SRC && t->src[i]; i++) {
            if (i > 0) {
                cur += sprintf(cur, " x ");
            }
            cur += format_tensor_strides(cur, t->src[i]);
        }
        if (t->src[0]) {
            cur += sprintf(cur, " -> ");
        }

        // the op's own strides go through a scratch buffer before being appended
        char self[64];
        format_tensor_strides(self, t);
        cur += sprintf(cur, "%s", self);
    }

    // Type names of every source followed by the type name of the op itself
    void format_op_types(char * str, const struct ggml_tensor * t) {
        char * cur = str;

        for (int i = 0; i < GGML_MAX_SRC && t->src[i]; i++) {
            if (i > 0) {
                cur += sprintf(cur, " x ");
            }
            cur += sprintf(cur, "%s", ggml_type_name(t->src[i]->type));
        }
        if (t->src[0]) {
            cur += sprintf(cur, " -> ");
        }

        cur += sprintf(cur, "%s", ggml_type_name(t->type));
    }

    // Name of the backend buffer holding the tensor, or "NONE" when it has no buffer
    const char * tensor_buff_name(const struct ggml_tensor * t) {
        return t->buffer ? ggml_backend_buffer_name(t->buffer) : "NONE";
    }

    // Buffer names of every source followed by the buffer name of the op itself
    void format_op_buffs(char * str, const struct ggml_tensor * t) {
        char * cur = str;

        for (int i = 0; i < GGML_MAX_SRC && t->src[i]; i++) {
            if (i > 0) {
                cur += sprintf(cur, " x ");
            }
            cur += sprintf(cur, "%s", tensor_buff_name(t->src[i]));
        }
        if (t->src[0]) {
            cur += sprintf(cur, " -> ");
        }

        cur += sprintf(cur, "%s", tensor_buff_name(t));
    }

    // Tensor names of every source followed by the name of the op itself
    void format_op_names(char * str, const struct ggml_tensor * t) {
        char * cur = str;

        for (int i = 0; i < GGML_MAX_SRC && t->src[i]; i++) {
            if (i > 0) {
                cur += sprintf(cur, " x ");
            }
            cur += sprintf(cur, "%s", t->src[i]->name);
        }
        if (t->src[0]) {
            cur += sprintf(cur, " -> ");
        }

        cur += sprintf(cur, "%s", t->name);
    }

    // Fills all five fields for `op`
    void format(const ggml_tensor * op) {
        format_op_dims(dims, op);
        format_op_strides(strides, op);
        format_op_types(types, op);
        format_op_buffs(buffs, op);
        format_op_names(names, op);
    }

    op_desc() {}
    op_desc(const ggml_tensor * op) { format(op); }
};
#endif // OP_DESC_H

View file

@ -24,7 +24,7 @@ COL_MAP = {
}
op_pattern = re.compile(
r"profile-op\s+(?P<op_name>[A-Z_0-9]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
)
logger = logging.getLogger("ggml-hexagon-profile")