mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-31 21:39:42 +00:00
hexagon: basic/generic op fusion support and RMS_NORM+MUL fusion (#23835)
Updating infra to enable op fusion and using RMS_NORM+MUL as the use-case.
This commit is contained in:
parent
751ebd17a5
commit
19e92c33ef
7 changed files with 498 additions and 237 deletions
|
|
@ -39,7 +39,7 @@
|
|||
#include "ggml-hexagon.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
#include "op-desc.h"
|
||||
#include "htp-opnode.h"
|
||||
#include "htp-ops.h"
|
||||
#include "htp_iface.h"
|
||||
#include "htp-drv.h"
|
||||
|
|
@ -102,23 +102,23 @@ static const char * status_to_str(uint32_t status) {
|
|||
|
||||
// ** debug helpers
|
||||
|
||||
static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) {
|
||||
static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) {
|
||||
if (!opt_verbose) return;
|
||||
|
||||
op_desc desc(op);
|
||||
htp_opformat fmt(node);
|
||||
GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
|
||||
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags);
|
||||
node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags);
|
||||
}
|
||||
|
||||
static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
|
||||
if (!opt_verbose) return;
|
||||
|
||||
op_desc desc(op);
|
||||
htp_opformat fmt(htp_opformat(htp_opnode{const_cast<ggml_tensor*>(op), {}, HTP_OP_INVALID}));
|
||||
GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
|
||||
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no");
|
||||
ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
|
||||
}
|
||||
|
||||
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
|
||||
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
|
||||
uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
|
||||
if (!opt_profile) return;
|
||||
|
||||
|
|
@ -129,15 +129,16 @@ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_t
|
|||
pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
|
||||
}
|
||||
|
||||
op_desc desc(op);
|
||||
htp_opformat fmt(node);
|
||||
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
|
||||
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
|
||||
node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
|
||||
}
|
||||
|
||||
// ** backend sessions
|
||||
|
||||
struct ggml_hexagon_opbatch;
|
||||
struct ggml_hexagon_opqueue;
|
||||
struct htp_opnode;
|
||||
|
||||
struct ggml_hexagon_session {
|
||||
std::string name;
|
||||
|
|
@ -167,7 +168,7 @@ struct ggml_hexagon_session {
|
|||
void allocate(int dev_id) noexcept(false);
|
||||
void release() noexcept(true);
|
||||
|
||||
void enqueue_op(htp_op_code opcode, const ggml_tensor *op);
|
||||
void enqueue_op(const htp_opnode & node);
|
||||
void flush(bool all = true);
|
||||
|
||||
void flush_pending(bool all = false);
|
||||
|
|
@ -1782,12 +1783,10 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
|
|||
/* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host,
|
||||
};
|
||||
|
||||
// Backend session implementation
|
||||
|
||||
struct ggml_hexagon_opbatch {
|
||||
ggml_hexagon_session* sess;
|
||||
|
||||
std::vector<const ggml_tensor*> ops; // pointers to original ops
|
||||
std::vector<htp_opnode> ops; // htp_opnode of ops
|
||||
|
||||
std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
|
||||
std::vector<htp_tensor> h_tens; // htp tensor descriptors
|
||||
|
|
@ -1919,7 +1918,7 @@ struct ggml_hexagon_opbatch {
|
|||
return ti;
|
||||
}
|
||||
|
||||
bool fit_op(const struct ggml_tensor *t) const {
|
||||
bool fit_op(const htp_opnode & node) const {
|
||||
if (n_ops >= n_ops_max ) return false;
|
||||
|
||||
// check how much extras we will need
|
||||
|
|
@ -1939,10 +1938,10 @@ struct ggml_hexagon_opbatch {
|
|||
}
|
||||
};
|
||||
|
||||
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS && t->src[i]; i++) {
|
||||
fit_tensor(t->src[i]);
|
||||
for (const auto * src : node.get_inputs()) {
|
||||
fit_tensor(src);
|
||||
}
|
||||
fit_tensor(t);
|
||||
fit_tensor(node.dst());
|
||||
|
||||
if ((extra_bufs + n_bufs) > n_bufs_max) return false;
|
||||
if ((extra_tens + n_tens) > n_tens_max) return false;
|
||||
|
|
@ -1952,29 +1951,30 @@ struct ggml_hexagon_opbatch {
|
|||
}
|
||||
|
||||
// assumes that fit_op() was called first and returned true
|
||||
void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
|
||||
void add_op(const htp_opnode & node) {
|
||||
// Add new op
|
||||
|
||||
unsigned int n = n_ops++;
|
||||
GGML_ASSERT(n_ops <= n_ops_max);
|
||||
|
||||
ops[n] = t;
|
||||
ops[n] = node;
|
||||
|
||||
htp_op_desc &o = h_ops[n];
|
||||
memcpy(&o.params, &t->op_params, sizeof(t->op_params));
|
||||
o.opcode = opcode;
|
||||
memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params));
|
||||
o.opcode = node.opcode;
|
||||
o.flags = 0;
|
||||
|
||||
if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
|
||||
o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
|
||||
}
|
||||
|
||||
ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
|
||||
ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags);
|
||||
|
||||
auto inputs = node.get_inputs();
|
||||
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
|
||||
o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
|
||||
o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff;
|
||||
}
|
||||
o.dst = add_tensor(t);
|
||||
o.dst = add_tensor(node.dst());
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -1983,7 +1983,7 @@ struct ggml_hexagon_opqueue {
|
|||
ggml_hexagon_shared_buffer *shm_buf;
|
||||
size_t shm_blk_size;
|
||||
|
||||
using opvec = std::vector<const ggml_tensor*>;
|
||||
using opvec = std::vector<htp_opnode>;
|
||||
|
||||
std::queue<unsigned int> done; // completed batch ids
|
||||
std::vector<opvec> op_cache; // per batch op cache
|
||||
|
|
@ -2182,11 +2182,11 @@ void ggml_hexagon_session::flush_batch() {
|
|||
}
|
||||
}
|
||||
|
||||
void ggml_hexagon_session::enqueue_op(htp_op_code opcode, const ggml_tensor *op) {
|
||||
if (!op_batch->fit_op(op)) {
|
||||
void ggml_hexagon_session::enqueue_op(const htp_opnode & node) {
|
||||
if (!op_batch->fit_op(node)) {
|
||||
flush_batch();
|
||||
}
|
||||
op_batch->add_op(opcode, op);
|
||||
op_batch->add_op(node);
|
||||
}
|
||||
|
||||
// Flush HTP response queue i.e wait for all outstanding requests to complete
|
||||
|
|
@ -3179,10 +3179,43 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
|||
|
||||
HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);
|
||||
|
||||
std::vector<htp_opnode> nodes;
|
||||
nodes.reserve(graph->n_nodes);
|
||||
|
||||
// Fusion
|
||||
for (int i = 0; i < graph->n_nodes; ++i) {
|
||||
ggml_tensor * n = graph->nodes[i];
|
||||
if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
|
||||
sess->enqueue_op(op_remap_to_htp(n), n);
|
||||
if (!op_is_compute(n)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr;
|
||||
|
||||
htp_opnode node = {
|
||||
/*.node =*/ n,
|
||||
/*.fused =*/ {},
|
||||
/*.opcode =*/ HTP_OP_INVALID
|
||||
};
|
||||
|
||||
if (n->op == GGML_OP_RMS_NORM && next_node) {
|
||||
if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
node.add_fused(next_node);
|
||||
node.opcode = HTP_OP_RMS_NORM_MUL;
|
||||
i++; // skip the fused MUL node
|
||||
}
|
||||
}
|
||||
|
||||
if (node.opcode == HTP_OP_INVALID) {
|
||||
node.opcode = op_remap_to_htp(n);
|
||||
}
|
||||
|
||||
nodes.push_back(std::move(node));
|
||||
}
|
||||
|
||||
// Queue and execute
|
||||
if (opt_opstage & HTP_OPSTAGE_QUEUE) {
|
||||
for (const auto & node : nodes) {
|
||||
sess->enqueue_op(node);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -3201,51 +3234,7 @@ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
|
|||
sess->flush();
|
||||
}
|
||||
|
||||
struct node_info {
|
||||
ggml_tensor * node;
|
||||
|
||||
std::vector<ggml_tensor *> fused;
|
||||
|
||||
ggml_op op() const {
|
||||
return node->op;
|
||||
}
|
||||
|
||||
const ggml_tensor * dst() const {
|
||||
return fused.empty() ? node : fused.back();
|
||||
}
|
||||
|
||||
const ggml_tensor * src0() const {
|
||||
return node->src[0];
|
||||
}
|
||||
|
||||
const ggml_tensor * src1() const {
|
||||
return node->src[1];
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return ggml_op_is_empty(node->op);
|
||||
}
|
||||
|
||||
void add_fused(ggml_tensor * t) {
|
||||
fused.push_back(t);
|
||||
}
|
||||
|
||||
bool stackable() const {
|
||||
switch (this->op()) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
return ggml_is_quantized(this->src0()->type);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool same_input(const node_info& n) const {
|
||||
return n.src1() == this->src1();
|
||||
}
|
||||
};
|
||||
|
||||
static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
|
||||
static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<htp_opnode> & nodes) {
|
||||
const int n = nodes.size();
|
||||
|
||||
std::vector<int> res;
|
||||
|
|
@ -3299,14 +3288,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
|
|||
|
||||
enum ggml_op ops[MAX_FUSE];
|
||||
|
||||
std::vector<node_info> nodes;
|
||||
std::vector<htp_opnode> nodes;
|
||||
nodes.reserve(gf->n_nodes);
|
||||
|
||||
// fuse nodes:
|
||||
// we don't want to make reorders that break fusing, so we first pack all fusable tensors
|
||||
// and perform the reorder over the fused nodes. after the reorder is done, we unfuse
|
||||
for (int i = 0; i < n; i++) {
|
||||
node_info node = {
|
||||
htp_opnode node = {
|
||||
/*.node =*/gf->nodes[i],
|
||||
/*.fused =*/{},
|
||||
};
|
||||
|
|
|
|||
241
ggml/src/ggml-hexagon/htp-opnode.h
Normal file
241
ggml/src/ggml-hexagon/htp-opnode.h
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
#ifndef HTP_OPNODE_H
|
||||
#define HTP_OPNODE_H
|
||||
|
||||
#define GGML_COMMON_IMPL_CPP
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdio.h>
|
||||
#include "htp-ops.h"
|
||||
|
||||
struct htp_opnode {
|
||||
ggml_tensor * node = nullptr;
|
||||
|
||||
std::vector<ggml_tensor *> fused;
|
||||
|
||||
htp_op_code opcode = HTP_OP_INVALID;
|
||||
|
||||
ggml_op op() const {
|
||||
return node->op;
|
||||
}
|
||||
|
||||
const ggml_tensor * dst() const {
|
||||
return fused.empty() ? node : fused.back();
|
||||
}
|
||||
|
||||
const ggml_tensor * src0() const {
|
||||
return node->src[0];
|
||||
}
|
||||
|
||||
const ggml_tensor * src1() const {
|
||||
return node->src[1];
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return ggml_op_is_empty(node->op);
|
||||
}
|
||||
|
||||
void add_fused(ggml_tensor * t) {
|
||||
fused.push_back(t);
|
||||
}
|
||||
|
||||
bool stackable() const {
|
||||
switch (this->op()) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
return ggml_is_quantized(this->src0()->type);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool same_input(const htp_opnode& n) const {
|
||||
return n.src1() == this->src1();
|
||||
}
|
||||
|
||||
std::vector<const ggml_tensor *> get_inputs() const {
|
||||
std::vector<const ggml_tensor *> inputs;
|
||||
std::vector<const ggml_tensor *> outputs;
|
||||
outputs.push_back(node);
|
||||
for (const auto * f : fused) {
|
||||
outputs.push_back(f);
|
||||
}
|
||||
|
||||
auto contains = [&](const std::vector<const ggml_tensor *> & vec, const ggml_tensor * t) {
|
||||
for (const auto * x : vec) {
|
||||
if (x == t) return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
auto add_input = [&](const ggml_tensor * t) {
|
||||
if (t && !contains(outputs, t) && !contains(inputs, t)) {
|
||||
inputs.push_back(t);
|
||||
}
|
||||
};
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC && node->src[i]; i++) {
|
||||
add_input(node->src[i]);
|
||||
}
|
||||
for (const auto * f : fused) {
|
||||
for (int i = 0; i < GGML_MAX_SRC && f->src[i]; i++) {
|
||||
add_input(f->src[i]);
|
||||
}
|
||||
}
|
||||
return inputs;
|
||||
}
|
||||
|
||||
std::string op_name() const {
|
||||
if (fused.empty()) {
|
||||
return ggml_op_desc(node);
|
||||
}
|
||||
std::string name = ggml_op_desc(node);
|
||||
for (const auto * f : fused) {
|
||||
name += "+";
|
||||
name += ggml_op_desc(f);
|
||||
}
|
||||
return name;
|
||||
}
|
||||
};
|
||||
|
||||
struct htp_opformat {
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
int format_tensor_dims(char * str, const struct ggml_tensor * t) {
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
|
||||
} else {
|
||||
return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_dims(char * str, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += format_tensor_dims(p, inputs[0]);
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += format_tensor_dims(p, inputs[i]);
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
char self[64];
|
||||
format_tensor_dims(self, node.dst());
|
||||
p += sprintf(p, "%s", self);
|
||||
}
|
||||
|
||||
int format_tensor_strides(char * str, const struct ggml_tensor * t) {
|
||||
const char * c = ggml_is_contiguous(t) ? "" : "!";
|
||||
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
|
||||
} else {
|
||||
return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_strides(char * str, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += format_tensor_strides(p, inputs[0]);
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += format_tensor_strides(p, inputs[i]);
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
char self[64];
|
||||
format_tensor_strides(self, node.dst());
|
||||
p += sprintf(p, "%s", self);
|
||||
}
|
||||
|
||||
void format_op_types(char * str, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", ggml_type_name(inputs[0]->type));
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", ggml_type_name(inputs[i]->type));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", ggml_type_name(node.dst()->type));
|
||||
}
|
||||
|
||||
const char * tensor_buff_name(const struct ggml_tensor * t) {
|
||||
if (t->buffer) {
|
||||
return ggml_backend_buffer_name(t->buffer);
|
||||
}
|
||||
return "NONE";
|
||||
}
|
||||
|
||||
void format_op_buffs(char * str, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", tensor_buff_name(inputs[0]));
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", tensor_buff_name(inputs[i]));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", tensor_buff_name(node.dst()));
|
||||
}
|
||||
|
||||
void format_op_names(char * str, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", inputs[0]->name);
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", inputs[i]->name);
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", node.dst()->name);
|
||||
}
|
||||
|
||||
void format(const htp_opnode & node) {
|
||||
format_op_dims(dims, node);
|
||||
format_op_strides(strides, node);
|
||||
format_op_types(types, node);
|
||||
format_op_buffs(buffs, node);
|
||||
format_op_names(names, node);
|
||||
}
|
||||
|
||||
htp_opformat() {}
|
||||
htp_opformat(const htp_opnode & node) { format(node); }
|
||||
};
|
||||
|
||||
#endif // HTP_OPNODE_H
|
||||
|
|
@ -58,6 +58,7 @@ enum htp_op_code {
|
|||
HTP_OP_MUL_MAT,
|
||||
HTP_OP_MUL_MAT_ID,
|
||||
HTP_OP_RMS_NORM,
|
||||
HTP_OP_RMS_NORM_MUL,
|
||||
HTP_OP_UNARY_SILU,
|
||||
HTP_OP_UNARY_GELU,
|
||||
HTP_OP_UNARY_SIGMOID,
|
||||
|
|
|
|||
|
|
@ -537,6 +537,7 @@ static int execute_op(struct htp_ops_context * octx) {
|
|||
|
||||
case HTP_OP_NORM:
|
||||
case HTP_OP_RMS_NORM:
|
||||
case HTP_OP_RMS_NORM_MUL:
|
||||
case HTP_OP_SCALE:
|
||||
case HTP_OP_SQR:
|
||||
case HTP_OP_SQRT:
|
||||
|
|
|
|||
|
|
@ -23,21 +23,26 @@ struct htp_unary_context {
|
|||
|
||||
// Precomputed values
|
||||
const uint8_t * data_src0;
|
||||
const uint8_t * data_src1; // weight/scale tensor for RMS_NORM_MUL
|
||||
uint8_t * data_dst;
|
||||
|
||||
size_t src0_data_row_size; // actual data bytes per row
|
||||
size_t src1_data_row_size;
|
||||
size_t dst_data_row_size; // actual data bytes per row
|
||||
|
||||
size_t src0_row_size_aligned;
|
||||
size_t src1_row_size_aligned;
|
||||
size_t dst_row_size_aligned;
|
||||
|
||||
size_t src0_spad_half_size;
|
||||
size_t src1_spad_half_size;
|
||||
size_t dst_spad_half_size;
|
||||
|
||||
uint32_t block;
|
||||
uint32_t src0_nrows;
|
||||
uint32_t src0_nrows_per_thread;
|
||||
uint32_t nc;
|
||||
bool broadcast_weight;
|
||||
};
|
||||
|
||||
// Convert flat row index to DDR byte offset using the tensor's actual strides.
|
||||
|
|
@ -158,6 +163,71 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
|
|||
}
|
||||
}
|
||||
|
||||
static void hvx_fast_rms_norm_mul_f32(const uint8_t * restrict src,
|
||||
const uint8_t * restrict weight,
|
||||
uint8_t * restrict dst,
|
||||
const int num_elems,
|
||||
float epsilon) {
|
||||
const HVX_Vector * restrict v_src = (const HVX_Vector *) src;
|
||||
const HVX_Vector * restrict v_weight = (const HVX_Vector *) weight;
|
||||
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
|
||||
|
||||
const int nvec = num_elems / VLEN_FP32; // number of full vectors
|
||||
const int nloe = num_elems % VLEN_FP32; // leftover elements
|
||||
|
||||
// Compute sum of squares for full vectors
|
||||
HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000);
|
||||
HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon);
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < nvec; i++) {
|
||||
HVX_Vector v1 = v_src[i];
|
||||
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
|
||||
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
|
||||
}
|
||||
|
||||
// Handle tail elements using vectorized ops with masking
|
||||
if (nloe > 0) {
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
|
||||
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
|
||||
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
|
||||
}
|
||||
|
||||
// Reduce HVX sum
|
||||
sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
|
||||
|
||||
HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems);
|
||||
HVX_Vector denom_v = hvx_vec_inverse_f32(t_v);
|
||||
HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v);
|
||||
HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v);
|
||||
|
||||
// Scale and multiply
|
||||
HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_epsilon_v));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < nvec; i++) {
|
||||
HVX_Vector v1 = v_src[i];
|
||||
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
|
||||
HVX_Vector v3 = Q6_Vsf_equals_Vqf32(v2);
|
||||
HVX_Vector result = Q6_Vqf32_vmpy_VsfVsf(v3, v_weight[i]);
|
||||
v_dst[i] = Q6_Vsf_equals_Vqf32(result);
|
||||
}
|
||||
|
||||
// Handle tail elements using vectorized ops with masking
|
||||
if (nloe > 0) {
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
|
||||
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
|
||||
HVX_Vector v3 = Q6_Vsf_equals_Vqf32(v2);
|
||||
HVX_Vector result = Q6_Vqf32_vmpy_VsfVsf(v3, v_weight[nvec]);
|
||||
HVX_Vector res_v = Q6_Vsf_equals_Vqf32(result);
|
||||
|
||||
// Store with masking to avoid overwriting memory beyond the tensor
|
||||
hvx_vec_store_a(&v_dst[nvec], nloe * 4, res_v);
|
||||
}
|
||||
}
|
||||
|
||||
static void hvx_fast_norm_f32(const uint8_t * restrict src,
|
||||
uint8_t * restrict dst,
|
||||
uint8_t * restrict pad,
|
||||
|
|
@ -269,6 +339,27 @@ static void rms_norm_f32(const float * restrict src,
|
|||
}
|
||||
}
|
||||
|
||||
static void rms_norm_mul_f32(const float * restrict src,
|
||||
const float * restrict weight,
|
||||
float * restrict dst,
|
||||
const uint32_t num_rows,
|
||||
const uint32_t row_elems,
|
||||
const size_t row_size,
|
||||
const size_t weight_row_size,
|
||||
int32_t * op_params,
|
||||
bool broadcast_weight) {
|
||||
float epsilon = 0.f;
|
||||
memcpy(&epsilon, op_params, sizeof(float));
|
||||
|
||||
for (uint32_t ir = 0; ir < num_rows; ir++) {
|
||||
const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size);
|
||||
const uint8_t * restrict w_local = (const uint8_t *)weight + (broadcast_weight ? 0 : ir * weight_row_size);
|
||||
uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size);
|
||||
|
||||
hvx_fast_rms_norm_mul_f32(src_local, w_local, dst_local, row_elems, epsilon);
|
||||
}
|
||||
}
|
||||
|
||||
static void norm_f32(const float * restrict src,
|
||||
float * restrict dst,
|
||||
uint8_t * restrict spad,
|
||||
|
|
@ -598,12 +689,15 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
|||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
const uint8_t * restrict data_src = uctx->data_src0;
|
||||
const uint8_t * restrict data_src1 = uctx->data_src1;
|
||||
uint8_t * restrict data_dst = uctx->data_dst;
|
||||
|
||||
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
|
||||
uint8_t * src1_spad_data = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
|
||||
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
|
||||
|
||||
size_t src0_spad_half_size = uctx->src0_spad_half_size;
|
||||
size_t src1_spad_half_size = uctx->src1_spad_half_size;
|
||||
size_t dst_spad_half_size = uctx->dst_spad_half_size;
|
||||
|
||||
// Non-contiguous tensors have gaps at dim-2/3 boundaries that a single-stride
|
||||
|
|
@ -624,6 +718,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
|||
|
||||
dma_queue * dma_queue = octx->ctx->dma[ith];
|
||||
|
||||
// If weight is broadcasted, load it once per thread at the beginning of execution
|
||||
if (htp_op == HTP_OP_RMS_NORM_MUL && uctx->broadcast_weight) {
|
||||
dma_queue_push(dma_queue, dma_make_ptr(src1_spad_data, data_src1), uctx->src1_row_size_aligned, 0, uctx->src1_data_row_size, 1);
|
||||
dma_queue_flush(dma_queue);
|
||||
}
|
||||
|
||||
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; spad_idx++) {
|
||||
const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
|
||||
|
||||
|
|
@ -636,6 +736,14 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
|||
dma_queue_push(dma_queue,
|
||||
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + src0_off),
|
||||
src0_row_size_aligned, nb01, src0_data_row_size, block_size);
|
||||
|
||||
if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
|
||||
const size_t src1_off = unary_row_offset(ir, ne01, ne02, nb01, nb02, nb03);
|
||||
dma_queue_push(dma_queue,
|
||||
dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + src1_off),
|
||||
uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, block_size);
|
||||
}
|
||||
|
||||
ir += block_size;
|
||||
}
|
||||
|
||||
|
|
@ -644,6 +752,10 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
|||
|
||||
float * dst_spad = (float *) dma_queue_pop(dma_queue).src;
|
||||
float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
|
||||
float * src1_spad = NULL;
|
||||
if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
|
||||
src1_spad = (float *) dma_queue_pop(dma_queue).dst;
|
||||
}
|
||||
|
||||
// Process block in VTCM
|
||||
switch (htp_op) {
|
||||
|
|
@ -653,6 +765,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
|||
case HTP_OP_RMS_NORM:
|
||||
rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_RMS_NORM_MUL:
|
||||
{
|
||||
const float * w_ptr = uctx->broadcast_weight ? (const float *) src1_spad_data : src1_spad;
|
||||
rms_norm_mul_f32(src0_spad, w_ptr, dst_spad, block_size, ne0, src0_row_size_aligned, uctx->src1_row_size_aligned, op_params, uctx->broadcast_weight);
|
||||
}
|
||||
break;
|
||||
case HTP_OP_SCALE:
|
||||
scale_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
|
|
@ -700,9 +818,16 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
|||
if (pref_ir < src0_end_row) {
|
||||
const uint32_t pref_block_size = unary_block_size(pref_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
|
||||
const size_t src0_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);
|
||||
dma_queue_push(dma_queue,
|
||||
dma_make_ptr(src0_spad, data_src + src0_pref_off),
|
||||
src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
|
||||
dma_queue_push(dma_queue,
|
||||
dma_make_ptr(src0_spad, data_src + src0_pref_off),
|
||||
src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
|
||||
|
||||
if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
|
||||
const size_t src1_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);
|
||||
dma_queue_push(dma_queue,
|
||||
dma_make_ptr(src1_spad, data_src1 + src1_pref_off),
|
||||
uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, pref_block_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
ir += block_size;
|
||||
|
|
@ -732,6 +857,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
|||
case HTP_OP_RMS_NORM:
|
||||
op_type = "rmsnorm-f32";
|
||||
break;
|
||||
case HTP_OP_RMS_NORM_MUL:
|
||||
op_type = "rmsnorm-mul-f32";
|
||||
break;
|
||||
case HTP_OP_SCALE:
|
||||
op_type = "scale-f32";
|
||||
break;
|
||||
|
|
@ -777,12 +905,44 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
|||
const size_t src0_row_size_aligned = hex_round_up(src0_data_row_size, VLEN);
|
||||
const size_t dst_row_size_aligned = hex_round_up(dst_data_row_size, VLEN);
|
||||
|
||||
size_t src1_data_row_size = 0;
|
||||
size_t src1_row_size_aligned = 0;
|
||||
bool broadcast_weight = false;
|
||||
const struct htp_tensor * src1 = NULL;
|
||||
|
||||
if (octx->op == HTP_OP_RMS_NORM_MUL) {
|
||||
src1 = octx->src[1];
|
||||
src1_data_row_size = src1->ne[0] * sizeof(float);
|
||||
src1_row_size_aligned = hex_round_up(src1_data_row_size, VLEN);
|
||||
broadcast_weight = (src1->ne[1] * src1->ne[2] * src1->ne[3] == 1);
|
||||
}
|
||||
|
||||
// VTCM scratchpads for all tensors
|
||||
// N rows per thread, padded to HVX vector size
|
||||
// Double buffering requires 2x size per buffer
|
||||
|
||||
size_t spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned);
|
||||
size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row);
|
||||
size_t spad_size_per_row = 0;
|
||||
size_t vtcm_row_per_thread = 0;
|
||||
|
||||
if (octx->op == HTP_OP_RMS_NORM_MUL) {
|
||||
if (broadcast_weight) {
|
||||
size_t available_vtcm = octx->ctx->vtcm_size;
|
||||
size_t src1_spad_total = n_threads * src1_row_size_aligned;
|
||||
if (available_vtcm > src1_spad_total) {
|
||||
available_vtcm -= src1_spad_total;
|
||||
} else {
|
||||
available_vtcm = 0;
|
||||
}
|
||||
spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned);
|
||||
vtcm_row_per_thread = available_vtcm / (n_threads * spad_size_per_row);
|
||||
} else {
|
||||
spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned + src1_row_size_aligned);
|
||||
vtcm_row_per_thread = (octx->ctx->vtcm_size) / (n_threads * spad_size_per_row);
|
||||
}
|
||||
} else {
|
||||
spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned);
|
||||
vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row);
|
||||
}
|
||||
|
||||
// Make sure the reserved vtcm size is sufficient
|
||||
if (vtcm_row_per_thread == 0) {
|
||||
|
|
@ -797,8 +957,25 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
|||
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
|
||||
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
||||
|
||||
if (octx->op == HTP_OP_RMS_NORM_MUL) {
|
||||
if (broadcast_weight) {
|
||||
octx->src1_spad.size_per_thread = src1_row_size_aligned;
|
||||
} else {
|
||||
octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread * 2;
|
||||
}
|
||||
octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
|
||||
} else {
|
||||
octx->src1_spad.size = 0;
|
||||
octx->src1_spad.size_per_thread = 0;
|
||||
}
|
||||
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
if (octx->op == HTP_OP_RMS_NORM_MUL) {
|
||||
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
|
||||
} else {
|
||||
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
}
|
||||
|
||||
FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
|
|
@ -811,19 +988,24 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
|||
.src0_nrows = src0_nrows,
|
||||
|
||||
.data_src0 = (const uint8_t *)src0->data,
|
||||
.data_src1 = (octx->op == HTP_OP_RMS_NORM_MUL) ? (const uint8_t *)src1->data : NULL,
|
||||
.data_dst = (uint8_t *)dst->data,
|
||||
|
||||
.src0_data_row_size = src0_data_row_size,
|
||||
.src1_data_row_size = src1_data_row_size,
|
||||
.dst_data_row_size = dst_data_row_size,
|
||||
|
||||
.src0_row_size_aligned = src0_row_size_aligned,
|
||||
.src1_row_size_aligned = src1_row_size_aligned,
|
||||
.dst_row_size_aligned = dst_row_size_aligned,
|
||||
|
||||
.src0_spad_half_size = octx->src0_spad.size_per_thread / 2,
|
||||
.src1_spad_half_size = (octx->op == HTP_OP_RMS_NORM_MUL) ? (octx->src1_spad.size_per_thread / (broadcast_weight ? 1 : 2)) : 0,
|
||||
.dst_spad_half_size = octx->dst_spad.size_per_thread / 2,
|
||||
|
||||
.block = (octx->src0_spad.size_per_thread / 2) / src0_row_size_aligned,
|
||||
.nc = src0->ne[0],
|
||||
.broadcast_weight = broadcast_weight,
|
||||
};
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, unary_job_f32_per_thread, &uctx, n_threads);
|
||||
|
|
|
|||
|
|
@ -1,153 +0,0 @@
|
|||
#ifndef OP_DESC_H
|
||||
#define OP_DESC_H
|
||||
|
||||
#define GGML_COMMON_IMPL_CPP
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
|
||||
struct op_desc {
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
int format_tensor_dims(char * str, const struct ggml_tensor * t) {
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
|
||||
} else {
|
||||
return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_dims(char * str, const struct ggml_tensor * t) {
|
||||
char * p = str;
|
||||
|
||||
// append src0 and src1 (if any)
|
||||
if (t->src[0]) {
|
||||
p += format_tensor_dims(p, t->src[0]);
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += format_tensor_dims(p, t->src[i]);
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
// format self dims separately for better visual alignment
|
||||
char self[64];
|
||||
format_tensor_dims(self, t);
|
||||
|
||||
p += sprintf(p, "%s", self);
|
||||
}
|
||||
|
||||
int format_tensor_strides(char * str, const struct ggml_tensor * t) {
|
||||
const char * c = ggml_is_contiguous(t) ? "" : "!";
|
||||
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
|
||||
} else {
|
||||
return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_strides(char * str, const struct ggml_tensor * t) {
|
||||
char * p = str;
|
||||
|
||||
// append src0 and src1 (if any)
|
||||
if (t->src[0]) {
|
||||
p += format_tensor_strides(p, t->src[0]);
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += format_tensor_strides(p, t->src[i]);
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
// format self dims separately for better visual alignment
|
||||
char self[64];
|
||||
format_tensor_strides(self, t);
|
||||
|
||||
p += sprintf(p, "%s", self);
|
||||
}
|
||||
|
||||
void format_op_types(char * str, const struct ggml_tensor * t) {
|
||||
char * p = str;
|
||||
|
||||
// append src0 and src1 (if any)
|
||||
if (t->src[0]) {
|
||||
p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", ggml_type_name(t->type));
|
||||
}
|
||||
|
||||
const char * tensor_buff_name(const struct ggml_tensor * t) {
|
||||
if (t->buffer) {
|
||||
return ggml_backend_buffer_name(t->buffer);
|
||||
}
|
||||
return "NONE";
|
||||
}
|
||||
|
||||
void format_op_buffs(char * str, const struct ggml_tensor * t) {
|
||||
char * p = str;
|
||||
|
||||
// append src0 and src1 (if any)
|
||||
if (t->src[0]) {
|
||||
p += sprintf(p, "%s", tensor_buff_name(t->src[0]));
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", tensor_buff_name(t->src[i]));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", tensor_buff_name(t));
|
||||
}
|
||||
|
||||
void format_op_names(char * str, const struct ggml_tensor * t) {
|
||||
char * p = str;
|
||||
|
||||
// append src0 and src1 (if any)
|
||||
if (t->src[0]) {
|
||||
p += sprintf(p, "%s", t->src[0]->name);
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", t->src[i]->name);
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", t->name);
|
||||
}
|
||||
|
||||
void format(const ggml_tensor * op) {
|
||||
format_op_dims(dims, op);
|
||||
format_op_strides(strides, op);
|
||||
format_op_types(types, op);
|
||||
format_op_buffs(buffs, op);
|
||||
format_op_names(names, op);
|
||||
}
|
||||
|
||||
op_desc() {}
|
||||
op_desc(const ggml_tensor * op) { format(op); }
|
||||
};
|
||||
|
||||
#endif // OP_DESC_H
|
||||
|
|
@ -24,7 +24,7 @@ COL_MAP = {
|
|||
}
|
||||
|
||||
op_pattern = re.compile(
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
|
||||
)
|
||||
|
||||
logger = logging.getLogger("ggml-hexagon-profile")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue