From 5d2b52d80d9f375a6e81d07e212d047d8ee4f76e Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 23 Apr 2026 14:17:21 -0700 Subject: [PATCH] hexagon: add support for basic and extended Op profiling (#22269) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * hexagon: restore HTP_OPMASK_QUEUE * hexagon: honor OPMASK_SKIP_COMPUTE in hmx-matmul * hex-prof: restore op profiling * hex-prof: enable PMU * hexagon: simplify and improve op-queuing with full profiling support Add separate profile descriptors. * hexagon: remove opsync and rename opmask into opstage opsync is no longer needed since the profiler is fully async now. opmask name was confusing and opstage is more accurate. * hexagon: refactor opbatch queue handling * hexagon: add iface hooks for enabling profiler from the host Also move all the PMU setup stuff out of the hex-utils since it's not intended for normal use. * hexagon: make profiler mode configurable On older devices getting PMU counters is expensive so it's now optional. 
* hexagon: add support for setting profiler pmu events from env * hexagon: simplify profiler output (no need to print buffs, etc) * hexagon: simplify pmu counter formating * hexagon: add a simple profile post-proc tool * hex-prof: add support for reading logs from stdin * hexagon: document GGML_HEXAGON_PROFILE * hex-prof: update default width for dims field * hex-prof: fix linter warnings and errors * Update ggml/src/ggml-hexagon/htp/htp-ops.h Co-authored-by: Sigbjørn Skjæret * Update scripts/snapdragon/ggml-hexagon-profile.py Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Trivikram Reddy Co-authored-by: Sigbjørn Skjæret --- docs/backend/snapdragon/README.md | 19 +- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 402 +++++++++++------- ggml/src/ggml-hexagon/htp/hex-utils.h | 28 ++ ggml/src/ggml-hexagon/htp/htp-ctx.h | 5 +- ggml/src/ggml-hexagon/htp/htp-ops.h | 36 +- ggml/src/ggml-hexagon/htp/htp_iface.idl | 8 +- ggml/src/ggml-hexagon/htp/main.c | 176 +++++--- ggml/src/ggml-hexagon/htp/matmul-ops.c | 4 + scripts/snapdragon/adb/run-bench.sh | 4 +- scripts/snapdragon/adb/run-cli.sh | 4 +- scripts/snapdragon/adb/run-completion.sh | 4 +- scripts/snapdragon/adb/run-mtmd.sh | 4 +- scripts/snapdragon/adb/run-tool.sh | 4 +- scripts/snapdragon/ggml-hexagon-profile.py | 188 ++++++++ scripts/snapdragon/windows/run-bench.ps1 | 6 +- scripts/snapdragon/windows/run-cli.ps1 | 6 +- scripts/snapdragon/windows/run-completion.ps1 | 6 +- scripts/snapdragon/windows/run-mtmd.ps1 | 6 +- scripts/snapdragon/windows/run-tool.ps1 | 6 +- 19 files changed, 671 insertions(+), 245 deletions(-) create mode 100755 scripts/snapdragon/ggml-hexagon-profile.py diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md index e13fdfd05..2414eeaf6 100644 --- a/docs/backend/snapdragon/README.md +++ b/docs/backend/snapdragon/README.md @@ -249,18 +249,27 @@ build: 6a8cf8914 (6733) ``` - `GGML_HEXAGON_PROFILE=1` - Generates a host-side profile for the ggml-hexagon Ops. 
+ Enables Op profiling: -- `GGML_HEXAGON_OPMASK=0x0` - Allows enabling specific stages of the processing pipeline: + - `1` Basic profile with per-op `usecs` and `cycles` counters + - `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data + - `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data + + The logging output can be either saved into a file for post-processing or it can be piped directly into the post-processing tool to generate the report. + Examples: + + `GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -` + +- `GGML_HEXAGON_OPSTAGE=0x0` + Allows enabling specific stages of the Op processing pipeline: - `0x1` Enable Op Queue (i.e., queuing Ops into NPU) - `0x2` Enable Op Compute (MUL_MAT, etc.) Examples: - `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out - `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default) + `GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled + `GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default) - `GGML_HEXAGON_OPFILTER=regex` Allows filtering (disabling) Ops that match the regex pattern: diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index cdd9fcf59..955903418 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -12,9 +12,12 @@ #include #include #include +#include +#include #include #include #include +#include #ifdef _WIN32 # include @@ -41,18 +44,26 @@ #include "htp_iface.h" #include "htp-drv.h" +using intvec = std::vector; +using uintvec = std::vector; +using u32vec = std::vector; + static size_t opt_ndev = 1; static size_t opt_nhvx = 0; // use all static int opt_arch = 0; // autodetect static int opt_etm = 0; static int opt_verbose = 0; -static 
int opt_profile = 0; +static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu) static int opt_hostbuf = 1; // hostbuf ON by default static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only +// Default PMU events, if profiling with PMU (mode=2) is enabled +// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html +// https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html +static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C }; + // Enable all stages by default -static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE; -static int opt_opsync = 0; // synchronous ops +static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE; static int opt_opbatch = 1024; // max number of ops in a batch static int opt_opqueue = 16; // max number of pending batches static std::regex* opt_opfilter = NULL; // regex of ops to not claim @@ -104,19 +115,26 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct } static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op, - uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) { + uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) { if (!opt_profile) return; op_desc desc(op); - GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(), - ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, - op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec); + + char pmu_str[256] = ""; + if (opt_profile > 1) { + static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters"); + sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]", + pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]); + } + + GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", 
sess_name.c_str(), + ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str); } // ** backend sessions struct ggml_hexagon_opbatch; -struct ggml_hexagon_opshm; +struct ggml_hexagon_opqueue; struct ggml_hexagon_session { std::string name; @@ -132,8 +150,8 @@ struct ggml_hexagon_session { bool valid_iface; std::atomic op_pending; - ggml_hexagon_opbatch *op_batch; - ggml_hexagon_opshm *op_shm; + ggml_hexagon_opbatch* op_batch; + ggml_hexagon_opqueue* op_queue; ggml_backend_buffer_type buffer_type = {}; ggml_backend_buffer_type repack_buffer_type = {}; @@ -1521,65 +1539,14 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf // Backend session implementation -struct ggml_hexagon_opshm { - ggml_hexagon_shared_buffer *sbuf; - - std::vector block_mask; - size_t block_size; - - uint8_t * base() const { return this->sbuf->base; } - int fd() const { return this->sbuf->fd; } - size_t n_blocks() const { return this->block_mask.size(); } - - ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) { - size_t n_bufs = HTP_OP_MAX_BUFS; - size_t n_ops = max_batch; - size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS; - - block_mask.resize(max_pending, true); - - block_size = sizeof(htp_buf_desc) * n_bufs + - sizeof(htp_tensor) * n_tensors + - sizeof(htp_op_desc) * n_ops; - - sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */); - - if (opt_verbose) { - GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n", - sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending); - } - } - - ~ggml_hexagon_opshm() { - delete sbuf; - } - - uint8_t * allocate() { - auto it = std::find(block_mask.begin(), block_mask.end(), true); - if (it == block_mask.end()) - return nullptr; - - unsigned int i = std::distance(block_mask.begin(), it); - uint8_t* addr = sbuf->base + (i * block_size); - 
block_mask[i] = false; - - HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr); - return addr; - } - - void release(uint8_t * addr) { - int i = (addr - sbuf->base) / block_size; - block_mask[i] = true; - HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr); - } -}; - struct ggml_hexagon_opbatch { - const char* name; + ggml_hexagon_session* sess; - std::vector buffers; - std::vector tensors; - std::vector ops; + std::vector ops; // pointers to original ops + + std::vector h_bufs; // htp buffer descriptors + std::vector h_tens; // htp tensor descriptors + std::vector h_ops; // htp op descriptors std::unordered_map b_map; // buffer fd to index std::unordered_map t_map; // tensor ptr to index @@ -1606,19 +1573,21 @@ struct ggml_hexagon_opbatch { d_map.clear(); } - ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) { - name = sess->c_name(); + ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size) { + this->sess = sess; n_bufs_max = HTP_OP_MAX_BUFS; - n_ops_max = max_batch; + n_ops_max = batch_size; n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS; b_vmem_max = HTP_OP_MAX_VMEM; - buffers.resize(n_bufs_max); - tensors.resize(n_tens_max); ops.resize(n_ops_max); + h_bufs.resize(n_bufs_max); + h_tens.resize(n_tens_max); + h_ops.resize(n_ops_max); + b_map.reserve(n_bufs_max); t_map.reserve(n_tens_max); d_map.reserve(n_tens_max); @@ -1640,7 +1609,7 @@ struct ggml_hexagon_opbatch { b_map.insert({sbuf->fd, bi}); - htp_buf_desc &b = buffers[bi]; + htp_buf_desc &b = h_bufs[bi]; b.base = (uint64_t) sbuf->base; b.fd = sbuf->fd; b.size = sbuf->size; @@ -1664,7 +1633,7 @@ struct ggml_hexagon_opbatch { // First lookup by tensor data auto range = d_map.equal_range(t->data); for (auto it = range.first; it != range.second; ++it) { - htp_tensor * h = &tensors[it->second]; + htp_tensor * h = &h_tens[it->second]; if (same_shape(h, t)) { return it->second; } } @@ -1682,7 +1651,7 
@@ struct ggml_hexagon_opbatch { uint64_t t_offset = (uint8_t *) t->data - sbuf->base; size_t t_size = ggml_nbytes(t); - htp_tensor &h = tensors[ti]; + htp_tensor &h = h_tens[ti]; h.bi = add_buffer(sbuf); h.data = t_offset; h.size = t_size; @@ -1737,65 +1706,170 @@ struct ggml_hexagon_opbatch { // assumes that fit_op() was called first and returned true void add_op(htp_op_code opcode, const struct ggml_tensor * t) { // Add new op - htp_op_desc &o = ops[n_ops++]; + + unsigned int n = n_ops++; GGML_ASSERT(n_ops <= n_ops_max); + ops[n] = t; + + htp_op_desc &o = h_ops[n]; memcpy(&o.params, &t->op_params, sizeof(t->op_params)); o.opcode = opcode; o.flags = 0; - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { + if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) { o.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } - ggml_hexagon_dump_op_exec(name, t, o.flags); + ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags); for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) { o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff; } o.dst = add_tensor(t); } +}; - size_t flush(uint8_t * mem_addr, size_t mem_size) { - static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8"); - static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8"); - static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8"); +struct ggml_hexagon_opqueue { + // Shared buffer for storing batches + ggml_hexagon_shared_buffer *shm_buf; + size_t shm_blk_size; - const size_t b_size = sizeof(htp_buf_desc) * n_bufs; - const size_t t_size = sizeof(htp_tensor) * n_tens; - const size_t o_size = sizeof(htp_op_desc) * n_ops; + using opvec = std::vector; - const size_t m_size = b_size + t_size + o_size; - GGML_ASSERT(m_size <= mem_size); + std::queue done; // completed batch ids + std::vector op_cache; // per batch op cache + std::vector start_usec; // per batch start time - uint8_t * b_ptr = (uint8_t *) mem_addr; - uint8_t * t_ptr = (uint8_t *) b_ptr + b_size; 
- uint8_t * o_ptr = (uint8_t *) t_ptr + t_size; + ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) { + size_t n_bufs = HTP_OP_MAX_BUFS; + size_t n_ops = batch_size; + size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS; - memcpy(b_ptr, (void *) buffers.data(), b_size); - memcpy(t_ptr, (void *) tensors.data(), t_size); - memcpy(o_ptr, (void *) ops.data(), o_size); + shm_blk_size = sizeof(htp_buf_desc) * n_bufs + + sizeof(htp_tensor) * n_tensors + + sizeof(htp_op_desc) * n_ops + + sizeof(htp_prof_desc) * n_ops; - HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n", - name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size); + shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */); + + op_cache.resize(depth); + start_usec.resize(depth, 0); + + // init done queue + for (unsigned int i = 0; i < depth; i++) { done.push(i); } + + if (opt_verbose) { + GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n", + sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size); + } + } + + ~ggml_hexagon_opqueue() { + delete shm_buf; + } + + // push new batch + bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) { + static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8"); + static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8"); + static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8"); + static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8"); + static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8"); + static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8"); + + if (done.empty()) { return false; } + + req.id = done.front(); done.pop(); // 
batch id + req.n_bufs = op_batch->n_bufs; + req.n_tensors = op_batch->n_tens; + req.n_ops = op_batch->n_ops; + + op_cache[req.id] = op_batch->ops; + start_usec[req.id] = ggml_time_us(); + + const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs; + const size_t t_size = sizeof(htp_tensor) * req.n_tensors; + const size_t o_size = sizeof(htp_op_desc) * req.n_ops; + const size_t p_size = sizeof(htp_prof_desc) * req.n_ops; + + dbuf.ptr = shm_buf->base + (req.id * shm_blk_size); + dbuf.fd = shm_buf->fd; + dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; + dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base; + dbuf.size = b_size + t_size + o_size + p_size; + + GGML_ASSERT(dbuf.size <= shm_blk_size); + + uint8_t * m_ptr = (uint8_t*) dbuf.ptr; + uint8_t * b_ptr = m_ptr; m_ptr += b_size; + uint8_t * t_ptr = m_ptr; m_ptr += t_size; + uint8_t * o_ptr = m_ptr; + + memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size); + memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size); + memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size); + + HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n", + shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem, + b_size, t_size, o_size, (size_t) dbuf.size); + + op_batch->reset(); if (opt_verbose > 1) { htp_buf_desc *b = (htp_buf_desc*) b_ptr; - for (unsigned int i=0; i < n_bufs; i++) { - GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i, + for (unsigned int i=0; i < req.n_bufs; i++) { + GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i, b[i].fd, (void *) b[i].base, (size_t) b[i].size); } htp_tensor *t = (htp_tensor*) t_ptr; - for (unsigned int i=0; i < n_tens; i++) { + for (unsigned int i=0; i < req.n_tensors; i++) { GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : 
%zu:%zu:%zu:%zu\n", - name, i, t[i].bi, t[i].data, t[i].size, + shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size, (size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]); } } - reset(); + return true; + } - return m_size; + void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) { + GGML_ASSERT(rsp.id < op_cache.size()); + + done.push(rsp.id); + + const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs; + const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors; + const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops; + const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops; + + const size_t m_size = b_size + t_size + o_size + p_size; + GGML_ASSERT(m_size <= shm_blk_size); + + HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n", + shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops, + (size_t) dbuf.size, b_size, t_size, o_size); + + uint8_t * m_ptr = (uint8_t*) dbuf.ptr; + uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size); + + if (opt_profile && rsp.n_ops > 0) { + auto & ops = op_cache[rsp.id]; + + uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id]; + uint32_t htp_usec = 0; + + GGML_ASSERT(rsp.n_ops <= ops.size()); + + const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr; + for (uint32_t i = 0; i < rsp.n_ops; i++) { + htp_usec += pd[i].usecs; + ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu); + } + + GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n", + shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec); + } } }; @@ -1824,17 +1898,12 @@ void ggml_hexagon_session::flush_pending(bool all) { GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs); } - op_shm->release((uint8_t*) dbuf.ptr); - if (rsp.status != HTP_STATUS_OK) { GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: 
%s\n", this->c_name(), status_to_str(rsp.status)); // TODO: handle errors } - // FIXME: profile will be per opreq - // this->prof_usecs = rsp.prof_usecs; - // this->prof_cycles = rsp.prof_cycles; - // this->prof_pkts = rsp.prof_pkts; + op_queue->pop(rsp, dbuf); this->op_pending--; // atomic dec @@ -1845,28 +1914,17 @@ void ggml_hexagon_session::flush_pending(bool all) { void ggml_hexagon_session::flush_batch() { if (op_batch->empty()) { return; } - htp_opbatch_req req; - req.n_bufs = op_batch->n_bufs; - req.n_tensors = op_batch->n_tens; - req.n_ops = op_batch->n_ops; + htp_opbatch_req req {}; + dspqueue_buffer dbuf{}; - dspqueue_buffer dbuf; - dbuf.fd = op_shm->fd(); - dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; - dbuf.ptr = op_shm->allocate(); - if (!dbuf.ptr) { + if (!op_queue->push(req, dbuf, op_batch)) { flush_pending(false); - dbuf.ptr = op_shm->allocate(); + op_queue->push(req, dbuf, op_batch); } - dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base(); - dbuf.size = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size); - // Bump pending flag (cleared in the session::flush once we get the response) this->op_pending++; // atomic inc - HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size); - int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT); if (err != 0) { GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err); @@ -2016,25 +2074,33 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { } if (opt_etm) { - err = htp_iface_enable_etm(this->handle); + err = htp_iface_etm(this->handle, 1); if (err != 0) { GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err); } } - // Start the DSP-side service. 
We need to pass the queue ID to the - // DSP in a FastRPC call; the DSP side will import the queue and start - // listening for packets in a callback. + if (opt_profile) { + htp_iface_pmu_conf pmu_conf{}; + std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events); + + err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf); + if (err != 0) { + GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err); + } + } + + // Allocate buffers and state for op batching + this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch); + this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue); + + // Start processing op batch requests err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx); if (err != 0) { GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err); throw std::runtime_error("ggml-hex: iface start failed (see log for details)"); } this->valid_iface = true; - - // Allocate buffers and state for op batching - this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch); - this->op_shm = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue); } void ggml_hexagon_session::release() noexcept(true) { @@ -2043,7 +2109,7 @@ void ggml_hexagon_session::release() noexcept(true) { int err; delete this->op_batch; - delete this->op_shm; + delete this->op_queue; // Stop the DSP-side service and close the queue if (this->valid_iface) { @@ -2054,12 +2120,20 @@ void ggml_hexagon_session::release() noexcept(true) { } if (opt_etm) { - err = htp_iface_disable_etm(this->handle); + err = htp_iface_etm(this->handle, 0); if (err != 0) { GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err); } } + if (opt_profile) { + htp_iface_pmu_conf pmu_conf{}; + err = htp_iface_profiler(this->handle, 0, &pmu_conf); + if (err != 0) { + GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err); + } + } + if (this->valid_queue) { err = 
dspqueue_close(queue); if (err != 0) { @@ -2077,7 +2151,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n repack_buffer_type.device = dev; op_batch = nullptr; - op_shm = nullptr; + op_queue = nullptr; try { allocate(dev_id); @@ -2698,7 +2772,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * n = graph->nodes[i]; - if (op_is_compute(n)) { + if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) { sess->enqueue_op(op_remap_to_htp(n), n); } } @@ -3338,6 +3412,26 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons return NULL; } +template std::vector str_to_vec(const char* str) { + std::stringstream ss(str); + std::vector v; + std::string t; + + while (std::getline(ss, t, ',')) { + v.push_back(std::stoul(t, nullptr, 0)); + } + + return v; +} + +template std::string vec_to_str(std::vector v) { + std::stringstream ss; + ss << std::setbase(BASE) << std::showbase; + for (auto i : v) { ss << i << ','; } + auto str = ss.str(); str.pop_back(); // drop last comma + return str; +} + static void ggml_hexagon_init(ggml_backend_reg * reg) { // Basic sanity checks to make sure definitions match static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0, @@ -3351,8 +3445,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE"); const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF"); - const char * str_opmask = getenv("GGML_HEXAGON_OPMASK"); - const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC"); + const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE"); const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH"); const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE"); const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER"); @@ -3365,19 +3458,30 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { auto RE_ICASE = 
std::regex_constants::icase; - opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL; - opt_verbose = str_verbose ? atoi(str_verbose) : 0; - opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; - opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask; - opt_opsync = str_opsync ? atoi(str_opsync) : opt_opsync; - opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch; - opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue; - opt_profile = str_profile ? atoi(str_profile) : 0; - opt_etm = str_etm ? atoi(str_etm) : 0; - opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; - opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx; - opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev; - opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; + opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL; + opt_verbose = str_verbose ? atoi(str_verbose) : 0; + opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; + opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage; + opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch; + opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue; + opt_etm = str_etm ? atoi(str_etm) : 0; + opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; + opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx; + opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev; + opt_hostbuf = str_hostbuf ? 
atoi(str_hostbuf) : opt_hostbuf; + + if (str_profile) { + opt_pmu_evt = [&]() -> std::vector { + auto v = str_to_vec(str_profile); + switch (v.size()) { + case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events + case 8: opt_profile = 2; return v; // mode with custom pmu events + default: opt_profile = 0; return {}; // garbage input + }}(); + if (opt_profile == 1) opt_pmu_evt = {}; + GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile, + vec_to_str(opt_pmu_evt).c_str()); + } if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) { opt_ndev = GGML_HEXAGON_MAX_SESSIONS; diff --git a/ggml/src/ggml-hexagon/htp/hex-utils.h b/ggml/src/ggml-hexagon/htp/hex-utils.h index f6713c5cf..329249e11 100644 --- a/ggml/src/ggml-hexagon/htp/hex-utils.h +++ b/ggml/src/ggml-hexagon/htp/hex-utils.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "hexagon_types.h" #include "hexagon_protos.h" @@ -100,4 +101,31 @@ static inline void hex_pause() { asm volatile(" pause(#255)\n"); } +#ifndef HEX_NUM_PMU_COUNTERS +#define HEX_NUM_PMU_COUNTERS 8 +#endif + +static inline void hex_get_pmu(uint32_t counters[]) { +#if __HVX_ARCH__ >= 79 + asm volatile("%0 = upmucnt0" : "=r"(counters[0])); + asm volatile("%0 = upmucnt1" : "=r"(counters[1])); + asm volatile("%0 = upmucnt2" : "=r"(counters[2])); + asm volatile("%0 = upmucnt3" : "=r"(counters[3])); + asm volatile("%0 = upmucnt4" : "=r"(counters[4])); + asm volatile("%0 = upmucnt5" : "=r"(counters[5])); + asm volatile("%0 = upmucnt6" : "=r"(counters[6])); + asm volatile("%0 = upmucnt7" : "=r"(counters[7])); +#else + counters[0] = qurt_pmu_get(QURT_PMUCNT0); + counters[1] = qurt_pmu_get(QURT_PMUCNT1); + counters[2] = qurt_pmu_get(QURT_PMUCNT2); + counters[3] = qurt_pmu_get(QURT_PMUCNT3); + counters[4] = qurt_pmu_get(QURT_PMUCNT4); + counters[5] = qurt_pmu_get(QURT_PMUCNT5); + counters[6] = qurt_pmu_get(QURT_PMUCNT6); + counters[7] = qurt_pmu_get(QURT_PMUCNT7); + // qurt_pmu_get_pmucnt(counters); +#endif +} 
+ #endif /* HEX_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 78455e6b0..f8c89211a 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -10,6 +10,7 @@ #include #include #include +#include #define HTP_MAX_NTHREADS 10 #define HTP_MAX_MMAPS 16 @@ -66,7 +67,9 @@ struct htp_context { int thread_id; int thread_prio; - int hmx_enabled; + bool hmx_enabled; + bool etm; + uint32_t profiler; uint8_t * vtcm_base; size_t vtcm_size; diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 62d6ec022..56d7b398d 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -42,9 +42,9 @@ enum htp_data_type { // Mask to enable various stages of the Ops. // Used for debugging and profiling. -enum htp_op_mask { - HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP) - HTP_OPMASK_COMPUTE = (1 << 1), // Enable Compute +enum htp_op_stage { + HTP_OPSTAGE_QUEUE = (1 << 0), // Enable Queueing (ie calls into NPU) + HTP_OPSTAGE_COMPUTE = (1 << 1), // Enable Compute }; // Do not reorder first 4 (used as an index) @@ -137,27 +137,45 @@ struct htp_op_desc { int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. 
epsilon of RMS norm uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices uint16_t dst; // Output tensor index +}; - // the rest is filled in-place by the NPU - uint32_t prof_usecs; // Number of usec per request - uint32_t prof_cycles; // Number of cycles per request - uint32_t prof_pkts; // Number of instruction packets per request - uint32_t unused; +enum htp_profiler_mode { + HTP_PROF_DISABLED = 0, + HTP_PROF_BASIC = 1, + HTP_PROF_PMU = 2, +}; + +#define HTP_PROF_PMU_NCNT 8 + +// Profile descriptor +struct htp_prof_desc { + uint32_t opcode; // GGML/HTP Op + uint32_t usecs; // Number of usec + uint32_t cycles; // Number of cycles + uint32_t pad; // Unused + uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters }; struct htp_opbatch_req { + uint32_t id; // Batch id uint32_t n_bufs; // Number of buffers uint32_t n_tensors; // Number of tensors uint32_t n_ops; // Number of ops uint32_t flags; // unused + uint32_t pad; // unused // struct htp_buf_desc bufs[]; -- dspqueue buf 0 // struct htp_tensor tensors[]; -- dspqueue buf 0 // struct htp_op_desc ops[]; -- dspqueue buf 0 }; struct htp_opbatch_rsp { + uint32_t id; // Batch id uint32_t status; // HTP_STATUS_... 
- // struct htp_op_req ops[]; -- dspqueue buf 0 + uint32_t n_bufs; // Number of buffers + uint32_t n_tensors; // Number of tensors + uint32_t n_ops; // Number of op profile descriptors + uint32_t pad; // unused + // struct htp_prof_desc profs[]; -- dspqueue buf 0 }; #endif /* HTP_OPS_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp_iface.idl b/ggml/src/ggml-hexagon/htp/htp_iface.idl index 3eb5d5a69..dbcafd1d8 100644 --- a/ggml/src/ggml-hexagon/htp/htp_iface.idl +++ b/ggml/src/ggml-hexagon/htp/htp_iface.idl @@ -6,13 +6,17 @@ #include "AEEStdDef.idl" #include "remote.idl" +struct htp_iface_pmu_conf { + uint32 events[8]; +}; + interface htp_iface : remote_handle64 { AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx); AEEResult stop(); AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned); AEEResult munmap(in uint32 fd); - AEEResult enable_etm(); - AEEResult disable_etm(); + AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu); + AEEResult etm(in uint32 enable); }; #endif /* HTP_IDL */ diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 9185c9ffe..088434a63 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -27,6 +27,7 @@ #include "htp-ctx.h" #include "htp-ops.h" #include "htp-ops.h" +#include "htp_iface.h" #include "worker-pool.h" AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { @@ -103,6 +104,54 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { return AEE_SUCCESS; } +AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) { + int err = enable ? 
HAP_user_etm_enable() : HAP_user_etm_disable(); + if (err) { + if (err == AEE_EVERSIONNOTSUPPORT) { + FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n"); + } else { + FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err); + } + } + return err; +} + +AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) { + struct htp_context * ctx = (struct htp_context *) handle; + if (!ctx) { + return AEE_EBADPARM; + } + + if (mode == HTP_PROF_PMU) { + const uint32_t* events = pmu_conf->events; + + // Pack 4 event IDs (low 8 bits) into each 32-bit config register + uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0; + for (; i < HEX_NUM_PMU_COUNTERS/2; i++) { + evtcfg |= ((events[i + 0] & 0xFF) << (i * 8)); + evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8)); + } + + // For events >255 pack high 2 bits of all 8 event IDs into cfg register + // 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc. 
+ for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) { + cfg |= (((events[i] >> 8) & 3) << (i * 2)); + } + + FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg); + + // Configure PMU registers + qurt_pmu_set(QURT_PMUCFG, cfg); + qurt_pmu_set(QURT_PMUEVTCFG, evtcfg); + qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1); + qurt_pmu_enable(1); + } + + ctx->profiler = mode; + + return AEE_SUCCESS; +} + AEEResult htp_iface_close(remote_handle64 handle) { struct htp_context * ctx = (struct htp_context *) handle; @@ -129,35 +178,19 @@ AEEResult htp_iface_close(remote_handle64 handle) { } } + if (ctx->profiler) { + qurt_pmu_enable(0); + } + + if (ctx->etm) { + HAP_user_etm_disable(); + } + free(ctx); return AEE_SUCCESS; } -AEEResult htp_iface_enable_etm(remote_handle64 handle) { - int err = HAP_user_etm_enable(); - if (err) { - if (err == AEE_EVERSIONNOTSUPPORT) { - FARF(ERROR, "API HAP_user_etm_enable is not supported\n"); - } else { - FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err); - } - } - return err; -} - -AEEResult htp_iface_disable_etm(remote_handle64 handle) { - int err = HAP_user_etm_disable(); - if (err) { - if (err == AEE_EVERSIONNOTSUPPORT) { - FARF(ERROR, "API HAP_user_etm_disable is not supported\n"); - } else { - FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err); - } - } - return err; -} - -AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) { +AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32 pinned) { struct htp_context * ctx = (struct htp_context *) handle; if (!ctx) { return AEE_EBADPARM; @@ -204,7 +237,7 @@ AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t return AEE_ENOMEMORY; } -AEEResult htp_iface_munmap(remote_handle64 handle, int fd) { +AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) { struct htp_context * ctx = (struct htp_context *)
handle; if (!ctx) { return AEE_EBADPARM; @@ -434,19 +467,39 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) { struct profile_data { uint64_t usecs; uint64_t cycles; - uint64_t pkts; + uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS]; }; -static inline void profile_start(struct profile_data * d) { - d->usecs = HAP_perf_get_qtimer_count(); - d->cycles = hex_get_cycles(); - d->pkts = hex_get_pktcnt(); +static inline void profile_start(uint32_t mode, struct profile_data * d) { + switch (mode) { + case HTP_PROF_PMU: + hex_get_pmu(d->pmu_counters); + // fallthrough + case HTP_PROF_BASIC: + d->usecs = HAP_perf_get_qtimer_count(); + d->cycles = hex_get_cycles(); + break; + default: + break; + } } -static inline void profile_stop(struct profile_data * d) { - d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs); - d->cycles = hex_get_cycles() - d->cycles; - d->pkts = hex_get_pktcnt() - d->pkts; +static inline void profile_stop(uint32_t mode, struct profile_data * d) { + uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS]; + switch (mode) { + case HTP_PROF_PMU: + hex_get_pmu(pmu_counters); + for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) { + d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i]; + } + // fallthrough + case HTP_PROF_BASIC: + d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs); + d->cycles = hex_get_cycles() - d->cycles; + break; + default: + break; + } } static int execute_op(struct htp_ops_context * octx) { @@ -726,30 +779,33 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { continue; } + // Reset poll count for valid requests + poll_count = DSPQUEUE_POLL_COUNT; + const uint32_t n_bufs = req.n_bufs; const uint32_t n_tens = req.n_tensors; const uint32_t n_ops = req.n_ops; - const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs; - const uint32_t t_size = sizeof(struct htp_tensor) * n_tens; - const uint32_t o_size = sizeof(struct htp_op_desc) * 
n_ops; + const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs; + const uint32_t t_size = sizeof(struct htp_tensor) * n_tens; + const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops; + const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops; - if (dbuf.size < b_size + t_size + o_size) { + if (dbuf.size < b_size + t_size + o_size + p_size) { FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size); break; } - // Reset poll count for valid requests - poll_count = DSPQUEUE_POLL_COUNT; - - uint8_t * m_ptr = dbuf.ptr; - struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size; - struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size; - struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; - - FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", + FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id, n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size); + // Setup descriptor pointers + uint8_t * m_ptr = dbuf.ptr; + struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size; + struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size; + struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; m_ptr += o_size; + struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr; + prep_op_bufs(ctx, bufs, n_bufs); prep_tensors(ctx, bufs, tens, n_tens); @@ -760,22 +816,34 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { for (uint32_t i=0; i < n_ops; i++) { struct profile_data prof; - profile_start(&prof); + + profile_start(ctx->profiler, &prof); proc_op_req(octx, tens, i, &ops[i]); - profile_stop(&prof); - ops[i].prof_usecs = prof.usecs; - ops[i].prof_cycles = prof.cycles; - ops[i].prof_pkts = prof.pkts; + profile_stop(ctx->profiler, &prof); + + if (ctx->profiler) { + pds[i].opcode = ops[i].opcode; + pds[i].usecs = prof.usecs; + pds[i].cycles = 
prof.cycles; + for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) { + pds[i].pmu[j] = prof.pmu_counters[j]; + } + } } // dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0); struct htp_opbatch_rsp rsp; - rsp.status = HTP_STATUS_OK; // FIXME + rsp.id = req.id; + rsp.status = HTP_STATUS_OK; + rsp.n_bufs = n_bufs; + rsp.n_tensors = n_tens; + rsp.n_ops = n_ops; dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; + err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE); if (err != 0) { FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err); diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index bac06693d..a0c265132 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -3017,6 +3017,10 @@ int op_matmul(struct htp_ops_context * octx) { const int act_stride = (int)(src1->nb[1] / sizeof(float)); const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16)); + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; + } + if (src0->type == HTP_TYPE_F16) { if (is_batched) { hmx_matmul_w16a32_batched_params_t batch_params = { diff --git a/scripts/snapdragon/adb/run-bench.sh b/scripts/snapdragon/adb/run-bench.sh index 36c908da7..27459df24 100755 --- a/scripts/snapdragon/adb/run-bench.sh +++ b/scripts/snapdragon/adb/run-bench.sh @@ -23,10 +23,10 @@ verbose= [ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v" profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v" +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v" opmask= -[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" +[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE" nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" diff --git a/scripts/snapdragon/adb/run-cli.sh 
b/scripts/snapdragon/adb/run-cli.sh index 901d7eff1..e1f0ac0eb 100755 --- a/scripts/snapdragon/adb/run-cli.sh +++ b/scripts/snapdragon/adb/run-cli.sh @@ -28,10 +28,10 @@ sched= [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v" +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v" opmask= -[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" +[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE" nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" diff --git a/scripts/snapdragon/adb/run-completion.sh b/scripts/snapdragon/adb/run-completion.sh index f7290825a..7b84106dc 100755 --- a/scripts/snapdragon/adb/run-completion.sh +++ b/scripts/snapdragon/adb/run-completion.sh @@ -28,10 +28,10 @@ sched= [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v" +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v" opmask= -[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" +[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE" nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" diff --git a/scripts/snapdragon/adb/run-mtmd.sh b/scripts/snapdragon/adb/run-mtmd.sh index 0c1cf8928..38467beba 100755 --- a/scripts/snapdragon/adb/run-mtmd.sh +++ b/scripts/snapdragon/adb/run-mtmd.sh @@ -37,10 +37,10 @@ sched= [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" opmask= -[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" +[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE" nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" diff 
--git a/scripts/snapdragon/adb/run-tool.sh b/scripts/snapdragon/adb/run-tool.sh index 70ed407e8..27cbb2b6d 100755 --- a/scripts/snapdragon/adb/run-tool.sh +++ b/scripts/snapdragon/adb/run-tool.sh @@ -25,10 +25,10 @@ sched= [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" opmask= -[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" +[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE" nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" diff --git a/scripts/snapdragon/ggml-hexagon-profile.py b/scripts/snapdragon/ggml-hexagon-profile.py new file mode 100755 index 000000000..3edaacd27 --- /dev/null +++ b/scripts/snapdragon/ggml-hexagon-profile.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 + +import sys +import os +import re +import argparse +import statistics +import logging + +from collections import defaultdict + +# Mapping of cli-friendly names to (internal_data_key, Display Header, numeric_sort_key) +COL_MAP = { + "op": ("op", "Op", "op"), + "dims": ("dims", "Dims", "dims"), + "dtypes": ("dtypes", "DTypes", "dtypes"), + "count": ("count", "Count", "_sort_count"), + "max-usec": ("max_usec", "Max usec", "_sort_max_usec"), + "avg-usec": ("avg_usec", "Avg usec", "_sort_avg_usec"), + "max-cycles": ("max_cycles", "Max Cycles", "_sort_max_cycles"), + "avg-cycles": ("avg_cycles", "Avg Cycles", "_sort_avg_cycles"), + "max-pmu": ("max_pmu", "Max PMU", "_sort_max_pmu"), + "avg-pmu": ("avg_pmu", "Avg PMU", "_sort_avg_pmu"), +} + +op_pattern = re.compile( + r"profile-op\s+(?P[A-Z_0-9]+):\s+.*?\s+:\s+(?P[\d:x\s\->!]+)\s+:\s+(?P[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P\d+)\s+cycles\s+(?P\d+)(?:\s+pmu\s+\[(?P[\d,\s]+)\])?" 
+) + +logger = logging.getLogger("ggml-hexagon-profile") + + +def parse_log(file_path, pmu_index=None): + try: + if file_path != "-": + f = open(file_path, 'r', encoding='utf-8', errors='ignore') + else: + f = os.fdopen(0, 'r', encoding='utf-8', errors='ignore') + except FileNotFoundError: + logger.error(f"file '{file_path}' not found.") + sys.exit(1) + + all_ops = [] + for line in f: + match = op_pattern.search(line) + if not match: continue + + pmu_raw = match.group('pmu') + pmu_val = None + if pmu_raw and pmu_index is not None: + try: + pmu_list = [int(x.strip()) for x in pmu_raw.split(',')] + if len(pmu_list) > pmu_index: + pmu_val = pmu_list[pmu_index] + except (ValueError, IndexError): + pmu_val = None + + all_ops.append({ + 'name': match.group('op_name'), + 'dims': match.group('dims').strip(), + 'types': match.group('types').strip(), + 'usec': int(match.group('usec')), + 'cycles': int(match.group('cycles')), + 'pmu_val': pmu_val + }) + + f.close() + + return all_ops + + +def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None): + if not ops: + logger.info("No valid records found.") + return + + grouped = defaultdict(list) + for op in ops: + key = (op['name'], op['dims'], op['types']) + grouped[key].append(op) + + group_stats = [] + for (name, dims, types), group_ops in grouped.items(): + usecs = [o['usec'] for o in group_ops] + cycles = [o['cycles'] for o in group_ops] + pmu_vals = [o['pmu_val'] for o in group_ops if o['pmu_val'] is not None] + + group_stats.append({ + 'op': name, + 'dims': dims, + 'dtypes': types, + 'count': str(len(group_ops)), + 'max_usec': str(max(usecs)), + 'avg_usec': f"{statistics.mean(usecs):.2f}", + 'max_cycles': str(max(cycles)), + 'avg_cycles': f"{statistics.mean(cycles):.2f}", + 'max_pmu': str(max(pmu_vals)) if pmu_vals else "0", + 'avg_pmu': f"{statistics.mean(pmu_vals):.2f}" if pmu_vals else "0.00", + # Numeric values for accurate sorting + '_sort_count': len(group_ops), + '_sort_max_usec': max(usecs), + 
'_sort_avg_usec': statistics.mean(usecs), + '_sort_max_cycles': max(cycles), + '_sort_avg_cycles': statistics.mean(cycles), + '_sort_max_pmu': max(pmu_vals) if pmu_vals else 0, + '_sort_avg_pmu': statistics.mean(pmu_vals) if pmu_vals else 0 + }) + + # Sorting logic + actual_sort_key = COL_MAP[sort_col][2] + # We sort numeric fields descending, strings (op/dims) ascending + is_numeric = actual_sort_key.startswith("_") or actual_sort_key == "count" + sorted_groups = sorted(group_stats, key=lambda x: x[actual_sort_key], reverse=is_numeric)[:top_n] + + # Define initial column order + active_cols = ["op", "dims", "dtypes"] + if pmu_name: + active_cols += ["max-pmu", "avg-pmu"] + active_cols += ["max-usec", "avg-usec", "max-cycles", "avg-cycles", "count"] + + final_headers, final_keys, final_widths = [], [], [] + + for col_name in active_cols: + data_key, header_text, _ = COL_MAP[col_name] + if "pmu" in col_name and pmu_name: + header_text = header_text.replace("PMU", pmu_name) + + natural_width = max([len(row[data_key]) for row in sorted_groups] + [len(header_text)]) + target_width = width_overrides.get(col_name, natural_width) + + if target_width == 0: + continue + + final_headers.append(header_text) + final_keys.append(data_key) + final_widths.append(target_width) + + # Print Report + logger.info(f"\n# Profile Report (Top {top_n} Ops sorted by {sort_col})\n") + header_line = "| " + " | ".join(f"{h:<{final_widths[i]}}" for i, h in enumerate(final_headers)) + " |" + sep_line = "| " + " | ".join("-" * final_widths[i] for i in range(len(final_headers))) + " |" + logger.info(header_line) + logger.info(sep_line) + + for group in sorted_groups: + row_vals = [] + for i, key in enumerate(final_keys): + val = group[key] + if len(val) > final_widths[i]: + val = val[:final_widths[i] - 3] + "..." 
+ row_vals.append(f"{val:<{final_widths[i]}}") + logger.info("| " + " | ".join(row_vals) + " |") + + +def main(): + parser = argparse.ArgumentParser(description="Post-process Op profile info.") + parser.add_argument("logfile") + parser.add_argument("-n", "--top", type=int, default=100) + parser.add_argument("--sort", type=str, default="max-usec", choices=list(COL_MAP.keys())) + parser.add_argument("--pmu-index", type=int) + parser.add_argument("--pmu-name", type=str) + parser.add_argument("--width", action='append', default=['dims:40'], help="Override column width, e.g. --width dims:50") + + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format='%(message)s') + + # Sort validation: can't sort by PMU if index isn't provided + if "pmu" in args.sort and args.pmu_index is None: + logger.error(f"Cannot sort by '{args.sort}' without --pmu-index.") + sys.exit(1) + + overrides = {} + if args.width: + for w in args.width: + try: + name, val = w.split(':') + overrides[name.lower()] = int(val) + except ValueError: + logger.warning(f"Invalid width format '{w}'") + + final_pmu_name = (args.pmu_name or f"#{args.pmu_index}") if args.pmu_index is not None else None + ops = parse_log(args.logfile, pmu_index=args.pmu_index) + generate_report(ops, args.top, overrides, args.sort, pmu_name=final_pmu_name) + + +if __name__ == "__main__": + main() diff --git a/scripts/snapdragon/windows/run-bench.ps1 b/scripts/snapdragon/windows/run-bench.ps1 index 5a3a9074d..8bf6939d2 100644 --- a/scripts/snapdragon/windows/run-bench.ps1 +++ b/scripts/snapdragon/windows/run-bench.ps1 @@ -21,11 +21,11 @@ if ($null -ne $env:V) { } if ($null -ne $env:PROF) { - $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 + $env:GGML_HEXAGON_PROFILE=$env:PROF } -if ($null -ne $env:OPMASK) { - $env:GGML_HEXAGON_OPMASK=$env:OPMASK +if ($null -ne $env:OPSTAGE) { + $env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE } if ($null -ne $env:NHVX) { diff --git a/scripts/snapdragon/windows/run-cli.ps1 
b/scripts/snapdragon/windows/run-cli.ps1 index c64aaf725..104452f9b 100644 --- a/scripts/snapdragon/windows/run-cli.ps1 +++ b/scripts/snapdragon/windows/run-cli.ps1 @@ -25,11 +25,11 @@ if ($null -ne $env:SCHED) { } if ($null -ne $env:PROF) { - $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 + $env:GGML_HEXAGON_PROFILE=$env:PROF } -if ($null -ne $env:OPMASK) { - $env:GGML_HEXAGON_OPMASK=$env:OPMASK +if ($null -ne $env:OPSTAGE) { + $env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE } if ($null -ne $env:NHVX) { diff --git a/scripts/snapdragon/windows/run-completion.ps1 b/scripts/snapdragon/windows/run-completion.ps1 index a896cd352..5841a82fa 100644 --- a/scripts/snapdragon/windows/run-completion.ps1 +++ b/scripts/snapdragon/windows/run-completion.ps1 @@ -25,11 +25,11 @@ if ($null -ne $env:SCHED) { } if ($null -ne $env:PROF) { - $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 + $env:GGML_HEXAGON_PROFILE=$env:PROF } -if ($null -ne $env:OPMASK) { - $env:GGML_HEXAGON_OPMASK=$env:OPMASK +if ($null -ne $env:OPSTAGE) { + $env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE } if ($null -ne $env:NHVX) { diff --git a/scripts/snapdragon/windows/run-mtmd.ps1 b/scripts/snapdragon/windows/run-mtmd.ps1 index f230ac5a6..be8178751 100644 --- a/scripts/snapdragon/windows/run-mtmd.ps1 +++ b/scripts/snapdragon/windows/run-mtmd.ps1 @@ -34,11 +34,11 @@ if ($null -ne $env:SCHED) { } if ($null -ne $env:PROF) { - $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 + $env:GGML_HEXAGON_PROFILE=$env:PROF } -if ($null -ne $env:OPMASK) { - $env:GGML_HEXAGON_OPMASK=$env:OPMASK +if ($null -ne $env:OPSTAGE) { + $env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE } if ($null -ne $env:NHVX) { diff --git a/scripts/snapdragon/windows/run-tool.ps1 b/scripts/snapdragon/windows/run-tool.ps1 index 39edbfcf7..15c880f2d 100644 --- a/scripts/snapdragon/windows/run-tool.ps1 +++ b/scripts/snapdragon/windows/run-tool.ps1 @@ -31,11 +31,11 @@ if ($null -ne $env:SCHED) { } if ($null -ne $env:PROF) { - 
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 + $env:GGML_HEXAGON_PROFILE=$env:PROF } -if ($null -ne $env:OPMASK) { - $env:GGML_HEXAGON_OPMASK=$env:OPMASK +if ($null -ne $env:OPSTAGE) { + $env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE } if ($null -ne $env:NHVX) {