hexagon: add support for basic and extended Op profiling (#22269)

* hexagon: restore HTP_OPMASK_QUEUE

* hexagon: honor OPMASK_SKIP_COMPUTE in hmx-matmul

* hex-prof: restore op profiling

* hex-prof: enable PMU

* hexagon: simplify and improve op-queuing with full profiling support

Add separate profile descriptors.

* hexagon: remove opsync and rename opmask into opstage

opsync is no longer needed since the profiler is fully async now.
opmask name was confusing and opstage is more accurate.

* hexagon: refactor opbatch queue handling

* hexagon: add iface hooks for enabling profiler from the host

Also move all the PMU setup stuff out of the hex-utils since it's not intended for normal use.

* hexagon: make profiler mode configurable

On older devices getting PMU counters is expensive so it's now optional.

* hexagon: add support for setting profiler pmu events from env

* hexagon: simplify profiler output (no need to print buffs, etc)

* hexagon: simplify pmu counter formatting

* hexagon: add a simple profile post-proc tool

* hex-prof: add support for reading logs from stdin

* hexagon: document GGML_HEXAGON_PROFILE

* hex-prof: update default width for dims field

* hex-prof: fix linter warnings and errors

* Update ggml/src/ggml-hexagon/htp/htp-ops.h

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update scripts/snapdragon/ggml-hexagon-profile.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Trivikram Reddy <tamarnat@qti.qualcomm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
Max Krasnyansky 2026-04-23 14:17:21 -07:00 committed by GitHub
parent 187a456370
commit 5d2b52d80d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 671 additions and 245 deletions

View file

@ -249,18 +249,27 @@ build: 6a8cf8914 (6733)
```
- `GGML_HEXAGON_PROFILE=1`
Generates a host-side profile for the ggml-hexagon Ops.
Enables Op profiling:
- `GGML_HEXAGON_OPMASK=0x0`
Allows enabling specific stages of the processing pipeline:
- `1` Basic profile with per-op `usecs` and `cycles` counters
- `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
- `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
The logging output can be either saved into a file for post-processing or it can be piped directly into the post-processing tool to generate the report.
Examples:
`GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
- `GGML_HEXAGON_OPSTAGE=0x0`
Allows enabling specific stages of the Op processing pipeline:
- `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
- `0x2` Enable Op Compute (MUL_MAT, etc.)
Examples:
`GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
`GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
`GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
- `GGML_HEXAGON_OPFILTER=regex`
Allows filtering (disabling) Ops that match the regex pattern:

View file

@ -12,9 +12,12 @@
#include <cstddef>
#include <stdexcept>
#include <string>
#include <sstream>
#include <iomanip>
#include <unordered_set>
#include <unordered_map>
#include <regex>
#include <queue>
#ifdef _WIN32
# include <sal.h>
@ -41,18 +44,26 @@
#include "htp_iface.h"
#include "htp-drv.h"
using intvec = std::vector<int>;
using uintvec = std::vector<unsigned int>;
using u32vec = std::vector<uint32_t>;
static size_t opt_ndev = 1;
static size_t opt_nhvx = 0; // use all
static int opt_arch = 0; // autodetect
static int opt_etm = 0;
static int opt_verbose = 0;
static int opt_profile = 0;
static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
static int opt_hostbuf = 1; // hostbuf ON by default
static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
// Default PMU events, if profiling with PMU (mode=2) is enabled
// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
// https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
// Enable all stages by default
static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE;
static int opt_opsync = 0; // synchronous ops
static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
static int opt_opbatch = 1024; // max number of ops in a batch
static int opt_opqueue = 16; // max number of pending batches
static std::regex* opt_opfilter = NULL; // regex of ops to not claim
@ -104,19 +115,26 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct
}
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
if (!opt_profile) return;
op_desc desc(op);
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
char pmu_str[256] = "";
if (opt_profile > 1) {
static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
}
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
}
// ** backend sessions
struct ggml_hexagon_opbatch;
struct ggml_hexagon_opshm;
struct ggml_hexagon_opqueue;
struct ggml_hexagon_session {
std::string name;
@ -132,8 +150,8 @@ struct ggml_hexagon_session {
bool valid_iface;
std::atomic<int> op_pending;
ggml_hexagon_opbatch *op_batch;
ggml_hexagon_opshm *op_shm;
ggml_hexagon_opbatch* op_batch;
ggml_hexagon_opqueue* op_queue;
ggml_backend_buffer_type buffer_type = {};
ggml_backend_buffer_type repack_buffer_type = {};
@ -1521,65 +1539,14 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
// Backend session implementation
struct ggml_hexagon_opshm {
ggml_hexagon_shared_buffer *sbuf;
std::vector<bool> block_mask;
size_t block_size;
uint8_t * base() const { return this->sbuf->base; }
int fd() const { return this->sbuf->fd; }
size_t n_blocks() const { return this->block_mask.size(); }
ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) {
size_t n_bufs = HTP_OP_MAX_BUFS;
size_t n_ops = max_batch;
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
block_mask.resize(max_pending, true);
block_size = sizeof(htp_buf_desc) * n_bufs +
sizeof(htp_tensor) * n_tensors +
sizeof(htp_op_desc) * n_ops;
sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */);
if (opt_verbose) {
GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n",
sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending);
}
}
~ggml_hexagon_opshm() {
delete sbuf;
}
uint8_t * allocate() {
auto it = std::find(block_mask.begin(), block_mask.end(), true);
if (it == block_mask.end())
return nullptr;
unsigned int i = std::distance(block_mask.begin(), it);
uint8_t* addr = sbuf->base + (i * block_size);
block_mask[i] = false;
HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
return addr;
}
void release(uint8_t * addr) {
int i = (addr - sbuf->base) / block_size;
block_mask[i] = true;
HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
}
};
struct ggml_hexagon_opbatch {
const char* name;
ggml_hexagon_session* sess;
std::vector<htp_buf_desc> buffers;
std::vector<htp_tensor> tensors;
std::vector<htp_op_desc> ops;
std::vector<const ggml_tensor*> ops; // pointers to original ops
std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
std::vector<htp_tensor> h_tens; // htp tensor descriptors
std::vector<htp_op_desc> h_ops; // htp op descriptors
std::unordered_map<int, int> b_map; // buffer fd to index
std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
@ -1606,19 +1573,21 @@ struct ggml_hexagon_opbatch {
d_map.clear();
}
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) {
name = sess->c_name();
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size) {
this->sess = sess;
n_bufs_max = HTP_OP_MAX_BUFS;
n_ops_max = max_batch;
n_ops_max = batch_size;
n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
b_vmem_max = HTP_OP_MAX_VMEM;
buffers.resize(n_bufs_max);
tensors.resize(n_tens_max);
ops.resize(n_ops_max);
h_bufs.resize(n_bufs_max);
h_tens.resize(n_tens_max);
h_ops.resize(n_ops_max);
b_map.reserve(n_bufs_max);
t_map.reserve(n_tens_max);
d_map.reserve(n_tens_max);
@ -1640,7 +1609,7 @@ struct ggml_hexagon_opbatch {
b_map.insert({sbuf->fd, bi});
htp_buf_desc &b = buffers[bi];
htp_buf_desc &b = h_bufs[bi];
b.base = (uint64_t) sbuf->base;
b.fd = sbuf->fd;
b.size = sbuf->size;
@ -1664,7 +1633,7 @@ struct ggml_hexagon_opbatch {
// First lookup by tensor data
auto range = d_map.equal_range(t->data);
for (auto it = range.first; it != range.second; ++it) {
htp_tensor * h = &tensors[it->second];
htp_tensor * h = &h_tens[it->second];
if (same_shape(h, t)) { return it->second; }
}
@ -1682,7 +1651,7 @@ struct ggml_hexagon_opbatch {
uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
size_t t_size = ggml_nbytes(t);
htp_tensor &h = tensors[ti];
htp_tensor &h = h_tens[ti];
h.bi = add_buffer(sbuf);
h.data = t_offset;
h.size = t_size;
@ -1737,65 +1706,170 @@ struct ggml_hexagon_opbatch {
// assumes that fit_op() was called first and returned true
void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
// Add new op
htp_op_desc &o = ops[n_ops++];
unsigned int n = n_ops++;
GGML_ASSERT(n_ops <= n_ops_max);
ops[n] = t;
htp_op_desc &o = h_ops[n];
memcpy(&o.params, &t->op_params, sizeof(t->op_params));
o.opcode = opcode;
o.flags = 0;
if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
}
ggml_hexagon_dump_op_exec(name, t, o.flags);
ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
}
o.dst = add_tensor(t);
}
};
size_t flush(uint8_t * mem_addr, size_t mem_size) {
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
struct ggml_hexagon_opqueue {
// Shared buffer for storing batches
ggml_hexagon_shared_buffer *shm_buf;
size_t shm_blk_size;
const size_t b_size = sizeof(htp_buf_desc) * n_bufs;
const size_t t_size = sizeof(htp_tensor) * n_tens;
const size_t o_size = sizeof(htp_op_desc) * n_ops;
using opvec = std::vector<const ggml_tensor*>;
const size_t m_size = b_size + t_size + o_size;
GGML_ASSERT(m_size <= mem_size);
std::queue<unsigned int> done; // completed batch ids
std::vector<opvec> op_cache; // per batch op cache
std::vector<uint64_t> start_usec; // per batch start time
uint8_t * b_ptr = (uint8_t *) mem_addr;
uint8_t * t_ptr = (uint8_t *) b_ptr + b_size;
uint8_t * o_ptr = (uint8_t *) t_ptr + t_size;
ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
size_t n_bufs = HTP_OP_MAX_BUFS;
size_t n_ops = batch_size;
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
memcpy(b_ptr, (void *) buffers.data(), b_size);
memcpy(t_ptr, (void *) tensors.data(), t_size);
memcpy(o_ptr, (void *) ops.data(), o_size);
shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
sizeof(htp_tensor) * n_tensors +
sizeof(htp_op_desc) * n_ops +
sizeof(htp_prof_desc) * n_ops;
HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n",
name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size);
shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
op_cache.resize(depth);
start_usec.resize(depth, 0);
// init done queue
for (unsigned int i = 0; i < depth; i++) { done.push(i); }
if (opt_verbose) {
GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
}
}
~ggml_hexagon_opqueue() {
delete shm_buf;
}
// push new batch
bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
if (done.empty()) { return false; }
req.id = done.front(); done.pop(); // batch id
req.n_bufs = op_batch->n_bufs;
req.n_tensors = op_batch->n_tens;
req.n_ops = op_batch->n_ops;
op_cache[req.id] = op_batch->ops;
start_usec[req.id] = ggml_time_us();
const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
dbuf.ptr = shm_buf->base + (req.id * shm_blk_size);
dbuf.fd = shm_buf->fd;
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
dbuf.size = b_size + t_size + o_size + p_size;
GGML_ASSERT(dbuf.size <= shm_blk_size);
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
uint8_t * b_ptr = m_ptr; m_ptr += b_size;
uint8_t * t_ptr = m_ptr; m_ptr += t_size;
uint8_t * o_ptr = m_ptr;
memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
b_size, t_size, o_size, (size_t) dbuf.size);
op_batch->reset();
if (opt_verbose > 1) {
htp_buf_desc *b = (htp_buf_desc*) b_ptr;
for (unsigned int i=0; i < n_bufs; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i,
for (unsigned int i=0; i < req.n_bufs; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
b[i].fd, (void *) b[i].base, (size_t) b[i].size);
}
htp_tensor *t = (htp_tensor*) t_ptr;
for (unsigned int i=0; i < n_tens; i++) {
for (unsigned int i=0; i < req.n_tensors; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
name, i, t[i].bi, t[i].data, t[i].size,
shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
(size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
}
}
reset();
return true;
}
return m_size;
void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
GGML_ASSERT(rsp.id < op_cache.size());
done.push(rsp.id);
const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
const size_t m_size = b_size + t_size + o_size + p_size;
GGML_ASSERT(m_size <= shm_blk_size);
HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
(size_t) dbuf.size, b_size, t_size, o_size);
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
if (opt_profile && rsp.n_ops > 0) {
auto & ops = op_cache[rsp.id];
uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
uint32_t htp_usec = 0;
GGML_ASSERT(rsp.n_ops <= ops.size());
const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
for (uint32_t i = 0; i < rsp.n_ops; i++) {
htp_usec += pd[i].usecs;
ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
}
GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
}
}
};
@ -1824,17 +1898,12 @@ void ggml_hexagon_session::flush_pending(bool all) {
GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
}
op_shm->release((uint8_t*) dbuf.ptr);
if (rsp.status != HTP_STATUS_OK) {
GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
// TODO: handle errors
}
// FIXME: profile will be per opreq
// this->prof_usecs = rsp.prof_usecs;
// this->prof_cycles = rsp.prof_cycles;
// this->prof_pkts = rsp.prof_pkts;
op_queue->pop(rsp, dbuf);
this->op_pending--; // atomic dec
@ -1845,28 +1914,17 @@ void ggml_hexagon_session::flush_pending(bool all) {
void ggml_hexagon_session::flush_batch() {
if (op_batch->empty()) { return; }
htp_opbatch_req req;
req.n_bufs = op_batch->n_bufs;
req.n_tensors = op_batch->n_tens;
req.n_ops = op_batch->n_ops;
htp_opbatch_req req {};
dspqueue_buffer dbuf{};
dspqueue_buffer dbuf;
dbuf.fd = op_shm->fd();
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
dbuf.ptr = op_shm->allocate();
if (!dbuf.ptr) {
if (!op_queue->push(req, dbuf, op_batch)) {
flush_pending(false);
dbuf.ptr = op_shm->allocate();
op_queue->push(req, dbuf, op_batch);
}
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base();
dbuf.size = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size);
// Bump pending flag (cleared in the session::flush once we get the response)
this->op_pending++; // atomic inc
HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
if (err != 0) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
@ -2016,25 +2074,33 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
}
if (opt_etm) {
err = htp_iface_enable_etm(this->handle);
err = htp_iface_etm(this->handle, 1);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
}
}
// Start the DSP-side service. We need to pass the queue ID to the
// DSP in a FastRPC call; the DSP side will import the queue and start
// listening for packets in a callback.
if (opt_profile) {
htp_iface_pmu_conf pmu_conf{};
std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
}
}
// Allocate buffers and state for op batching
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
// Start processing op batch requests
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
}
this->valid_iface = true;
// Allocate buffers and state for op batching
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
this->op_shm = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue);
}
void ggml_hexagon_session::release() noexcept(true) {
@ -2043,7 +2109,7 @@ void ggml_hexagon_session::release() noexcept(true) {
int err;
delete this->op_batch;
delete this->op_shm;
delete this->op_queue;
// Stop the DSP-side service and close the queue
if (this->valid_iface) {
@ -2054,12 +2120,20 @@ void ggml_hexagon_session::release() noexcept(true) {
}
if (opt_etm) {
err = htp_iface_disable_etm(this->handle);
err = htp_iface_etm(this->handle, 0);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
}
}
if (opt_profile) {
htp_iface_pmu_conf pmu_conf{};
err = htp_iface_profiler(this->handle, 0, &pmu_conf);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
}
}
if (this->valid_queue) {
err = dspqueue_close(queue);
if (err != 0) {
@ -2077,7 +2151,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
repack_buffer_type.device = dev;
op_batch = nullptr;
op_shm = nullptr;
op_queue = nullptr;
try {
allocate(dev_id);
@ -2698,7 +2772,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * n = graph->nodes[i];
if (op_is_compute(n)) {
if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
sess->enqueue_op(op_remap_to_htp(n), n);
}
}
@ -3338,6 +3412,26 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
return NULL;
}
template<typename T> std::vector<T> str_to_vec(const char* str) {
std::stringstream ss(str);
std::vector<T> v;
std::string t;
while (std::getline(ss, t, ',')) {
v.push_back(std::stoul(t, nullptr, 0));
}
return v;
}
template<typename T, int BASE=10> std::string vec_to_str(std::vector<T> v) {
std::stringstream ss;
ss << std::setbase(BASE) << std::showbase;
for (auto i : v) { ss << i << ','; }
auto str = ss.str(); str.pop_back(); // drop last comma
return str;
}
static void ggml_hexagon_init(ggml_backend_reg * reg) {
// Basic sanity checks to make sure definitions match
static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
@ -3351,8 +3445,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC");
const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER");
@ -3365,19 +3458,30 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
auto RE_ICASE = std::regex_constants::icase;
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask;
opt_opsync = str_opsync ? atoi(str_opsync) : opt_opsync;
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
opt_profile = str_profile ? atoi(str_profile) : 0;
opt_etm = str_etm ? atoi(str_etm) : 0;
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
opt_etm = str_etm ? atoi(str_etm) : 0;
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
if (str_profile) {
opt_pmu_evt = [&]() -> std::vector<uint32_t> {
auto v = str_to_vec<uint32_t>(str_profile);
switch (v.size()) {
case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
case 8: opt_profile = 2; return v; // mode with custom pmu events
default: opt_profile = 0; return {}; // garbage input
}}();
if (opt_profile == 1) opt_pmu_evt = {};
GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
}
if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
opt_ndev = GGML_HEXAGON_MAX_SESSIONS;

View file

@ -4,6 +4,7 @@
#include <stdbool.h>
#include <stdint.h>
#include <qurt_memory.h>
#include <qurt.h>
#include "hexagon_types.h"
#include "hexagon_protos.h"
@ -100,4 +101,31 @@ static inline void hex_pause() {
asm volatile(" pause(#255)\n");
}
#ifndef HEX_NUM_PMU_COUNTERS
#define HEX_NUM_PMU_COUNTERS 8
#endif
static inline void hex_get_pmu(uint32_t counters[]) {
#if __HVX_ARCH__ >= 79
asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
#else
counters[0] = qurt_pmu_get(QURT_PMUCNT0);
counters[1] = qurt_pmu_get(QURT_PMUCNT1);
counters[2] = qurt_pmu_get(QURT_PMUCNT2);
counters[3] = qurt_pmu_get(QURT_PMUCNT3);
counters[4] = qurt_pmu_get(QURT_PMUCNT4);
counters[5] = qurt_pmu_get(QURT_PMUCNT5);
counters[6] = qurt_pmu_get(QURT_PMUCNT6);
counters[7] = qurt_pmu_get(QURT_PMUCNT7);
// qurt_pmu_get_pmucnt(counters);
#endif
}
#endif /* HEX_UTILS_H */

View file

@ -10,6 +10,7 @@
#include <dspqueue.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdbool.h>
#define HTP_MAX_NTHREADS 10
#define HTP_MAX_MMAPS 16
@ -66,7 +67,9 @@ struct htp_context {
int thread_id;
int thread_prio;
int hmx_enabled;
bool hmx_enabled;
bool etm;
uint32_t profiler;
uint8_t * vtcm_base;
size_t vtcm_size;

View file

@ -42,9 +42,9 @@ enum htp_data_type {
// Mask to enable various stages of the Ops.
// Used for debugging and profiling.
enum htp_op_mask {
HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP)
HTP_OPMASK_COMPUTE = (1 << 1), // Enable Compute
enum htp_op_stage {
HTP_OPSTAGE_QUEUE = (1 << 0), // Enable Queueing (ie calls into NPU)
HTP_OPSTAGE_COMPUTE = (1 << 1), // Enable Compute
};
// Do not reorder first 4 (used as an index)
@ -137,27 +137,45 @@ struct htp_op_desc {
int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices
uint16_t dst; // Output tensor index
};
// the rest is filled in-place by the NPU
uint32_t prof_usecs; // Number of usec per request
uint32_t prof_cycles; // Number of cycles per request
uint32_t prof_pkts; // Number of instruction packets per request
uint32_t unused;
enum htp_profiler_mode {
HTP_PROF_DISABLED = 0,
HTP_PROF_BASIC = 1,
HTP_PROF_PMU = 2,
};
#define HTP_PROF_PMU_NCNT 8
// Profile descriptor
struct htp_prof_desc {
uint32_t opcode; // GGML/HTP Op
uint32_t usecs; // Number of usec
uint32_t cycles; // Number of cycles
uint32_t pad; // Unused
uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters
};
struct htp_opbatch_req {
uint32_t id; // Batch id
uint32_t n_bufs; // Number of buffers
uint32_t n_tensors; // Number of tensors
uint32_t n_ops; // Number of ops
uint32_t flags; // unused
uint32_t pad; // unused
// struct htp_buf_desc bufs[]; -- dspqueue buf 0
// struct htp_tensor tensors[]; -- dspqueue buf 0
// struct htp_op_desc ops[]; -- dspqueue buf 0
};
struct htp_opbatch_rsp {
uint32_t id; // Batch id
uint32_t status; // HTP_STATUS_...
// struct htp_op_req ops[]; -- dspqueue buf 0
uint32_t n_bufs; // Number of buffers
uint32_t n_tensors; // Number of tensors
uint32_t n_ops; // Number of op profile descriptors
uint32_t pad; // unused
// struct htp_prof_desc profs[]; -- dspqueue buf 0
};
#endif /* HTP_OPS_H */

View file

@ -6,13 +6,17 @@
#include "AEEStdDef.idl"
#include "remote.idl"
struct htp_iface_pmu_conf {
uint32 events[8];
};
interface htp_iface : remote_handle64 {
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx);
AEEResult stop();
AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned);
AEEResult munmap(in uint32 fd);
AEEResult enable_etm();
AEEResult disable_etm();
AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
AEEResult etm(in uint32 enable);
};
#endif /* HTP_IDL */

View file

@ -27,6 +27,7 @@
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-ops.h"
#include "htp_iface.h"
#include "worker-pool.h"
AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
@ -103,6 +104,54 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
return AEE_SUCCESS;
}
AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) {
int err = enable ? HAP_user_etm_enable() : HAP_user_etm_disable();
if (err) {
if (err == AEE_EVERSIONNOTSUPPORT) {
FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
} else {
FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err);
}
}
return err;
}
AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
}
if (mode == HTP_PROF_PMU) {
const uint32_t* events = pmu_conf->events;
// Pack 4 event IDs (low 8 bits) into each 32-bit config register
uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
evtcfg |= ((events[i + 0] & 0xFF) << (i * 8));
evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8));
}
// For events >255 pack high 2 bits of all 8 event IDs into cfg register
// 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
cfg |= (((events[i] >> 8) & 3) << (i * 2));
}
FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);
// Configure PMU registers
qurt_pmu_set(QURT_PMUCFG, cfg);
qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);
qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
qurt_pmu_enable(1);
}
ctx->profiler = mode;
return AEE_SUCCESS;
}
AEEResult htp_iface_close(remote_handle64 handle) {
struct htp_context * ctx = (struct htp_context *) handle;
@ -129,35 +178,19 @@ AEEResult htp_iface_close(remote_handle64 handle) {
}
}
if (ctx->profiler) {
qurt_pmu_enable(1);
}
if (ctx->etm) {
HAP_user_etm_disable();
}
free(ctx);
return AEE_SUCCESS;
}
AEEResult htp_iface_enable_etm(remote_handle64 handle) {
int err = HAP_user_etm_enable();
if (err) {
if (err == AEE_EVERSIONNOTSUPPORT) {
FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
} else {
FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
}
}
return err;
}
// Turn off user-mode ETM tracing; logs a descriptive error when the call fails.
AEEResult htp_iface_disable_etm(remote_handle64 handle) {
    const int err = HAP_user_etm_disable();
    if (err == 0) {
        return err;
    }
    if (err == AEE_EVERSIONNOTSUPPORT) {
        FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
    } else {
        FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
    }
    return err;
}
AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) {
AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32 pinned) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
@ -204,7 +237,7 @@ AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t
return AEE_ENOMEMORY;
}
AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
@ -434,19 +467,39 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
// Per-op profiling sample, filled in by profile_start()/profile_stop():
// each field holds a raw counter snapshot at start, and is converted to a
// start->stop delta at stop.
struct profile_data {
    uint64_t usecs;   // raw qtimer count at start; elapsed microseconds at stop
    uint64_t cycles;  // core cycle counter at start; cycle delta at stop
    uint64_t pkts;    // executed-packet counter at start; packet delta at stop
    uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];  // PMU counter snapshot/deltas (PMU mode only)
};
static inline void profile_start(struct profile_data * d) {
d->usecs = HAP_perf_get_qtimer_count();
d->cycles = hex_get_cycles();
d->pkts = hex_get_pktcnt();
// Snapshot the profiling counters for the selected profiler mode.
// HTP_PROF_PMU additionally captures the PMU counters and then falls
// through to the basic timestamp/cycle snapshot; any other mode is a no-op.
static inline void profile_start(uint32_t mode, struct profile_data * d) {
    switch (mode) {
        case HTP_PROF_PMU:
            hex_get_pmu(d->pmu_counters);  // raw PMU counter snapshot
            // fallthrough
        case HTP_PROF_BASIC:
            d->usecs = HAP_perf_get_qtimer_count();  // raw count; converted to usecs at stop
            d->cycles = hex_get_cycles();
            break;
        default:
            break;
    }
}
static inline void profile_stop(struct profile_data * d) {
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
d->cycles = hex_get_cycles() - d->cycles;
d->pkts = hex_get_pktcnt() - d->pkts;
// Convert the snapshots taken by profile_start() into start->stop deltas.
// Must be called with the same mode that was passed to profile_start().
static inline void profile_stop(uint32_t mode, struct profile_data * d) {
    uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
    switch (mode) {
        case HTP_PROF_PMU:
            hex_get_pmu(pmu_counters);
            // replace the start snapshots with per-counter deltas
            for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
                d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i];
            }
            // fallthrough
        case HTP_PROF_BASIC:
            d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
            d->cycles = hex_get_cycles() - d->cycles;
            break;
        default:
            break;
    }
}
static int execute_op(struct htp_ops_context * octx) {
@ -726,30 +779,33 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
continue;
}
// Reset poll count for valid requests
poll_count = DSPQUEUE_POLL_COUNT;
const uint32_t n_bufs = req.n_bufs;
const uint32_t n_tens = req.n_tensors;
const uint32_t n_ops = req.n_ops;
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
if (dbuf.size < b_size + t_size + o_size) {
if (dbuf.size < b_size + t_size + o_size + p_size) {
FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
break;
}
// Reset poll count for valid requests
poll_count = DSPQUEUE_POLL_COUNT;
uint8_t * m_ptr = dbuf.ptr;
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr;
FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u",
FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
// Setup descriptor pointers
uint8_t * m_ptr = dbuf.ptr;
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; m_ptr += o_size;
struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr;
prep_op_bufs(ctx, bufs, n_bufs);
prep_tensors(ctx, bufs, tens, n_tens);
@ -760,22 +816,34 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
for (uint32_t i=0; i < n_ops; i++) {
struct profile_data prof;
profile_start(&prof);
profile_start(ctx->profiler, &prof);
proc_op_req(octx, tens, i, &ops[i]);
profile_stop(&prof);
ops[i].prof_usecs = prof.usecs;
ops[i].prof_cycles = prof.cycles;
ops[i].prof_pkts = prof.pkts;
profile_stop(ctx->profiler, &prof);
if (ctx->profiler) {
pds[i].opcode = ops[i].opcode;
pds[i].usecs = prof.usecs;
pds[i].cycles = prof.cycles;
for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
pds[i].pmu[j] = prof.pmu_counters[j];
}
}
}
// dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
struct htp_opbatch_rsp rsp;
rsp.status = HTP_STATUS_OK; // FIXME
rsp.id = req.id;
rsp.status = HTP_STATUS_OK;
rsp.n_bufs = n_bufs;
rsp.n_tensors = n_tens;
rsp.n_ops = n_ops;
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
if (err != 0) {
FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);

View file

@ -3017,6 +3017,10 @@ int op_matmul(struct htp_ops_context * octx) {
const int act_stride = (int)(src1->nb[1] / sizeof(float));
const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16));
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
return HTP_STATUS_OK;
}
if (src0->type == HTP_TYPE_F16) {
if (is_batched) {
hmx_matmul_w16a32_batched_params_t batch_params = {

View file

@ -23,10 +23,10 @@ verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

View file

@ -28,10 +28,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

View file

@ -28,10 +28,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

View file

@ -37,10 +37,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

View file

@ -25,10 +25,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

View file

@ -0,0 +1,188 @@
#!/usr/bin/env python3
import sys
import os
import re
import argparse
import statistics
import logging
from collections import defaultdict
# Mapping of cli-friendly names to (internal_data_key, Display Header, numeric_sort_key)
# - internal_data_key: key into each per-group stats dict (display string)
# - Display Header:    column title printed in the report table
# - numeric_sort_key:  key used for sorting; "_sort_*" entries hold numeric values
COL_MAP = {
    "op": ("op", "Op", "op"),
    "dims": ("dims", "Dims", "dims"),
    "dtypes": ("dtypes", "DTypes", "dtypes"),
    "count": ("count", "Count", "_sort_count"),
    "max-usec": ("max_usec", "Max usec", "_sort_max_usec"),
    "avg-usec": ("avg_usec", "Avg usec", "_sort_avg_usec"),
    "max-cycles": ("max_cycles", "Max Cycles", "_sort_max_cycles"),
    "avg-cycles": ("avg_cycles", "Avg Cycles", "_sort_avg_cycles"),
    "max-pmu": ("max_pmu", "Max PMU", "_sort_max_pmu"),
    "avg-pmu": ("avg_pmu", "Avg PMU", "_sort_avg_pmu"),
}
# Matches one "profile-op" line emitted by the HTP profiler, e.g.
#   profile-op MUL_MAT: ... : 128x64 -> 128 : f16 x f32 : ... usec 123 cycles 456 pmu [1, 2, ...]
# The trailing "pmu [...]" group is optional (only present in PMU mode).
op_pattern = re.compile(
    r"profile-op\s+(?P<op_name>[A-Z_0-9]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
)

logger = logging.getLogger("ggml-hexagon-profile")


def parse_log(file_path, pmu_index=None):
    """Parse profiler records from a log file ("-" reads from stdin).

    Returns a list of dicts with keys: name, dims, types, usec, cycles,
    pmu_val. pmu_val is the PMU counter at pmu_index, or None when the line
    has no pmu data, pmu_index is None, or the index/values are invalid.
    Exits with status 1 when file_path does not exist.
    """
    try:
        if file_path != "-":
            f = open(file_path, 'r', encoding='utf-8', errors='ignore')
        else:
            f = os.fdopen(0, 'r', encoding='utf-8', errors='ignore')
    except FileNotFoundError:
        logger.error(f"file '{file_path}' not found.")
        sys.exit(1)

    all_ops = []
    # "with" guarantees the handle is closed even if parsing raises
    # (the original closed it only on the success path).
    with f:
        for line in f:
            match = op_pattern.search(line)
            if not match:
                continue
            pmu_raw = match.group('pmu')
            pmu_val = None
            if pmu_raw and pmu_index is not None:
                try:
                    pmu_list = [int(x.strip()) for x in pmu_raw.split(',')]
                    if len(pmu_list) > pmu_index:
                        pmu_val = pmu_list[pmu_index]
                except (ValueError, IndexError):
                    pmu_val = None
            all_ops.append({
                'name': match.group('op_name'),
                'dims': match.group('dims').strip(),
                'types': match.group('types').strip(),
                'usec': int(match.group('usec')),
                'cycles': int(match.group('cycles')),
                'pmu_val': pmu_val
            })
    return all_ops
def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None):
    """Group parsed ops by (name, dims, dtypes) and log a markdown-style table.

    ops:             list of dicts as returned by parse_log()
    top_n:           keep only the first N groups after sorting
    width_overrides: {column_name: width}; a width of 0 hides the column
    sort_col:        key into COL_MAP selecting the sort column
    pmu_name:        when set, the PMU columns are shown and "PMU" in their
                     headers is replaced with this name
    Output goes through logger.info(); nothing is returned.
    """
    if not ops:
        logger.info("No valid records found.")
        return

    # Bucket records that share op name, dimensions and dtypes
    grouped = defaultdict(list)
    for op in ops:
        key = (op['name'], op['dims'], op['types'])
        grouped[key].append(op)

    # One stats row per group: display strings plus numeric twins for sorting
    group_stats = []
    for (name, dims, types), group_ops in grouped.items():
        usecs = [o['usec'] for o in group_ops]
        cycles = [o['cycles'] for o in group_ops]
        # Only records that actually carried a pmu value contribute here
        pmu_vals = [o['pmu_val'] for o in group_ops if o['pmu_val'] is not None]
        group_stats.append({
            'op': name,
            'dims': dims,
            'dtypes': types,
            'count': str(len(group_ops)),
            'max_usec': str(max(usecs)),
            'avg_usec': f"{statistics.mean(usecs):.2f}",
            'max_cycles': str(max(cycles)),
            'avg_cycles': f"{statistics.mean(cycles):.2f}",
            'max_pmu': str(max(pmu_vals)) if pmu_vals else "0",
            'avg_pmu': f"{statistics.mean(pmu_vals):.2f}" if pmu_vals else "0.00",
            # Numeric values for accurate sorting
            '_sort_count': len(group_ops),
            '_sort_max_usec': max(usecs),
            '_sort_avg_usec': statistics.mean(usecs),
            '_sort_max_cycles': max(cycles),
            '_sort_avg_cycles': statistics.mean(cycles),
            '_sort_max_pmu': max(pmu_vals) if pmu_vals else 0,
            '_sort_avg_pmu': statistics.mean(pmu_vals) if pmu_vals else 0
        })

    # Sorting logic
    actual_sort_key = COL_MAP[sort_col][2]
    # We sort numeric fields descending, strings (op/dims) ascending
    is_numeric = actual_sort_key.startswith("_") or actual_sort_key == "count"
    sorted_groups = sorted(group_stats, key=lambda x: x[actual_sort_key], reverse=is_numeric)[:top_n]

    # Define initial column order
    active_cols = ["op", "dims", "dtypes"]
    if pmu_name:
        active_cols += ["max-pmu", "avg-pmu"]
    active_cols += ["max-usec", "avg-usec", "max-cycles", "avg-cycles", "count"]

    # Resolve headers and widths; natural width fits the widest cell/header,
    # overrides may shrink/grow a column or hide it entirely (width 0)
    final_headers, final_keys, final_widths = [], [], []
    for col_name in active_cols:
        data_key, header_text, _ = COL_MAP[col_name]
        if "pmu" in col_name and pmu_name:
            header_text = header_text.replace("PMU", pmu_name)
        natural_width = max([len(row[data_key]) for row in sorted_groups] + [len(header_text)])
        target_width = width_overrides.get(col_name, natural_width)
        if target_width == 0:
            continue
        final_headers.append(header_text)
        final_keys.append(data_key)
        final_widths.append(target_width)

    # Print Report
    logger.info(f"\n# Profile Report (Top {top_n} Ops sorted by {sort_col})\n")
    header_line = "| " + " | ".join(f"{h:<{final_widths[i]}}" for i, h in enumerate(final_headers)) + " |"
    sep_line = "| " + " | ".join("-" * final_widths[i] for i in range(len(final_headers))) + " |"
    logger.info(header_line)
    logger.info(sep_line)
    for group in sorted_groups:
        row_vals = []
        for i, key in enumerate(final_keys):
            val = group[key]
            # Truncate cells wider than the column, marking them with "..."
            if len(val) > final_widths[i]:
                val = val[:final_widths[i] - 3] + "..."
            row_vals.append(f"{val:<{final_widths[i]}}")
        logger.info("| " + " | ".join(row_vals) + " |")
def _collect_width_overrides(width_args):
    # Turn "name:width" strings into a {name: width} dict; later entries win.
    overrides = {}
    for w in width_args or []:
        try:
            name, val = w.split(':')
            overrides[name.lower()] = int(val)
        except ValueError:
            logger.warning(f"Invalid width format '{w}'")
    return overrides


def main():
    """CLI entry point: parse arguments, read the log, print the report."""
    parser = argparse.ArgumentParser(description="Post-process Op profile info.")
    parser.add_argument("logfile")
    parser.add_argument("-n", "--top", type=int, default=100)
    parser.add_argument("--sort", type=str, default="max-usec", choices=list(COL_MAP.keys()))
    parser.add_argument("--pmu-index", type=int)
    parser.add_argument("--pmu-name", type=str)
    parser.add_argument("--width", action='append', default=['dims:40'], help="Override column width, e.g. --width dims:50")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format='%(message)s')

    # Sort validation: can't sort by PMU if index isn't provided
    if "pmu" in args.sort and args.pmu_index is None:
        logger.error(f"Cannot sort by '{args.sort}' without --pmu-index.")
        sys.exit(1)

    overrides = _collect_width_overrides(args.width)

    # Label the PMU columns with the user-supplied name, or "#<index>"
    final_pmu_name = None
    if args.pmu_index is not None:
        final_pmu_name = args.pmu_name or f"#{args.pmu_index}"

    ops = parse_log(args.logfile, pmu_index=args.pmu_index)
    generate_report(ops, args.top, overrides, args.sort, pmu_name=final_pmu_name)


if __name__ == "__main__":
    main()

View file

@ -21,11 +21,11 @@ if ($null -ne $env:V) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {

View file

@ -25,11 +25,11 @@ if ($null -ne $env:SCHED) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {

View file

@ -25,11 +25,11 @@ if ($null -ne $env:SCHED) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {

View file

@ -34,11 +34,11 @@ if ($null -ne $env:SCHED) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {

View file

@ -31,11 +31,11 @@ if ($null -ne $env:SCHED) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {