From 125a0d1e21107009a7222acdaa98541db38177d6 Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 12:51:19 -0500
Subject: [PATCH 01/14] mpi: tag GPU-aware MPI calls with leading '*' in
 profile

Introspect L0/CUDA/HIP pointers at runtime (via dlopen, no build deps)
for every buffer-bearing MPI call and emit a buffer_info tracepoint.
The interval consumer prefixes the call name with '*' when any buffer
resides in device or shared/managed memory.
---
 backends/mpi/Makefile.am                   |   3 +-
 backends/mpi/btx_mpiinterval_callbacks.cpp |  49 ++-
 backends/mpi/gen_mpi.rb                    |   6 +
 backends/mpi/mpi.h.include                 |   3 +-
 backends/mpi/mpi_events.yaml               |  23 ++
 backends/mpi/mpi_model.rb                  |  39 +++
 backends/mpi/tracer_mpi.sh.in              |   2 +
 backends/mpi/tracer_mpi_helpers.include.c  | 384 +++++++++++++++++++++
 8 files changed, 505 insertions(+), 4 deletions(-)
diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am
index f3d0ea28c..13400366e 100644
--- a/backends/mpi/Makefile.am
+++ b/backends/mpi/Makefile.am
@@ -62,7 +62,8 @@ MPI_PROBES_INCL = $(MPI_PROBES:=.h)
 MPI_PROBES_SRC = $(MPI_PROBES:=.c)
 
 MPI_STATIC_PROBES = \
-	mpi_type
+	mpi_type \
+	mpi_properties
 
 MPI_STATIC_PROBES_TP = $(MPI_STATIC_PROBES:=.tp)
 MPI_STATIC_PROBES_INCL = $(MPI_STATIC_PROBES:=.h)
diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp
index 65f1cedd2..1e29ed0ef 100644
--- a/backends/mpi/btx_mpiinterval_callbacks.cpp
+++ b/backends/mpi/btx_mpiinterval_callbacks.cpp
@@ -5,9 +5,22 @@
 #include <sstream>
 #include <string>
 #include <tuple>
+#include <unordered_set>
+
+// Must match the enum in backends/mpi/tracer_mpi_helpers.include.c.
+enum {
+  THAPI_MPI_PTR_UNKNOWN = 0,
+  THAPI_MPI_PTR_HOST    = 1,
+  THAPI_MPI_PTR_DEVICE  = 2,
+  THAPI_MPI_PTR_SHARED  = 3
+};
 
 struct data_s {
   EntryState entry_state;
+  // Set of {hostname, vpid, vtid} for which the in-flight MPI call has been
+  // observed to carry at least one device/shared GPU buffer.  Populated by
+  // buffer_info_callback, consumed (and cleared) by send_host_message.
+  std::unordered_set<hpt_t> gpu_aware_calls;
 };
 
 typedef struct data_s data_t;
@@ -21,9 +34,16 @@ static void send_host_message(void *btx_handle,
                               uint64_t vtid,
                               bool err) {
 
+  auto *data = static_cast<data_t *>(usr_data);
+  const hpt_t hpt{hostname, vpid, vtid};
+
   std::string event_class_name_striped = strip_event_class_name_exit(event_class_name);
-  const int64_t entry_ts =
-      static_cast<data_t *>(usr_data)->entry_state.get_ts({hostname, vpid, vtid});
+  // GPU-aware MPI calls are prefixed with '*' so they stand out in the
+  // profile.  The flag is set by buffer_info_callback during this call.
+  if (data->gpu_aware_calls.erase(hpt) > 0)
+    event_class_name_striped.insert(0, 1, '*');
+
+  const int64_t entry_ts = data->entry_state.get_ts(hpt);
 
   btx_push_message_lttng_host(btx_handle, hostname, vpid, vtid, entry_ts, BACKEND_MPI,
                               event_class_name_striped.c_str(), (ts - entry_ts), err);
@@ -149,6 +169,29 @@ static void type_property_callback(void *btx_handle,
   mpi_datatype_info[(uint64_t)datatype] = {std::to_string((uint64_t)datatype), size};
 }
 
+// Emitted from the prologue of every buffer-bearing MPI call, once per
+// buffer argument.  We only care whether at least one buffer is on the GPU
+// (device or shared/managed); that's enough to tag the call as GPU-aware.
+static void buffer_info_callback(void *btx_handle,
+                                 void *usr_data,
+                                 int64_t ts,
+                                 const char *hostname,
+                                 int64_t vpid,
+                                 uint64_t vtid,
+                                 uintptr_t ptr,
+                                 int role,
+                                 int kind,
+                                 int backend,
+                                 uintptr_t base,
+                                 uint64_t size,
+                                 uint32_t device_ordinal) {
+
+  if (kind == THAPI_MPI_PTR_DEVICE || kind == THAPI_MPI_PTR_SHARED) {
+    auto *data = static_cast<data_t *>(usr_data);
+    data->gpu_aware_calls.insert({hostname, vpid, vtid});
+  }
+}
+
 static void traffic_MPI_Count_entry_callback(void *btx_handle,
                                              void *usr_data,
                                              int64_t ts,
@@ -209,4 +252,6 @@ void btx_register_usr_callbacks(void *btx_handle) {
   btx_register_callbacks_traffic_int_entry(btx_handle, &traffic_int_entry_callback);
 
   btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback);
+
+  btx_register_callbacks_lttng_ust_mpi_properties_buffer_info(btx_handle, &buffer_info_callback);
 }
diff --git a/backends/mpi/gen_mpi.rb b/backends/mpi/gen_mpi.rb
index bd939a9bb..3c0e349cd 100644
--- a/backends/mpi/gen_mpi.rb
+++ b/backends/mpi/gen_mpi.rb
@@ -19,6 +19,10 @@ def common_block(c, provider)
   tracepoint_params = c.tracepoint_parameters.reject { |p| p.after? }.collect(&:name)
   puts "  tracepoint(#{provider}, #{c.name}_#{START}, #{(tp_params + tracepoint_params).join(', ')});"
 
+  c.prologues.each do |p|
+    puts p
+  end
+
   params = c.parameters.collect(&:name)
   if c.has_return_type?
     puts "  #{c.type} _retval;"
@@ -75,10 +79,12 @@ def define_and_find_mpi_symbols
 
 puts <<~EOF
   #include <stdint.h>
+  #include <string.h>
   #define MPICH_FORTRAN_SYMBOLS_NONABI
   #include <mpi.h>
   #include "mpi_tracepoints.h"
   #include "mpi_type.h"
+  #include "mpi_properties.h"
   #include <dlfcn.h>
   #include <pthread.h>
 EOF
diff --git a/backends/mpi/mpi.h.include b/backends/mpi/mpi.h.include
index aed167967..4fe43271b 100644
--- a/backends/mpi/mpi.h.include
+++ b/backends/mpi/mpi.h.include
@@ -1,2 +1,3 @@
 #include <mpi.h>
-
+#include <stdint.h>
+#include <stddef.h>
diff --git a/backends/mpi/mpi_events.yaml b/backends/mpi/mpi_events.yaml
index 48c6dcc96..8a8060873 100644
--- a/backends/mpi/mpi_events.yaml
+++ b/backends/mpi/mpi_events.yaml
@@ -9,3 +9,26 @@ lttng_ust_mpi_type:
     - [ ctf_integer_hex, uintptr_t, datatype, "(uintptr_t)datatype" ]
     - [ ctf_integer, int, size, size ]
 
+lttng_ust_mpi_properties:
+  events:
+  # Emitted once per buffer argument of a buffer-bearing MPI call.
+  # `kind`    : 0 unknown, 1 host, 2 device, 3 shared/managed
+  # `backend` : 0 none,    1 level-zero, 2 cuda, 3 hip
+  # `role`    : 0 input (send-like), 1 output (recv-like)
+  - name: buffer_info
+    args:
+    - [ const void *, ptr ]
+    - [ int, role ]
+    - [ int, kind ]
+    - [ int, backend ]
+    - [ uintptr_t, base ]
+    - [ uint64_t, size ]
+    - [ uint32_t, device_ordinal ]
+    fields:
+    - [ ctf_integer_hex, uintptr_t, ptr, "(uintptr_t)ptr" ]
+    - [ ctf_integer, int, role, role ]
+    - [ ctf_integer, int, kind, kind ]
+    - [ ctf_integer, int, backend, backend ]
+    - [ ctf_integer_hex, uintptr_t, base, base ]
+    - [ ctf_integer, uint64_t, size, size ]
+    - [ ctf_integer, uint32_t, device_ordinal, device_ordinal ]
diff --git a/backends/mpi/mpi_model.rb b/backends/mpi/mpi_model.rb
index c8f3cc56c..6084c4084 100644
--- a/backends/mpi/mpi_model.rb
+++ b/backends/mpi/mpi_model.rb
@@ -89,3 +89,42 @@ def underscore(camel_cased_word)
   if (tracepoint_enabled(lttng_ust_mpi_type, property))#{' '}
     tracepoint(lttng_ust_mpi_type, property, *datatype, size);
 EOF
+
+# ---------------------------------------------------------------------------
+# GPU-aware MPI introspection
+#
+# For every MPI call that carries a user payload buffer, register a prologue
+# that emits an `lttng_ust_mpi_properties:buffer_info` event per buffer.  The
+# event records whether the pointer is host / device / shared memory and which
+# GPU runtime (Level Zero, CUDA, HIP) owns it.  Consumers (extract/,
+# btx_mpiinterval_callbacks.cpp) use that to tag each MPI call as "GPU-aware"
+# or "CPU-only".
+#
+# The set of buffer parameters is derived from mpi_meta_parameters.yaml so
+# new MPI APIs picked up by codegen are classified automatically: any
+# InScalar/OutScalar parameter whose name matches one of the canonical MPI
+# payload-buffer names below is treated as a user buffer.
+# ---------------------------------------------------------------------------
+MPI_PAYLOAD_BUFFER_NAMES = %w[
+  buf buffer sendbuf recvbuf inbuf outbuf inoutbuf
+  origin_addr result_addr compare_addr
+].freeze
+
+$mpi_meta_parameters.fetch('meta_parameters', []).each do |func, list|
+  buffers = list.select do |type, name|
+    %w[InScalar OutScalar].include?(type) &&
+      MPI_PAYLOAD_BUFFER_NAMES.include?(name)
+  end
+  next if buffers.empty?
+
+  body = buffers.map do |type, name|
+    role = type == 'InScalar' ? 'THAPI_MPI_ROLE_IN' : 'THAPI_MPI_ROLE_OUT'
+    "  _dump_buffer_info((const void *)#{name}, #{role});"
+  end.join("\n")
+
+  register_prologue func, <<~EOF
+    if (tracepoint_enabled(lttng_ust_mpi_properties, buffer_info)) {
+    #{body}
+    }
+  EOF
+end
diff --git a/backends/mpi/tracer_mpi.sh.in b/backends/mpi/tracer_mpi.sh.in
index 55343f6ca..e841ad94b 100644
--- a/backends/mpi/tracer_mpi.sh.in
+++ b/backends/mpi/tracer_mpi.sh.in
@@ -81,6 +81,8 @@ fi
 lttng enable-channel --userspace --blocking-timeout=inf blocking-channel
 lttng add-context --userspace --channel=blocking-channel -t vpid -t vtid
 lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi:*
+lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi_type:*
+lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi_properties:*
 if [ -z "$LTTNG_UST_MPI_LIBMPI" ]; then
   LTTNG_UST_MPI_LIBMPI=$(whichlib64_head libmpi.so)
   if [ -n "$LTTNG_UST_MPI_LIBMPI" ]; then
diff --git a/backends/mpi/tracer_mpi_helpers.include.c b/backends/mpi/tracer_mpi_helpers.include.c
index a432d2183..874bd7c22 100644
--- a/backends/mpi/tracer_mpi_helpers.include.c
+++ b/backends/mpi/tracer_mpi_helpers.include.c
@@ -2,6 +2,389 @@ static pthread_once_t _init = PTHREAD_ONCE_INIT;
 static __thread volatile int in_init = 0;
 static volatile unsigned int _initialized = 0;
 
+/* ---------------------------------------------------------------------------
+ * GPU pointer introspection
+ *
+ * Goal: when an MPI call carries a buffer pointer, decide at runtime whether
+ * the pointer refers to host memory or to a GPU allocation, so that downstream
+ * tools can tag the call as "GPU-aware MPI" vs "CPU-only MPI".
+ *
+ * Design constraints (matching the existing MPI backend):
+ *   - No build-time dependency on Level Zero / CUDA / HIP.  Each loader is
+ *     dlopen'd lazily, and only the symbols we need are looked up via dlsym
+ *     using locally-declared function-pointer signatures over opaque types.
+ *   - All overhead is gated by tracepoint_enabled() and a small per-thread
+ *     cache keyed on the allocation base address, so a buffer reused across
+ *     iterations is classified only once.
+ * ------------------------------------------------------------------------- */
+
+/* Values reported in the buffer_info tracepoint.  Keep in sync with consumers
+ * (extract/, btx_mpiinterval_callbacks.cpp). */
+enum thapi_mpi_ptr_kind {
+  THAPI_MPI_PTR_UNKNOWN = 0,
+  THAPI_MPI_PTR_HOST    = 1,
+  THAPI_MPI_PTR_DEVICE  = 2,
+  THAPI_MPI_PTR_SHARED  = 3
+};
+
+enum thapi_mpi_ptr_backend {
+  THAPI_MPI_BACKEND_NONE = 0,
+  THAPI_MPI_BACKEND_L0   = 1,
+  THAPI_MPI_BACKEND_CUDA = 2,
+  THAPI_MPI_BACKEND_HIP  = 3
+};
+
+enum thapi_mpi_buffer_role {
+  THAPI_MPI_ROLE_IN  = 0,
+  THAPI_MPI_ROLE_OUT = 1
+};
+
+/* --- Level Zero ----------------------------------------------------------- */
+/* Opaque handle types and the minimal enums/structs needed to call
+ * zeMemGetAllocProperties / zeMemGetAddressRange without including ze_api.h. */
+typedef int                        _thapi_ze_result_t;     /* ze_result_t */
+typedef struct _thapi_ze_driver   *_thapi_ze_driver_handle_t;
+typedef struct _thapi_ze_context  *_thapi_ze_context_handle_t;
+typedef struct _thapi_ze_device   *_thapi_ze_device_handle_t;
+
+#define _THAPI_ZE_RESULT_SUCCESS                          0
+#define _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES 0x000c
+#define _THAPI_ZE_INIT_FLAG_GPU_ONLY                   0x01
+
+typedef enum {
+  _THAPI_ZE_MEMORY_TYPE_UNKNOWN = 0,
+  _THAPI_ZE_MEMORY_TYPE_HOST    = 1,
+  _THAPI_ZE_MEMORY_TYPE_DEVICE  = 2,
+  _THAPI_ZE_MEMORY_TYPE_SHARED  = 3
+} _thapi_ze_memory_type_t;
+
+typedef struct {
+  uint32_t                stype;
+  const void             *pNext;
+  _thapi_ze_memory_type_t type;
+  uint64_t                id;
+  uint64_t                pageSize;
+} _thapi_ze_memory_allocation_properties_t;
+
+typedef _thapi_ze_result_t (*_thapi_ze_init_fn_t)(uint32_t flags);
+typedef _thapi_ze_result_t (*_thapi_ze_driver_get_fn_t)(uint32_t *pCount,
+                                                       _thapi_ze_driver_handle_t *phDrivers);
+typedef _thapi_ze_result_t (*_thapi_ze_context_create_fn_t)(_thapi_ze_driver_handle_t hDriver,
+                                                            const void *desc,
+                                                            _thapi_ze_context_handle_t *phContext);
+typedef _thapi_ze_result_t (*_thapi_ze_mem_get_alloc_properties_fn_t)(
+    _thapi_ze_context_handle_t hContext, const void *ptr,
+    _thapi_ze_memory_allocation_properties_t *pProps,
+    _thapi_ze_device_handle_t *phDevice);
+typedef _thapi_ze_result_t (*_thapi_ze_mem_get_address_range_fn_t)(
+    _thapi_ze_context_handle_t hContext, const void *ptr,
+    void **pBase, size_t *pSize);
+
+static _thapi_ze_mem_get_alloc_properties_fn_t _ze_mem_get_alloc_properties = NULL;
+static _thapi_ze_mem_get_address_range_fn_t    _ze_mem_get_address_range    = NULL;
+static _thapi_ze_context_handle_t              _ze_introspection_ctx        = NULL;
+
+/* --- CUDA ----------------------------------------------------------------- */
+/* Driver API (libcuda.so) is preferred: it's the same on every CUDA install
+ * and doesn't drag in libcudart. */
+typedef int          _thapi_cu_result_t;          /* CUresult */
+typedef unsigned int _thapi_cu_attr_t;            /* CUpointer_attribute */
+typedef unsigned long long _thapi_cu_deviceptr_t; /* CUdeviceptr */
+
+#define _THAPI_CU_SUCCESS                        0
+#define _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE  2
+#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START  7
+#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE   8
+#define _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL 9
+#define _THAPI_CU_MEMORYTYPE_HOST                1
+#define _THAPI_CU_MEMORYTYPE_DEVICE              2
+#define _THAPI_CU_MEMORYTYPE_ARRAY               3
+#define _THAPI_CU_MEMORYTYPE_UNIFIED             4
+
+typedef _thapi_cu_result_t (*_thapi_cu_init_fn_t)(unsigned int flags);
+typedef _thapi_cu_result_t (*_thapi_cu_pointer_get_attribute_fn_t)(void *data,
+                                                                   _thapi_cu_attr_t attr,
+                                                                   _thapi_cu_deviceptr_t ptr);
+
+static _thapi_cu_pointer_get_attribute_fn_t _cu_pointer_get_attribute = NULL;
+
+/* --- HIP ------------------------------------------------------------------ */
+typedef int _thapi_hip_error_t; /* hipError_t */
+
+#define _THAPI_HIP_SUCCESS               0
+#define _THAPI_HIP_MEMORY_TYPE_HOST      0
+#define _THAPI_HIP_MEMORY_TYPE_DEVICE    1
+#define _THAPI_HIP_MEMORY_TYPE_ARRAY     2
+#define _THAPI_HIP_MEMORY_TYPE_UNIFIED   3
+#define _THAPI_HIP_MEMORY_TYPE_MANAGED   4
+
+typedef struct {
+  unsigned int  type;
+  int           device;
+  void         *devicePointer;
+  void         *hostPointer;
+  int           isManaged;
+  unsigned int  allocationFlags;
+} _thapi_hip_pointer_attribute_t;
+
+typedef _thapi_hip_error_t (*_thapi_hip_pointer_get_attributes_fn_t)(
+    _thapi_hip_pointer_attribute_t *attr, const void *ptr);
+typedef _thapi_hip_error_t (*_thapi_hip_mem_get_address_range_fn_t)(
+    void **pbase, size_t *psize, void *dptr);
+
+static _thapi_hip_pointer_get_attributes_fn_t _hip_pointer_get_attributes = NULL;
+static _thapi_hip_mem_get_address_range_fn_t  _hip_mem_get_address_range  = NULL;
+
+/* --- Per-thread address-range cache --------------------------------------- */
+/* Tiny LRU keyed on allocation base address.  Buffers reused across MPI
+ * iterations (the common case in HPC) hit the cache and skip the runtime
+ * query entirely.  Sized small enough to live cheaply per-thread. */
+#define _THAPI_MPI_CACHE_SLOTS 16
+
+struct _thapi_mpi_cache_entry {
+  uintptr_t base;
+  size_t    size;
+  uint8_t   kind;
+  uint8_t   backend;
+  uint32_t  device_ordinal;
+  uint8_t   valid;
+};
+
+static __thread struct _thapi_mpi_cache_entry
+    _thapi_mpi_cache[_THAPI_MPI_CACHE_SLOTS];
+static __thread unsigned int _thapi_mpi_cache_next = 0;
+
+static inline int _thapi_mpi_cache_lookup(const void *ptr,
+                                          struct _thapi_mpi_cache_entry *out) {
+  uintptr_t p = (uintptr_t)ptr;
+  for (unsigned int i = 0; i < _THAPI_MPI_CACHE_SLOTS; i++) {
+    struct _thapi_mpi_cache_entry *e = &_thapi_mpi_cache[i];
+    if (e->valid && p >= e->base && p < e->base + e->size) {
+      *out = *e;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+static inline void _thapi_mpi_cache_insert(const struct _thapi_mpi_cache_entry *e) {
+  _thapi_mpi_cache[_thapi_mpi_cache_next] = *e;
+  _thapi_mpi_cache_next =
+      (_thapi_mpi_cache_next + 1) % _THAPI_MPI_CACHE_SLOTS;
+}
+
+/* --- Runtime probing ------------------------------------------------------ */
+static int _try_classify_ze(const void *ptr,
+                            struct _thapi_mpi_cache_entry *out) {
+  if (!_ze_mem_get_alloc_properties || !_ze_introspection_ctx)
+    return 0;
+
+  _thapi_ze_memory_allocation_properties_t props;
+  props.stype = _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES;
+  props.pNext = NULL;
+  props.type  = _THAPI_ZE_MEMORY_TYPE_UNKNOWN;
+  _thapi_ze_device_handle_t hDev = NULL;
+
+  if (_ze_mem_get_alloc_properties(_ze_introspection_ctx, ptr, &props, &hDev)
+      != _THAPI_ZE_RESULT_SUCCESS)
+    return 0;
+  if (props.type == _THAPI_ZE_MEMORY_TYPE_UNKNOWN)
+    return 0;
+
+  void  *base = NULL;
+  size_t size = 0;
+  if (_ze_mem_get_address_range &&
+      _ze_mem_get_address_range(_ze_introspection_ctx, ptr, &base, &size)
+          != _THAPI_ZE_RESULT_SUCCESS) {
+    base = (void *)ptr;
+    size = 1;
+  }
+
+  out->base = (uintptr_t)base;
+  out->size = size ? size : 1;
+  out->backend = THAPI_MPI_BACKEND_L0;
+  out->device_ordinal = 0; /* L0 device ordinal lookup would need extra calls */
+  switch (props.type) {
+    case _THAPI_ZE_MEMORY_TYPE_HOST:   out->kind = THAPI_MPI_PTR_HOST;   break;
+    case _THAPI_ZE_MEMORY_TYPE_DEVICE: out->kind = THAPI_MPI_PTR_DEVICE; break;
+    case _THAPI_ZE_MEMORY_TYPE_SHARED: out->kind = THAPI_MPI_PTR_SHARED; break;
+    default:                           out->kind = THAPI_MPI_PTR_UNKNOWN; break;
+  }
+  out->valid = 1;
+  return 1;
+}
+
+static int _try_classify_cuda(const void *ptr,
+                              struct _thapi_mpi_cache_entry *out) {
+  if (!_cu_pointer_get_attribute)
+    return 0;
+
+  unsigned int           mem_type = 0;
+  _thapi_cu_deviceptr_t  range_start = 0;
+  size_t                 range_size = 0;
+  int                    device_ordinal = 0;
+  _thapi_cu_deviceptr_t  dptr = (_thapi_cu_deviceptr_t)(uintptr_t)ptr;
+
+  if (_cu_pointer_get_attribute(&mem_type,
+                                _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dptr)
+      != _THAPI_CU_SUCCESS || mem_type == 0)
+    return 0;
+
+  (void)_cu_pointer_get_attribute(&range_start,
+                                  _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START, dptr);
+  (void)_cu_pointer_get_attribute(&range_size,
+                                  _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE, dptr);
+  (void)_cu_pointer_get_attribute(&device_ordinal,
+                                  _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dptr);
+
+  out->base = range_start ? (uintptr_t)range_start : (uintptr_t)ptr;
+  out->size = range_size ? range_size : 1;
+  out->backend = THAPI_MPI_BACKEND_CUDA;
+  out->device_ordinal = (uint32_t)device_ordinal;
+  switch (mem_type) {
+    case _THAPI_CU_MEMORYTYPE_HOST:    out->kind = THAPI_MPI_PTR_HOST;   break;
+    case _THAPI_CU_MEMORYTYPE_DEVICE:  out->kind = THAPI_MPI_PTR_DEVICE; break;
+    case _THAPI_CU_MEMORYTYPE_UNIFIED: out->kind = THAPI_MPI_PTR_SHARED; break;
+    default:                           out->kind = THAPI_MPI_PTR_UNKNOWN; break;
+  }
+  out->valid = 1;
+  return 1;
+}
+
+static int _try_classify_hip(const void *ptr,
+                             struct _thapi_mpi_cache_entry *out) {
+  if (!_hip_pointer_get_attributes)
+    return 0;
+
+  _thapi_hip_pointer_attribute_t attr;
+  memset(&attr, 0, sizeof(attr));
+  if (_hip_pointer_get_attributes(&attr, ptr) != _THAPI_HIP_SUCCESS)
+    return 0;
+
+  void  *base = (void *)ptr;
+  size_t size = 1;
+  if (_hip_mem_get_address_range)
+    (void)_hip_mem_get_address_range(&base, &size, (void *)ptr);
+
+  out->base = (uintptr_t)base;
+  out->size = size ? size : 1;
+  out->backend = THAPI_MPI_BACKEND_HIP;
+  out->device_ordinal = (uint32_t)attr.device;
+  if (attr.isManaged || attr.type == _THAPI_HIP_MEMORY_TYPE_UNIFIED ||
+      attr.type == _THAPI_HIP_MEMORY_TYPE_MANAGED)
+    out->kind = THAPI_MPI_PTR_SHARED;
+  else if (attr.type == _THAPI_HIP_MEMORY_TYPE_DEVICE)
+    out->kind = THAPI_MPI_PTR_DEVICE;
+  else if (attr.type == _THAPI_HIP_MEMORY_TYPE_HOST)
+    out->kind = THAPI_MPI_PTR_HOST;
+  else
+    out->kind = THAPI_MPI_PTR_UNKNOWN;
+  out->valid = 1;
+  return 1;
+}
+
+static inline void _dump_buffer_info(const void *ptr, int role) {
+  if (!ptr)
+    return;
+  /* MPI_IN_PLACE / MPI_BOTTOM are sentinel values, not real pointers. */
+  if (ptr == MPI_IN_PLACE || ptr == MPI_BOTTOM)
+    return;
+  if (!tracepoint_enabled(lttng_ust_mpi_properties, buffer_info))
+    return;
+
+  struct _thapi_mpi_cache_entry e;
+  if (!_thapi_mpi_cache_lookup(ptr, &e)) {
+    e.valid = 0;
+    if (!_try_classify_ze(ptr, &e) &&
+        !_try_classify_cuda(ptr, &e) &&
+        !_try_classify_hip(ptr, &e)) {
+      /* Nobody claims it: treat as host. Cache a 1-byte range so we still
+       * answer in O(1) on the next call, but don't pollute the cache with
+       * adjacent host allocations. */
+      e.base = (uintptr_t)ptr;
+      e.size = 1;
+      e.kind = THAPI_MPI_PTR_HOST;
+      e.backend = THAPI_MPI_BACKEND_NONE;
+      e.device_ordinal = 0;
+      e.valid = 1;
+    }
+    _thapi_mpi_cache_insert(&e);
+  }
+
+  do_tracepoint(lttng_ust_mpi_properties, buffer_info,
+                ptr, role, (int)e.kind, (int)e.backend,
+                e.base, (uint64_t)e.size, e.device_ordinal);
+}
+
+/* --- Loader -------------------------------------------------------------- */
+static void _load_gpu_introspection(int verbose) {
+  /* Each loader is optional: if the runtime isn't present on the system, the
+   * corresponding classifier stays NULL and is silently skipped. */
+  void *h_ze = dlopen("libze_loader.so.1", RTLD_LAZY | RTLD_LOCAL);
+  if (!h_ze)
+    h_ze = dlopen("libze_loader.so",   RTLD_LAZY | RTLD_LOCAL);
+  if (h_ze) {
+    _thapi_ze_init_fn_t           ze_init    = (_thapi_ze_init_fn_t)
+        dlsym(h_ze, "zeInit");
+    _thapi_ze_driver_get_fn_t     ze_driver  = (_thapi_ze_driver_get_fn_t)
+        dlsym(h_ze, "zeDriverGet");
+    _thapi_ze_context_create_fn_t ze_ctx_new = (_thapi_ze_context_create_fn_t)
+        dlsym(h_ze, "zeContextCreate");
+    _ze_mem_get_alloc_properties = (_thapi_ze_mem_get_alloc_properties_fn_t)
+        dlsym(h_ze, "zeMemGetAllocProperties");
+    _ze_mem_get_address_range    = (_thapi_ze_mem_get_address_range_fn_t)
+        dlsym(h_ze, "zeMemGetAddressRange");
+
+    /* zeMemGetAllocProperties needs a context.  Create a private one against
+     * the first driver so introspection never touches application contexts. */
+    uint32_t n = 1;
+    _thapi_ze_driver_handle_t hDrv = NULL;
+    struct { uint32_t stype; const void *pNext; uint32_t flags; } desc = {
+        0x000d /* ZE_STRUCTURE_TYPE_CONTEXT_DESC */, NULL, 0 };
+    if (ze_init && ze_driver && ze_ctx_new &&
+        ze_init(_THAPI_ZE_INIT_FLAG_GPU_ONLY) == _THAPI_ZE_RESULT_SUCCESS &&
+        ze_driver(&n, &hDrv) == _THAPI_ZE_RESULT_SUCCESS && hDrv &&
+        ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx)
+            == _THAPI_ZE_RESULT_SUCCESS) {
+      if (verbose)
+        fprintf(stderr, "THAPI/MPI: Level Zero pointer introspection enabled\n");
+    } else {
+      _ze_mem_get_alloc_properties = NULL;
+      _ze_mem_get_address_range    = NULL;
+      _ze_introspection_ctx        = NULL;
+    }
+  }
+
+  void *h_cu = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_LOCAL);
+  if (!h_cu)
+    h_cu = dlopen("libcuda.so",   RTLD_LAZY | RTLD_LOCAL);
+  if (h_cu) {
+    _thapi_cu_init_fn_t cu_init =
+        (_thapi_cu_init_fn_t)dlsym(h_cu, "cuInit");
+    _cu_pointer_get_attribute = (_thapi_cu_pointer_get_attribute_fn_t)
+        dlsym(h_cu, "cuPointerGetAttribute");
+    if (cu_init && _cu_pointer_get_attribute &&
+        cu_init(0) == _THAPI_CU_SUCCESS) {
+      if (verbose)
+        fprintf(stderr, "THAPI/MPI: CUDA pointer introspection enabled\n");
+    } else {
+      _cu_pointer_get_attribute = NULL;
+    }
+  }
+
+  void *h_hip = dlopen("libamdhip64.so.5", RTLD_LAZY | RTLD_LOCAL);
+  if (!h_hip)
+    h_hip = dlopen("libamdhip64.so",   RTLD_LAZY | RTLD_LOCAL);
+  if (h_hip) {
+    _hip_pointer_get_attributes = (_thapi_hip_pointer_get_attributes_fn_t)
+        dlsym(h_hip, "hipPointerGetAttributes");
+    _hip_mem_get_address_range  = (_thapi_hip_mem_get_address_range_fn_t)
+        dlsym(h_hip, "hipMemGetAddressRange");
+    if (_hip_pointer_get_attributes && verbose)
+      fprintf(stderr, "THAPI/MPI: HIP pointer introspection enabled\n");
+  }
+}
+
 static void _load_tracer(void) {
   char *s = NULL;
   void *handle = NULL;
@@ -30,6 +413,7 @@ static void _load_tracer(void) {
     verbose = 1;
 
   find_mpi_symbols(handle, verbose);
+  _load_gpu_introspection(verbose);
 }
 
 static inline void _init_tracer(void) {

From 6daf724c5c8918320a2e1f624fbf19129fa4c7df Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 13:15:42 -0500
Subject: [PATCH 02/14] mpi: defer GPU introspection loader to first buffer
 query

Avoid creating an L0 context inside MPI_Init's pthread_once. The
application's runtime setup (zeInit, ZE_AFFINITY_MASK, etc.) is
allowed to complete before we touch the GPU stack.
---
 backends/mpi/tracer_mpi_helpers.include.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/backends/mpi/tracer_mpi_helpers.include.c b/backends/mpi/tracer_mpi_helpers.include.c
index 874bd7c22..03cd65712 100644
--- a/backends/mpi/tracer_mpi_helpers.include.c
+++ b/backends/mpi/tracer_mpi_helpers.include.c
@@ -283,6 +283,18 @@ static int _try_classify_hip(const void *ptr,
   return 1;
 }
 
+/* Lazy initialisation of the GPU introspection loaders.  Deferred until the
+ * first buffer classification so that the application has finished its own
+ * runtime setup (zeInit / cuInit / ZE_AFFINITY_MASK / etc.) by the time we
+ * peek at the GPU stack.  Doing this from _load_tracer() instead would create
+ * an L0 context before the app has set its environment, which is bad
+ * hygiene and can in rare cases trigger reentrancy through the L0 loader. */
+static void _load_gpu_introspection(int verbose);
+static pthread_once_t _gpu_introspection_once = PTHREAD_ONCE_INIT;
+static void _load_gpu_introspection_once(void) {
+  _load_gpu_introspection(getenv("LTTNG_UST_MPI_VERBOSE") ? 1 : 0);
+}
+
 static inline void _dump_buffer_info(const void *ptr, int role) {
   if (!ptr)
     return;
@@ -292,6 +304,8 @@ static inline void _dump_buffer_info(const void *ptr, int role) {
   if (!tracepoint_enabled(lttng_ust_mpi_properties, buffer_info))
     return;
 
+  pthread_once(&_gpu_introspection_once, _load_gpu_introspection_once);
+
   struct _thapi_mpi_cache_entry e;
   if (!_thapi_mpi_cache_lookup(ptr, &e)) {
     e.valid = 0;
@@ -413,7 +427,9 @@ static void _load_tracer(void) {
     verbose = 1;
 
   find_mpi_symbols(handle, verbose);
-  _load_gpu_introspection(verbose);
+  /* GPU pointer introspection is loaded lazily on first use (see
+   * _load_gpu_introspection_once), so the application's runtime setup
+   * completes before we touch L0/CUDA/HIP. */
 }
 
 static inline void _init_tracer(void) {

From 3a48873ec7893065a97fce8ebb383da7c3f21ace Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 13:25:35 -0500
Subject: [PATCH 03/14] mpi: drop GPU introspection diagnostic prints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make pointer introspection a silent default. Removes the only new
getenv on the branch — the existing LTTNG_UST_MPI_VERBOSE no longer
gates anything we added.
---
 backends/mpi/tracer_mpi_helpers.include.c | 29 ++++++++---------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/backends/mpi/tracer_mpi_helpers.include.c b/backends/mpi/tracer_mpi_helpers.include.c
index 03cd65712..725a6712d 100644
--- a/backends/mpi/tracer_mpi_helpers.include.c
+++ b/backends/mpi/tracer_mpi_helpers.include.c
@@ -289,10 +289,10 @@ static int _try_classify_hip(const void *ptr,
  * peek at the GPU stack.  Doing this from _load_tracer() instead would create
  * an L0 context before the app has set its environment, which is bad
  * hygiene and can in rare cases trigger reentrancy through the L0 loader. */
-static void _load_gpu_introspection(int verbose);
+static void _load_gpu_introspection(void);
 static pthread_once_t _gpu_introspection_once = PTHREAD_ONCE_INIT;
 static void _load_gpu_introspection_once(void) {
-  _load_gpu_introspection(getenv("LTTNG_UST_MPI_VERBOSE") ? 1 : 0);
+  _load_gpu_introspection();
 }
 
 static inline void _dump_buffer_info(const void *ptr, int role) {
@@ -331,7 +331,7 @@ static inline void _dump_buffer_info(const void *ptr, int role) {
 }
 
 /* --- Loader -------------------------------------------------------------- */
-static void _load_gpu_introspection(int verbose) {
+static void _load_gpu_introspection(void) {
   /* Each loader is optional: if the runtime isn't present on the system, the
    * corresponding classifier stays NULL and is silently skipped. */
   void *h_ze = dlopen("libze_loader.so.1", RTLD_LAZY | RTLD_LOCAL);
@@ -355,14 +355,11 @@ static void _load_gpu_introspection(int verbose) {
     _thapi_ze_driver_handle_t hDrv = NULL;
     struct { uint32_t stype; const void *pNext; uint32_t flags; } desc = {
         0x000d /* ZE_STRUCTURE_TYPE_CONTEXT_DESC */, NULL, 0 };
-    if (ze_init && ze_driver && ze_ctx_new &&
-        ze_init(_THAPI_ZE_INIT_FLAG_GPU_ONLY) == _THAPI_ZE_RESULT_SUCCESS &&
-        ze_driver(&n, &hDrv) == _THAPI_ZE_RESULT_SUCCESS && hDrv &&
-        ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx)
-            == _THAPI_ZE_RESULT_SUCCESS) {
-      if (verbose)
-        fprintf(stderr, "THAPI/MPI: Level Zero pointer introspection enabled\n");
-    } else {
+    if (!(ze_init && ze_driver && ze_ctx_new &&
+          ze_init(_THAPI_ZE_INIT_FLAG_GPU_ONLY) == _THAPI_ZE_RESULT_SUCCESS &&
+          ze_driver(&n, &hDrv) == _THAPI_ZE_RESULT_SUCCESS && hDrv &&
+          ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx)
+              == _THAPI_ZE_RESULT_SUCCESS)) {
       _ze_mem_get_alloc_properties = NULL;
       _ze_mem_get_address_range    = NULL;
       _ze_introspection_ctx        = NULL;
@@ -377,13 +374,9 @@ static void _load_gpu_introspection(int verbose) {
         (_thapi_cu_init_fn_t)dlsym(h_cu, "cuInit");
     _cu_pointer_get_attribute = (_thapi_cu_pointer_get_attribute_fn_t)
         dlsym(h_cu, "cuPointerGetAttribute");
-    if (cu_init && _cu_pointer_get_attribute &&
-        cu_init(0) == _THAPI_CU_SUCCESS) {
-      if (verbose)
-        fprintf(stderr, "THAPI/MPI: CUDA pointer introspection enabled\n");
-    } else {
+    if (!(cu_init && _cu_pointer_get_attribute &&
+          cu_init(0) == _THAPI_CU_SUCCESS))
       _cu_pointer_get_attribute = NULL;
-    }
   }
 
   void *h_hip = dlopen("libamdhip64.so.5", RTLD_LAZY | RTLD_LOCAL);
@@ -394,8 +387,6 @@ static void _load_gpu_introspection(int verbose) {
         dlsym(h_hip, "hipPointerGetAttributes");
     _hip_mem_get_address_range  = (_thapi_hip_mem_get_address_range_fn_t)
         dlsym(h_hip, "hipMemGetAddressRange");
-    if (_hip_pointer_get_attributes && verbose)
-      fprintf(stderr, "THAPI/MPI: HIP pointer introspection enabled\n");
   }
 }
 

From 1f6a85d6888175e07ef1c617f734c1a3c9e2ddb3 Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 15:50:21 -0500
Subject: [PATCH 04/14] mpi: fix buffer_info_callback ptr type to match
 generated typedef

metababel strips qualifiers when emitting the callback typedef, so the
'const void *' field in mpi_events.yaml becomes 'void *' in the
generated lttng_ust_mpi_properties_buffer_info_callback_f signature.
Match it in the callback.
---
 backends/mpi/btx_mpiinterval_callbacks.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp
index 1e29ed0ef..cada09ff6 100644
--- a/backends/mpi/btx_mpiinterval_callbacks.cpp
+++ b/backends/mpi/btx_mpiinterval_callbacks.cpp
@@ -178,7 +178,7 @@ static void buffer_info_callback(void *btx_handle,
                                  const char *hostname,
                                  int64_t vpid,
                                  uint64_t vtid,
-                                 uintptr_t ptr,
+                                 void *ptr,
                                  int role,
                                  int kind,
                                  int backend,

From 3f2360a79d07545cd34049b9ffca663d20f2887d Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 15:54:01 -0500
Subject: [PATCH 05/14] mpi: use symbol key when reading
 mpi_meta_parameters.yaml

The YAML top-level key is the Ruby symbol :meta_parameters
(serialised as ':meta_parameters:'), so fetch('meta_parameters', ...)
returns the empty default. Switch to fetch(:meta_parameters, {}) so
prologues are actually registered for all 145 buffer-bearing MPI
calls. Without this, _dump_buffer_info was unreferenced in the
generated tracer_mpi.c and the build failed under -Werror.
---
 backends/mpi/mpi_model.rb | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backends/mpi/mpi_model.rb b/backends/mpi/mpi_model.rb
index 6084c4084..eda8c8459 100644
--- a/backends/mpi/mpi_model.rb
+++ b/backends/mpi/mpi_model.rb
@@ -110,7 +110,9 @@ def underscore(camel_cased_word)
   origin_addr result_addr compare_addr
 ].freeze
 
-$mpi_meta_parameters.fetch('meta_parameters', []).each do |func, list|
+# Note: the YAML top-level key is the Ruby symbol :meta_parameters (it's
+# serialised as ":meta_parameters:"), not the string "meta_parameters".
+$mpi_meta_parameters.fetch(:meta_parameters, {}).each do |func, list|
   buffers = list.select do |type, name|
     %w[InScalar OutScalar].include?(type) &&
       MPI_PAYLOAD_BUFFER_NAMES.include?(name)

From f2dfaf3a65d20a385015aafdd96578162e9bb9ec Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 16:27:53 -0500
Subject: [PATCH 06/14] xprof: enable lttng_ust_mpi_properties:* events in
 iprof sessions

iprof manages its own LTTng session and ignores tracer_mpi.sh, so the
enable-event line previously added to tracer_mpi.sh.in never took
effect under iprof. Add the matching line in enable_events_mpi() so
buffer_info events make it into the trace under both invocation paths.
---
 xprof/xprof.rb.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in
index 9b4804dd6..553312e38 100755
--- a/xprof/xprof.rb.in
+++ b/xprof/xprof.rb.in
@@ -639,6 +639,7 @@ def enable_events_mpi(channel_name, tracing_mode: 'default', profiling: true)
   end
   LOGGER.debug('Profiling is ignored for mpi') if profiling
   exec("#{lttng_enable} lttng_ust_mpi_type:*")
+  exec("#{lttng_enable} lttng_ust_mpi_properties:*")
 end
 
 def enable_events_cxi(channel_name, tracing_mode: 'default', profiling: true); end

From a1dadb67fb4f09bd5199bab6a5b5cde530fbccd5 Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 17:08:41 -0500
Subject: [PATCH 07/14] mpi: apply clang-format, yamlfmt, and rubocop fixes

- clang-format-18 on btx_mpiinterval_callbacks.cpp and
  tracer_mpi_helpers.include.c (strip vertical alignment, collapse
  multi-statement switch cases).
- yamlfmt on mpi_events.yaml (drop --- marker, strip inner spaces
  from flow sequences).
- rubocop: 'Note:' -> 'NOTE:' in mpi_model.rb (Style/CommentAnnotation).
---
 backends/mpi/btx_mpiinterval_callbacks.cpp |   6 +-
 backends/mpi/mpi_events.yaml               |  38 ++-
 backends/mpi/mpi_model.rb                  |   2 +-
 backends/mpi/tracer_mpi_helpers.include.c  | 270 ++++++++++-----------
 4 files changed, 156 insertions(+), 160 deletions(-)

diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp
index cada09ff6..43dca7878 100644
--- a/backends/mpi/btx_mpiinterval_callbacks.cpp
+++ b/backends/mpi/btx_mpiinterval_callbacks.cpp
@@ -10,9 +10,9 @@
 // Must match the enum in backends/mpi/tracer_mpi_helpers.include.c.
 enum {
   THAPI_MPI_PTR_UNKNOWN = 0,
-  THAPI_MPI_PTR_HOST    = 1,
-  THAPI_MPI_PTR_DEVICE  = 2,
-  THAPI_MPI_PTR_SHARED  = 3
+  THAPI_MPI_PTR_HOST = 1,
+  THAPI_MPI_PTR_DEVICE = 2,
+  THAPI_MPI_PTR_SHARED = 3
 };
 
 struct data_s {
diff --git a/backends/mpi/mpi_events.yaml b/backends/mpi/mpi_events.yaml
index 8a8060873..51a03be66 100644
--- a/backends/mpi/mpi_events.yaml
+++ b/backends/mpi/mpi_events.yaml
@@ -1,14 +1,12 @@
----
 lttng_ust_mpi_type:
   events:
   - name: property
     args:
-    - [ MPI_Datatype, datatype]
-    - [ int, size]
+    - [MPI_Datatype, datatype]
+    - [int, size]
     fields:
-    - [ ctf_integer_hex, uintptr_t, datatype, "(uintptr_t)datatype" ]
-    - [ ctf_integer, int, size, size ]
-
+    - [ctf_integer_hex, uintptr_t, datatype, "(uintptr_t)datatype"]
+    - [ctf_integer, int, size, size]
 lttng_ust_mpi_properties:
   events:
   # Emitted once per buffer argument of a buffer-bearing MPI call.
@@ -17,18 +15,18 @@ lttng_ust_mpi_properties:
   # `role`    : 0 input (send-like), 1 output (recv-like)
   - name: buffer_info
     args:
-    - [ const void *, ptr ]
-    - [ int, role ]
-    - [ int, kind ]
-    - [ int, backend ]
-    - [ uintptr_t, base ]
-    - [ uint64_t, size ]
-    - [ uint32_t, device_ordinal ]
+    - [const void *, ptr]
+    - [int, role]
+    - [int, kind]
+    - [int, backend]
+    - [uintptr_t, base]
+    - [uint64_t, size]
+    - [uint32_t, device_ordinal]
     fields:
-    - [ ctf_integer_hex, uintptr_t, ptr, "(uintptr_t)ptr" ]
-    - [ ctf_integer, int, role, role ]
-    - [ ctf_integer, int, kind, kind ]
-    - [ ctf_integer, int, backend, backend ]
-    - [ ctf_integer_hex, uintptr_t, base, base ]
-    - [ ctf_integer, uint64_t, size, size ]
-    - [ ctf_integer, uint32_t, device_ordinal, device_ordinal ]
+    - [ctf_integer_hex, uintptr_t, ptr, "(uintptr_t)ptr"]
+    - [ctf_integer, int, role, role]
+    - [ctf_integer, int, kind, kind]
+    - [ctf_integer, int, backend, backend]
+    - [ctf_integer_hex, uintptr_t, base, base]
+    - [ctf_integer, uint64_t, size, size]
+    - [ctf_integer, uint32_t, device_ordinal, device_ordinal]
diff --git a/backends/mpi/mpi_model.rb b/backends/mpi/mpi_model.rb
index eda8c8459..0568314df 100644
--- a/backends/mpi/mpi_model.rb
+++ b/backends/mpi/mpi_model.rb
@@ -110,7 +110,7 @@ def underscore(camel_cased_word)
   origin_addr result_addr compare_addr
 ].freeze
 
-# Note: the YAML top-level key is the Ruby symbol :meta_parameters (it's
+# NOTE: the YAML top-level key is the Ruby symbol :meta_parameters (it's
 # serialised as ":meta_parameters:"), not the string "meta_parameters".
 $mpi_meta_parameters.fetch(:meta_parameters, {}).each do |func, list|
   buffers = list.select do |type, name|
diff --git a/backends/mpi/tracer_mpi_helpers.include.c b/backends/mpi/tracer_mpi_helpers.include.c
index 725a6712d..b4fbcf43f 100644
--- a/backends/mpi/tracer_mpi_helpers.include.c
+++ b/backends/mpi/tracer_mpi_helpers.include.c
@@ -22,84 +22,81 @@ static volatile unsigned int _initialized = 0;
  * (extract/, btx_mpiinterval_callbacks.cpp). */
 enum thapi_mpi_ptr_kind {
   THAPI_MPI_PTR_UNKNOWN = 0,
-  THAPI_MPI_PTR_HOST    = 1,
-  THAPI_MPI_PTR_DEVICE  = 2,
-  THAPI_MPI_PTR_SHARED  = 3
+  THAPI_MPI_PTR_HOST = 1,
+  THAPI_MPI_PTR_DEVICE = 2,
+  THAPI_MPI_PTR_SHARED = 3
 };
 
 enum thapi_mpi_ptr_backend {
   THAPI_MPI_BACKEND_NONE = 0,
-  THAPI_MPI_BACKEND_L0   = 1,
+  THAPI_MPI_BACKEND_L0 = 1,
   THAPI_MPI_BACKEND_CUDA = 2,
-  THAPI_MPI_BACKEND_HIP  = 3
+  THAPI_MPI_BACKEND_HIP = 3
 };
 
-enum thapi_mpi_buffer_role {
-  THAPI_MPI_ROLE_IN  = 0,
-  THAPI_MPI_ROLE_OUT = 1
-};
+enum thapi_mpi_buffer_role { THAPI_MPI_ROLE_IN = 0, THAPI_MPI_ROLE_OUT = 1 };
 
 /* --- Level Zero ----------------------------------------------------------- */
 /* Opaque handle types and the minimal enums/structs needed to call
  * zeMemGetAllocProperties / zeMemGetAddressRange without including ze_api.h. */
-typedef int                        _thapi_ze_result_t;     /* ze_result_t */
-typedef struct _thapi_ze_driver   *_thapi_ze_driver_handle_t;
-typedef struct _thapi_ze_context  *_thapi_ze_context_handle_t;
-typedef struct _thapi_ze_device   *_thapi_ze_device_handle_t;
+typedef int _thapi_ze_result_t; /* ze_result_t */
+typedef struct _thapi_ze_driver *_thapi_ze_driver_handle_t;
+typedef struct _thapi_ze_context *_thapi_ze_context_handle_t;
+typedef struct _thapi_ze_device *_thapi_ze_device_handle_t;
 
-#define _THAPI_ZE_RESULT_SUCCESS                          0
+#define _THAPI_ZE_RESULT_SUCCESS 0
 #define _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES 0x000c
-#define _THAPI_ZE_INIT_FLAG_GPU_ONLY                   0x01
+#define _THAPI_ZE_INIT_FLAG_GPU_ONLY 0x01
 
 typedef enum {
   _THAPI_ZE_MEMORY_TYPE_UNKNOWN = 0,
-  _THAPI_ZE_MEMORY_TYPE_HOST    = 1,
-  _THAPI_ZE_MEMORY_TYPE_DEVICE  = 2,
-  _THAPI_ZE_MEMORY_TYPE_SHARED  = 3
+  _THAPI_ZE_MEMORY_TYPE_HOST = 1,
+  _THAPI_ZE_MEMORY_TYPE_DEVICE = 2,
+  _THAPI_ZE_MEMORY_TYPE_SHARED = 3
 } _thapi_ze_memory_type_t;
 
 typedef struct {
-  uint32_t                stype;
-  const void             *pNext;
+  uint32_t stype;
+  const void *pNext;
   _thapi_ze_memory_type_t type;
-  uint64_t                id;
-  uint64_t                pageSize;
+  uint64_t id;
+  uint64_t pageSize;
 } _thapi_ze_memory_allocation_properties_t;
 
 typedef _thapi_ze_result_t (*_thapi_ze_init_fn_t)(uint32_t flags);
 typedef _thapi_ze_result_t (*_thapi_ze_driver_get_fn_t)(uint32_t *pCount,
-                                                       _thapi_ze_driver_handle_t *phDrivers);
+                                                        _thapi_ze_driver_handle_t *phDrivers);
 typedef _thapi_ze_result_t (*_thapi_ze_context_create_fn_t)(_thapi_ze_driver_handle_t hDriver,
                                                             const void *desc,
                                                             _thapi_ze_context_handle_t *phContext);
 typedef _thapi_ze_result_t (*_thapi_ze_mem_get_alloc_properties_fn_t)(
-    _thapi_ze_context_handle_t hContext, const void *ptr,
+    _thapi_ze_context_handle_t hContext,
+    const void *ptr,
     _thapi_ze_memory_allocation_properties_t *pProps,
     _thapi_ze_device_handle_t *phDevice);
 typedef _thapi_ze_result_t (*_thapi_ze_mem_get_address_range_fn_t)(
-    _thapi_ze_context_handle_t hContext, const void *ptr,
-    void **pBase, size_t *pSize);
+    _thapi_ze_context_handle_t hContext, const void *ptr, void **pBase, size_t *pSize);
 
 static _thapi_ze_mem_get_alloc_properties_fn_t _ze_mem_get_alloc_properties = NULL;
-static _thapi_ze_mem_get_address_range_fn_t    _ze_mem_get_address_range    = NULL;
-static _thapi_ze_context_handle_t              _ze_introspection_ctx        = NULL;
+static _thapi_ze_mem_get_address_range_fn_t _ze_mem_get_address_range = NULL;
+static _thapi_ze_context_handle_t _ze_introspection_ctx = NULL;
 
 /* --- CUDA ----------------------------------------------------------------- */
 /* Driver API (libcuda.so) is preferred: it's the same on every CUDA install
  * and doesn't drag in libcudart. */
-typedef int          _thapi_cu_result_t;          /* CUresult */
+typedef int _thapi_cu_result_t;                   /* CUresult */
 typedef unsigned int _thapi_cu_attr_t;            /* CUpointer_attribute */
 typedef unsigned long long _thapi_cu_deviceptr_t; /* CUdeviceptr */
 
-#define _THAPI_CU_SUCCESS                        0
-#define _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE  2
-#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START  7
-#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE   8
+#define _THAPI_CU_SUCCESS 0
+#define _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE 2
+#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START 7
+#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE 8
 #define _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL 9
-#define _THAPI_CU_MEMORYTYPE_HOST                1
-#define _THAPI_CU_MEMORYTYPE_DEVICE              2
-#define _THAPI_CU_MEMORYTYPE_ARRAY               3
-#define _THAPI_CU_MEMORYTYPE_UNIFIED             4
+#define _THAPI_CU_MEMORYTYPE_HOST 1
+#define _THAPI_CU_MEMORYTYPE_DEVICE 2
+#define _THAPI_CU_MEMORYTYPE_ARRAY 3
+#define _THAPI_CU_MEMORYTYPE_UNIFIED 4
 
 typedef _thapi_cu_result_t (*_thapi_cu_init_fn_t)(unsigned int flags);
 typedef _thapi_cu_result_t (*_thapi_cu_pointer_get_attribute_fn_t)(void *data,
@@ -111,29 +108,30 @@ static _thapi_cu_pointer_get_attribute_fn_t _cu_pointer_get_attribute = NULL;
 /* --- HIP ------------------------------------------------------------------ */
 typedef int _thapi_hip_error_t; /* hipError_t */
 
-#define _THAPI_HIP_SUCCESS               0
-#define _THAPI_HIP_MEMORY_TYPE_HOST      0
-#define _THAPI_HIP_MEMORY_TYPE_DEVICE    1
-#define _THAPI_HIP_MEMORY_TYPE_ARRAY     2
-#define _THAPI_HIP_MEMORY_TYPE_UNIFIED   3
-#define _THAPI_HIP_MEMORY_TYPE_MANAGED   4
+#define _THAPI_HIP_SUCCESS 0
+#define _THAPI_HIP_MEMORY_TYPE_HOST 0
+#define _THAPI_HIP_MEMORY_TYPE_DEVICE 1
+#define _THAPI_HIP_MEMORY_TYPE_ARRAY 2
+#define _THAPI_HIP_MEMORY_TYPE_UNIFIED 3
+#define _THAPI_HIP_MEMORY_TYPE_MANAGED 4
 
 typedef struct {
-  unsigned int  type;
-  int           device;
-  void         *devicePointer;
-  void         *hostPointer;
-  int           isManaged;
-  unsigned int  allocationFlags;
+  unsigned int type;
+  int device;
+  void *devicePointer;
+  void *hostPointer;
+  int isManaged;
+  unsigned int allocationFlags;
 } _thapi_hip_pointer_attribute_t;
 
 typedef _thapi_hip_error_t (*_thapi_hip_pointer_get_attributes_fn_t)(
     _thapi_hip_pointer_attribute_t *attr, const void *ptr);
-typedef _thapi_hip_error_t (*_thapi_hip_mem_get_address_range_fn_t)(
-    void **pbase, size_t *psize, void *dptr);
+typedef _thapi_hip_error_t (*_thapi_hip_mem_get_address_range_fn_t)(void **pbase,
+                                                                    size_t *psize,
+                                                                    void *dptr);
 
 static _thapi_hip_pointer_get_attributes_fn_t _hip_pointer_get_attributes = NULL;
-static _thapi_hip_mem_get_address_range_fn_t  _hip_mem_get_address_range  = NULL;
+static _thapi_hip_mem_get_address_range_fn_t _hip_mem_get_address_range = NULL;
 
 /* --- Per-thread address-range cache --------------------------------------- */
 /* Tiny LRU keyed on allocation base address.  Buffers reused across MPI
@@ -143,19 +141,17 @@ static _thapi_hip_mem_get_address_range_fn_t  _hip_mem_get_address_range  = NULL
 
 struct _thapi_mpi_cache_entry {
   uintptr_t base;
-  size_t    size;
-  uint8_t   kind;
-  uint8_t   backend;
-  uint32_t  device_ordinal;
-  uint8_t   valid;
+  size_t size;
+  uint8_t kind;
+  uint8_t backend;
+  uint32_t device_ordinal;
+  uint8_t valid;
 };
 
-static __thread struct _thapi_mpi_cache_entry
-    _thapi_mpi_cache[_THAPI_MPI_CACHE_SLOTS];
+static __thread struct _thapi_mpi_cache_entry _thapi_mpi_cache[_THAPI_MPI_CACHE_SLOTS];
 static __thread unsigned int _thapi_mpi_cache_next = 0;
 
-static inline int _thapi_mpi_cache_lookup(const void *ptr,
-                                          struct _thapi_mpi_cache_entry *out) {
+static inline int _thapi_mpi_cache_lookup(const void *ptr, struct _thapi_mpi_cache_entry *out) {
   uintptr_t p = (uintptr_t)ptr;
   for (unsigned int i = 0; i < _THAPI_MPI_CACHE_SLOTS; i++) {
     struct _thapi_mpi_cache_entry *e = &_thapi_mpi_cache[i];
@@ -169,33 +165,30 @@ static inline int _thapi_mpi_cache_lookup(const void *ptr,
 
 static inline void _thapi_mpi_cache_insert(const struct _thapi_mpi_cache_entry *e) {
   _thapi_mpi_cache[_thapi_mpi_cache_next] = *e;
-  _thapi_mpi_cache_next =
-      (_thapi_mpi_cache_next + 1) % _THAPI_MPI_CACHE_SLOTS;
+  _thapi_mpi_cache_next = (_thapi_mpi_cache_next + 1) % _THAPI_MPI_CACHE_SLOTS;
 }
 
 /* --- Runtime probing ------------------------------------------------------ */
-static int _try_classify_ze(const void *ptr,
-                            struct _thapi_mpi_cache_entry *out) {
+static int _try_classify_ze(const void *ptr, struct _thapi_mpi_cache_entry *out) {
   if (!_ze_mem_get_alloc_properties || !_ze_introspection_ctx)
     return 0;
 
   _thapi_ze_memory_allocation_properties_t props;
   props.stype = _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES;
   props.pNext = NULL;
-  props.type  = _THAPI_ZE_MEMORY_TYPE_UNKNOWN;
+  props.type = _THAPI_ZE_MEMORY_TYPE_UNKNOWN;
   _thapi_ze_device_handle_t hDev = NULL;
 
-  if (_ze_mem_get_alloc_properties(_ze_introspection_ctx, ptr, &props, &hDev)
-      != _THAPI_ZE_RESULT_SUCCESS)
+  if (_ze_mem_get_alloc_properties(_ze_introspection_ctx, ptr, &props, &hDev) !=
+      _THAPI_ZE_RESULT_SUCCESS)
     return 0;
   if (props.type == _THAPI_ZE_MEMORY_TYPE_UNKNOWN)
     return 0;
 
-  void  *base = NULL;
+  void *base = NULL;
   size_t size = 0;
-  if (_ze_mem_get_address_range &&
-      _ze_mem_get_address_range(_ze_introspection_ctx, ptr, &base, &size)
-          != _THAPI_ZE_RESULT_SUCCESS) {
+  if (_ze_mem_get_address_range && _ze_mem_get_address_range(_ze_introspection_ctx, ptr, &base,
+                                                             &size) != _THAPI_ZE_RESULT_SUCCESS) {
     base = (void *)ptr;
     size = 1;
   }
@@ -205,54 +198,66 @@ static int _try_classify_ze(const void *ptr,
   out->backend = THAPI_MPI_BACKEND_L0;
   out->device_ordinal = 0; /* L0 device ordinal lookup would need extra calls */
   switch (props.type) {
-    case _THAPI_ZE_MEMORY_TYPE_HOST:   out->kind = THAPI_MPI_PTR_HOST;   break;
-    case _THAPI_ZE_MEMORY_TYPE_DEVICE: out->kind = THAPI_MPI_PTR_DEVICE; break;
-    case _THAPI_ZE_MEMORY_TYPE_SHARED: out->kind = THAPI_MPI_PTR_SHARED; break;
-    default:                           out->kind = THAPI_MPI_PTR_UNKNOWN; break;
+  case _THAPI_ZE_MEMORY_TYPE_HOST:
+    out->kind = THAPI_MPI_PTR_HOST;
+    break;
+  case _THAPI_ZE_MEMORY_TYPE_DEVICE:
+    out->kind = THAPI_MPI_PTR_DEVICE;
+    break;
+  case _THAPI_ZE_MEMORY_TYPE_SHARED:
+    out->kind = THAPI_MPI_PTR_SHARED;
+    break;
+  default:
+    out->kind = THAPI_MPI_PTR_UNKNOWN;
+    break;
   }
   out->valid = 1;
   return 1;
 }
 
-static int _try_classify_cuda(const void *ptr,
-                              struct _thapi_mpi_cache_entry *out) {
+static int _try_classify_cuda(const void *ptr, struct _thapi_mpi_cache_entry *out) {
   if (!_cu_pointer_get_attribute)
     return 0;
 
-  unsigned int           mem_type = 0;
-  _thapi_cu_deviceptr_t  range_start = 0;
-  size_t                 range_size = 0;
-  int                    device_ordinal = 0;
-  _thapi_cu_deviceptr_t  dptr = (_thapi_cu_deviceptr_t)(uintptr_t)ptr;
+  unsigned int mem_type = 0;
+  _thapi_cu_deviceptr_t range_start = 0;
+  size_t range_size = 0;
+  int device_ordinal = 0;
+  _thapi_cu_deviceptr_t dptr = (_thapi_cu_deviceptr_t)(uintptr_t)ptr;
 
-  if (_cu_pointer_get_attribute(&mem_type,
-                                _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dptr)
-      != _THAPI_CU_SUCCESS || mem_type == 0)
+  if (_cu_pointer_get_attribute(&mem_type, _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dptr) !=
+          _THAPI_CU_SUCCESS ||
+      mem_type == 0)
     return 0;
 
-  (void)_cu_pointer_get_attribute(&range_start,
-                                  _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START, dptr);
-  (void)_cu_pointer_get_attribute(&range_size,
-                                  _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE, dptr);
-  (void)_cu_pointer_get_attribute(&device_ordinal,
-                                  _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dptr);
+  (void)_cu_pointer_get_attribute(&range_start, _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START, dptr);
+  (void)_cu_pointer_get_attribute(&range_size, _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE, dptr);
+  (void)_cu_pointer_get_attribute(&device_ordinal, _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+                                  dptr);
 
   out->base = range_start ? (uintptr_t)range_start : (uintptr_t)ptr;
   out->size = range_size ? range_size : 1;
   out->backend = THAPI_MPI_BACKEND_CUDA;
   out->device_ordinal = (uint32_t)device_ordinal;
   switch (mem_type) {
-    case _THAPI_CU_MEMORYTYPE_HOST:    out->kind = THAPI_MPI_PTR_HOST;   break;
-    case _THAPI_CU_MEMORYTYPE_DEVICE:  out->kind = THAPI_MPI_PTR_DEVICE; break;
-    case _THAPI_CU_MEMORYTYPE_UNIFIED: out->kind = THAPI_MPI_PTR_SHARED; break;
-    default:                           out->kind = THAPI_MPI_PTR_UNKNOWN; break;
+  case _THAPI_CU_MEMORYTYPE_HOST:
+    out->kind = THAPI_MPI_PTR_HOST;
+    break;
+  case _THAPI_CU_MEMORYTYPE_DEVICE:
+    out->kind = THAPI_MPI_PTR_DEVICE;
+    break;
+  case _THAPI_CU_MEMORYTYPE_UNIFIED:
+    out->kind = THAPI_MPI_PTR_SHARED;
+    break;
+  default:
+    out->kind = THAPI_MPI_PTR_UNKNOWN;
+    break;
   }
   out->valid = 1;
   return 1;
 }
 
-static int _try_classify_hip(const void *ptr,
-                             struct _thapi_mpi_cache_entry *out) {
+static int _try_classify_hip(const void *ptr, struct _thapi_mpi_cache_entry *out) {
   if (!_hip_pointer_get_attributes)
     return 0;
 
@@ -261,7 +266,7 @@ static int _try_classify_hip(const void *ptr,
   if (_hip_pointer_get_attributes(&attr, ptr) != _THAPI_HIP_SUCCESS)
     return 0;
 
-  void  *base = (void *)ptr;
+  void *base = (void *)ptr;
   size_t size = 1;
   if (_hip_mem_get_address_range)
     (void)_hip_mem_get_address_range(&base, &size, (void *)ptr);
@@ -291,9 +296,7 @@ static int _try_classify_hip(const void *ptr,
  * hygiene and can in rare cases trigger reentrancy through the L0 loader. */
 static void _load_gpu_introspection(void);
 static pthread_once_t _gpu_introspection_once = PTHREAD_ONCE_INIT;
-static void _load_gpu_introspection_once(void) {
-  _load_gpu_introspection();
-}
+static void _load_gpu_introspection_once(void) { _load_gpu_introspection(); }
 
 static inline void _dump_buffer_info(const void *ptr, int role) {
   if (!ptr)
@@ -309,9 +312,7 @@ static inline void _dump_buffer_info(const void *ptr, int role) {
   struct _thapi_mpi_cache_entry e;
   if (!_thapi_mpi_cache_lookup(ptr, &e)) {
     e.valid = 0;
-    if (!_try_classify_ze(ptr, &e) &&
-        !_try_classify_cuda(ptr, &e) &&
-        !_try_classify_hip(ptr, &e)) {
+    if (!_try_classify_ze(ptr, &e) && !_try_classify_cuda(ptr, &e) && !_try_classify_hip(ptr, &e)) {
       /* Nobody claims it: treat as host. Cache a 1-byte range so we still
        * answer in O(1) on the next call, but don't pollute the cache with
        * adjacent host allocations. */
@@ -325,8 +326,7 @@ static inline void _dump_buffer_info(const void *ptr, int role) {
     _thapi_mpi_cache_insert(&e);
   }
 
-  do_tracepoint(lttng_ust_mpi_properties, buffer_info,
-                ptr, role, (int)e.kind, (int)e.backend,
+  do_tracepoint(lttng_ust_mpi_properties, buffer_info, ptr, role, (int)e.kind, (int)e.backend,
                 e.base, (uint64_t)e.size, e.device_ordinal);
 }
 
@@ -336,57 +336,55 @@ static void _load_gpu_introspection(void) {
    * corresponding classifier stays NULL and is silently skipped. */
   void *h_ze = dlopen("libze_loader.so.1", RTLD_LAZY | RTLD_LOCAL);
   if (!h_ze)
-    h_ze = dlopen("libze_loader.so",   RTLD_LAZY | RTLD_LOCAL);
+    h_ze = dlopen("libze_loader.so", RTLD_LAZY | RTLD_LOCAL);
   if (h_ze) {
-    _thapi_ze_init_fn_t           ze_init    = (_thapi_ze_init_fn_t)
-        dlsym(h_ze, "zeInit");
-    _thapi_ze_driver_get_fn_t     ze_driver  = (_thapi_ze_driver_get_fn_t)
-        dlsym(h_ze, "zeDriverGet");
-    _thapi_ze_context_create_fn_t ze_ctx_new = (_thapi_ze_context_create_fn_t)
-        dlsym(h_ze, "zeContextCreate");
-    _ze_mem_get_alloc_properties = (_thapi_ze_mem_get_alloc_properties_fn_t)
-        dlsym(h_ze, "zeMemGetAllocProperties");
-    _ze_mem_get_address_range    = (_thapi_ze_mem_get_address_range_fn_t)
-        dlsym(h_ze, "zeMemGetAddressRange");
+    _thapi_ze_init_fn_t ze_init = (_thapi_ze_init_fn_t)dlsym(h_ze, "zeInit");
+    _thapi_ze_driver_get_fn_t ze_driver = (_thapi_ze_driver_get_fn_t)dlsym(h_ze, "zeDriverGet");
+    _thapi_ze_context_create_fn_t ze_ctx_new =
+        (_thapi_ze_context_create_fn_t)dlsym(h_ze, "zeContextCreate");
+    _ze_mem_get_alloc_properties =
+        (_thapi_ze_mem_get_alloc_properties_fn_t)dlsym(h_ze, "zeMemGetAllocProperties");
+    _ze_mem_get_address_range =
+        (_thapi_ze_mem_get_address_range_fn_t)dlsym(h_ze, "zeMemGetAddressRange");
 
     /* zeMemGetAllocProperties needs a context.  Create a private one against
      * the first driver so introspection never touches application contexts. */
     uint32_t n = 1;
     _thapi_ze_driver_handle_t hDrv = NULL;
-    struct { uint32_t stype; const void *pNext; uint32_t flags; } desc = {
-        0x000d /* ZE_STRUCTURE_TYPE_CONTEXT_DESC */, NULL, 0 };
+    struct {
+      uint32_t stype;
+      const void *pNext;
+      uint32_t flags;
+    } desc = {0x000d /* ZE_STRUCTURE_TYPE_CONTEXT_DESC */, NULL, 0};
     if (!(ze_init && ze_driver && ze_ctx_new &&
           ze_init(_THAPI_ZE_INIT_FLAG_GPU_ONLY) == _THAPI_ZE_RESULT_SUCCESS &&
           ze_driver(&n, &hDrv) == _THAPI_ZE_RESULT_SUCCESS && hDrv &&
-          ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx)
-              == _THAPI_ZE_RESULT_SUCCESS)) {
+          ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx) == _THAPI_ZE_RESULT_SUCCESS)) {
       _ze_mem_get_alloc_properties = NULL;
-      _ze_mem_get_address_range    = NULL;
-      _ze_introspection_ctx        = NULL;
+      _ze_mem_get_address_range = NULL;
+      _ze_introspection_ctx = NULL;
     }
   }
 
   void *h_cu = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_LOCAL);
   if (!h_cu)
-    h_cu = dlopen("libcuda.so",   RTLD_LAZY | RTLD_LOCAL);
+    h_cu = dlopen("libcuda.so", RTLD_LAZY | RTLD_LOCAL);
   if (h_cu) {
-    _thapi_cu_init_fn_t cu_init =
-        (_thapi_cu_init_fn_t)dlsym(h_cu, "cuInit");
-    _cu_pointer_get_attribute = (_thapi_cu_pointer_get_attribute_fn_t)
-        dlsym(h_cu, "cuPointerGetAttribute");
-    if (!(cu_init && _cu_pointer_get_attribute &&
-          cu_init(0) == _THAPI_CU_SUCCESS))
+    _thapi_cu_init_fn_t cu_init = (_thapi_cu_init_fn_t)dlsym(h_cu, "cuInit");
+    _cu_pointer_get_attribute =
+        (_thapi_cu_pointer_get_attribute_fn_t)dlsym(h_cu, "cuPointerGetAttribute");
+    if (!(cu_init && _cu_pointer_get_attribute && cu_init(0) == _THAPI_CU_SUCCESS))
       _cu_pointer_get_attribute = NULL;
   }
 
   void *h_hip = dlopen("libamdhip64.so.5", RTLD_LAZY | RTLD_LOCAL);
   if (!h_hip)
-    h_hip = dlopen("libamdhip64.so",   RTLD_LAZY | RTLD_LOCAL);
+    h_hip = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_LOCAL);
   if (h_hip) {
-    _hip_pointer_get_attributes = (_thapi_hip_pointer_get_attributes_fn_t)
-        dlsym(h_hip, "hipPointerGetAttributes");
-    _hip_mem_get_address_range  = (_thapi_hip_mem_get_address_range_fn_t)
-        dlsym(h_hip, "hipMemGetAddressRange");
+    _hip_pointer_get_attributes =
+        (_thapi_hip_pointer_get_attributes_fn_t)dlsym(h_hip, "hipPointerGetAttributes");
+    _hip_mem_get_address_range =
+        (_thapi_hip_mem_get_address_range_fn_t)dlsym(h_hip, "hipMemGetAddressRange");
   }
 }
 

From 7a388deae2c807ff0d8bd3799a2fffb4740fc527 Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 17:23:12 -0500
Subject: [PATCH 08/14] Revert producer-side GPU introspection

Per review (Thomas Applencourt): the MPI tracer should not call into
the GPU runtime to classify pointers. Specifically:

  - calling zeInit can break apps that rely on zesInit being first
    (deprecated API + sysman interaction); THAPI takes care to never
    call it,
  - dlopen('libze_loader.so') from inside an LD_PRELOAD'd libmpi.so
    actually opens the ze backend's own interceptor wrapper, so every
    zeMemGetAllocProperties query produces a spurious ze event and
    self-traces,
  - querying zeMemGetAllocProperties with a private context returns
    correct results only by accident on the current Intel L0 loader.

Revert all producer-side changes (tracer_mpi_helpers.include.c,
mpi_events.yaml, mpi_model.rb prologue registration, gen_mpi.rb
includes, Makefile.am probe entry, tracer_mpi.sh.in / xprof.rb.in
enable-event lines, mpi.h.include). Classification will be done
post-mortem in btx_mpiinterval_callbacks.cpp using the existing
zeMemAlloc*/zeMemFree events emitted by the ze backend, mirroring
the rangeset_memory_{host,device,shared} pattern in
btx_zeinterval_callbacks.cpp.
---
 backends/mpi/Makefile.am                   |   3 +-
 backends/mpi/btx_mpiinterval_callbacks.cpp |  49 +--
 backends/mpi/gen_mpi.rb                    |   6 -
 backends/mpi/mpi.h.include                 |   3 +-
 backends/mpi/mpi_events.yaml               |  33 +-
 backends/mpi/mpi_model.rb                  |  41 ---
 backends/mpi/tracer_mpi.sh.in              |   2 -
 backends/mpi/tracer_mpi_helpers.include.c  | 389 ---------------------
 xprof/xprof.rb.in                          |   1 -
 9 files changed, 10 insertions(+), 517 deletions(-)

diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am
index 13400366e..f3d0ea28c 100644
--- a/backends/mpi/Makefile.am
+++ b/backends/mpi/Makefile.am
@@ -62,8 +62,7 @@ MPI_PROBES_INCL = $(MPI_PROBES:=.h)
 MPI_PROBES_SRC = $(MPI_PROBES:=.c)
 
 MPI_STATIC_PROBES = \
-	mpi_type \
-	mpi_properties
+	mpi_type
 
 MPI_STATIC_PROBES_TP = $(MPI_STATIC_PROBES:=.tp)
 MPI_STATIC_PROBES_INCL = $(MPI_STATIC_PROBES:=.h)
diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp
index 43dca7878..65f1cedd2 100644
--- a/backends/mpi/btx_mpiinterval_callbacks.cpp
+++ b/backends/mpi/btx_mpiinterval_callbacks.cpp
@@ -5,22 +5,9 @@
 #include <sstream>
 #include <string>
 #include <tuple>
-#include <unordered_set>
-
-// Must match the enum in backends/mpi/tracer_mpi_helpers.include.c.
-enum {
-  THAPI_MPI_PTR_UNKNOWN = 0,
-  THAPI_MPI_PTR_HOST = 1,
-  THAPI_MPI_PTR_DEVICE = 2,
-  THAPI_MPI_PTR_SHARED = 3
-};
 
 struct data_s {
   EntryState entry_state;
-  // Set of {hostname, vpid, vtid} for which the in-flight MPI call has been
-  // observed to carry at least one device/shared GPU buffer.  Populated by
-  // buffer_info_callback, consumed (and cleared) by send_host_message.
-  std::unordered_set<hpt_t> gpu_aware_calls;
 };
 
 typedef struct data_s data_t;
@@ -34,16 +21,9 @@ static void send_host_message(void *btx_handle,
                               uint64_t vtid,
                               bool err) {
 
-  auto *data = static_cast<data_t *>(usr_data);
-  const hpt_t hpt{hostname, vpid, vtid};
-
   std::string event_class_name_striped = strip_event_class_name_exit(event_class_name);
-  // GPU-aware MPI calls are prefixed with '*' so they stand out in the
-  // profile.  The flag is set by buffer_info_callback during this call.
-  if (data->gpu_aware_calls.erase(hpt) > 0)
-    event_class_name_striped.insert(0, 1, '*');
-
-  const int64_t entry_ts = data->entry_state.get_ts(hpt);
+  const int64_t entry_ts =
+      static_cast<data_t *>(usr_data)->entry_state.get_ts({hostname, vpid, vtid});
 
   btx_push_message_lttng_host(btx_handle, hostname, vpid, vtid, entry_ts, BACKEND_MPI,
                               event_class_name_striped.c_str(), (ts - entry_ts), err);
@@ -169,29 +149,6 @@ static void type_property_callback(void *btx_handle,
   mpi_datatype_info[(uint64_t)datatype] = {std::to_string((uint64_t)datatype), size};
 }
 
-// Emitted from the prologue of every buffer-bearing MPI call, once per
-// buffer argument.  We only care whether at least one buffer is on the GPU
-// (device or shared/managed); that's enough to tag the call as GPU-aware.
-static void buffer_info_callback(void *btx_handle,
-                                 void *usr_data,
-                                 int64_t ts,
-                                 const char *hostname,
-                                 int64_t vpid,
-                                 uint64_t vtid,
-                                 void *ptr,
-                                 int role,
-                                 int kind,
-                                 int backend,
-                                 uintptr_t base,
-                                 uint64_t size,
-                                 uint32_t device_ordinal) {
-
-  if (kind == THAPI_MPI_PTR_DEVICE || kind == THAPI_MPI_PTR_SHARED) {
-    auto *data = static_cast<data_t *>(usr_data);
-    data->gpu_aware_calls.insert({hostname, vpid, vtid});
-  }
-}
-
 static void traffic_MPI_Count_entry_callback(void *btx_handle,
                                              void *usr_data,
                                              int64_t ts,
@@ -252,6 +209,4 @@ void btx_register_usr_callbacks(void *btx_handle) {
   btx_register_callbacks_traffic_int_entry(btx_handle, &traffic_int_entry_callback);
 
   btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback);
-
-  btx_register_callbacks_lttng_ust_mpi_properties_buffer_info(btx_handle, &buffer_info_callback);
 }
diff --git a/backends/mpi/gen_mpi.rb b/backends/mpi/gen_mpi.rb
index 3c0e349cd..bd939a9bb 100644
--- a/backends/mpi/gen_mpi.rb
+++ b/backends/mpi/gen_mpi.rb
@@ -19,10 +19,6 @@ def common_block(c, provider)
   tracepoint_params = c.tracepoint_parameters.reject { |p| p.after? }.collect(&:name)
   puts "  tracepoint(#{provider}, #{c.name}_#{START}, #{(tp_params + tracepoint_params).join(', ')});"
 
-  c.prologues.each do |p|
-    puts p
-  end
-
   params = c.parameters.collect(&:name)
   if c.has_return_type?
     puts "  #{c.type} _retval;"
@@ -79,12 +75,10 @@ def define_and_find_mpi_symbols
 
 puts <<~EOF
   #include <stdint.h>
-  #include <string.h>
   #define MPICH_FORTRAN_SYMBOLS_NONABI
   #include <mpi.h>
   #include "mpi_tracepoints.h"
   #include "mpi_type.h"
-  #include "mpi_properties.h"
   #include <dlfcn.h>
   #include <pthread.h>
 EOF
diff --git a/backends/mpi/mpi.h.include b/backends/mpi/mpi.h.include
index 4fe43271b..aed167967 100644
--- a/backends/mpi/mpi.h.include
+++ b/backends/mpi/mpi.h.include
@@ -1,3 +1,2 @@
 #include <mpi.h>
-#include <stdint.h>
-#include <stddef.h>
+
diff --git a/backends/mpi/mpi_events.yaml b/backends/mpi/mpi_events.yaml
index 51a03be66..48c6dcc96 100644
--- a/backends/mpi/mpi_events.yaml
+++ b/backends/mpi/mpi_events.yaml
@@ -1,32 +1,11 @@
+---
 lttng_ust_mpi_type:
   events:
   - name: property
     args:
-    - [MPI_Datatype, datatype]
-    - [int, size]
+    - [ MPI_Datatype, datatype]
+    - [ int, size]
     fields:
-    - [ctf_integer_hex, uintptr_t, datatype, "(uintptr_t)datatype"]
-    - [ctf_integer, int, size, size]
-lttng_ust_mpi_properties:
-  events:
-  # Emitted once per buffer argument of a buffer-bearing MPI call.
-  # `kind`    : 0 unknown, 1 host, 2 device, 3 shared/managed
-  # `backend` : 0 none,    1 level-zero, 2 cuda, 3 hip
-  # `role`    : 0 input (send-like), 1 output (recv-like)
-  - name: buffer_info
-    args:
-    - [const void *, ptr]
-    - [int, role]
-    - [int, kind]
-    - [int, backend]
-    - [uintptr_t, base]
-    - [uint64_t, size]
-    - [uint32_t, device_ordinal]
-    fields:
-    - [ctf_integer_hex, uintptr_t, ptr, "(uintptr_t)ptr"]
-    - [ctf_integer, int, role, role]
-    - [ctf_integer, int, kind, kind]
-    - [ctf_integer, int, backend, backend]
-    - [ctf_integer_hex, uintptr_t, base, base]
-    - [ctf_integer, uint64_t, size, size]
-    - [ctf_integer, uint32_t, device_ordinal, device_ordinal]
+    - [ ctf_integer_hex, uintptr_t, datatype, "(uintptr_t)datatype" ]
+    - [ ctf_integer, int, size, size ]
+
diff --git a/backends/mpi/mpi_model.rb b/backends/mpi/mpi_model.rb
index 0568314df..c8f3cc56c 100644
--- a/backends/mpi/mpi_model.rb
+++ b/backends/mpi/mpi_model.rb
@@ -89,44 +89,3 @@ def underscore(camel_cased_word)
   if (tracepoint_enabled(lttng_ust_mpi_type, property))#{' '}
     tracepoint(lttng_ust_mpi_type, property, *datatype, size);
 EOF
-
-# ---------------------------------------------------------------------------
-# GPU-aware MPI introspection
-#
-# For every MPI call that carries a user payload buffer, register a prologue
-# that emits an `lttng_ust_mpi_properties:buffer_info` event per buffer.  The
-# event records whether the pointer is host / device / shared memory and which
-# GPU runtime (Level Zero, CUDA, HIP) owns it.  Consumers (extract/,
-# btx_mpiinterval_callbacks.cpp) use that to tag each MPI call as "GPU-aware"
-# or "CPU-only".
-#
-# The set of buffer parameters is derived from mpi_meta_parameters.yaml so
-# new MPI APIs picked up by codegen are classified automatically: any
-# InScalar/OutScalar parameter whose name matches one of the canonical MPI
-# payload-buffer names below is treated as a user buffer.
-# ---------------------------------------------------------------------------
-MPI_PAYLOAD_BUFFER_NAMES = %w[
-  buf buffer sendbuf recvbuf inbuf outbuf inoutbuf
-  origin_addr result_addr compare_addr
-].freeze
-
-# NOTE: the YAML top-level key is the Ruby symbol :meta_parameters (it's
-# serialised as ":meta_parameters:"), not the string "meta_parameters".
-$mpi_meta_parameters.fetch(:meta_parameters, {}).each do |func, list|
-  buffers = list.select do |type, name|
-    %w[InScalar OutScalar].include?(type) &&
-      MPI_PAYLOAD_BUFFER_NAMES.include?(name)
-  end
-  next if buffers.empty?
-
-  body = buffers.map do |type, name|
-    role = type == 'InScalar' ? 'THAPI_MPI_ROLE_IN' : 'THAPI_MPI_ROLE_OUT'
-    "  _dump_buffer_info((const void *)#{name}, #{role});"
-  end.join("\n")
-
-  register_prologue func, <<~EOF
-    if (tracepoint_enabled(lttng_ust_mpi_properties, buffer_info)) {
-    #{body}
-    }
-  EOF
-end
diff --git a/backends/mpi/tracer_mpi.sh.in b/backends/mpi/tracer_mpi.sh.in
index e841ad94b..55343f6ca 100644
--- a/backends/mpi/tracer_mpi.sh.in
+++ b/backends/mpi/tracer_mpi.sh.in
@@ -81,8 +81,6 @@ fi
 lttng enable-channel --userspace --blocking-timeout=inf blocking-channel
 lttng add-context --userspace --channel=blocking-channel -t vpid -t vtid
 lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi:*
-lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi_type:*
-lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi_properties:*
 if [ -z "$LTTNG_UST_MPI_LIBMPI" ]; then
   LTTNG_UST_MPI_LIBMPI=$(whichlib64_head libmpi.so)
   if [ -n "$LTTNG_UST_MPI_LIBMPI" ]; then
diff --git a/backends/mpi/tracer_mpi_helpers.include.c b/backends/mpi/tracer_mpi_helpers.include.c
index b4fbcf43f..a432d2183 100644
--- a/backends/mpi/tracer_mpi_helpers.include.c
+++ b/backends/mpi/tracer_mpi_helpers.include.c
@@ -2,392 +2,6 @@ static pthread_once_t _init = PTHREAD_ONCE_INIT;
 static __thread volatile int in_init = 0;
 static volatile unsigned int _initialized = 0;
 
-/* ---------------------------------------------------------------------------
- * GPU pointer introspection
- *
- * Goal: when an MPI call carries a buffer pointer, decide at runtime whether
- * the pointer refers to host memory or to a GPU allocation, so that downstream
- * tools can tag the call as "GPU-aware MPI" vs "CPU-only MPI".
- *
- * Design constraints (matching the existing MPI backend):
- *   - No build-time dependency on Level Zero / CUDA / HIP.  Each loader is
- *     dlopen'd lazily, and only the symbols we need are looked up via dlsym
- *     using locally-declared function-pointer signatures over opaque types.
- *   - All overhead is gated by tracepoint_enabled() and a small per-thread
- *     cache keyed on the allocation base address, so a buffer reused across
- *     iterations is classified only once.
- * ------------------------------------------------------------------------- */
-
-/* Values reported in the buffer_info tracepoint.  Keep in sync with consumers
- * (extract/, btx_mpiinterval_callbacks.cpp). */
-enum thapi_mpi_ptr_kind {
-  THAPI_MPI_PTR_UNKNOWN = 0,
-  THAPI_MPI_PTR_HOST = 1,
-  THAPI_MPI_PTR_DEVICE = 2,
-  THAPI_MPI_PTR_SHARED = 3
-};
-
-enum thapi_mpi_ptr_backend {
-  THAPI_MPI_BACKEND_NONE = 0,
-  THAPI_MPI_BACKEND_L0 = 1,
-  THAPI_MPI_BACKEND_CUDA = 2,
-  THAPI_MPI_BACKEND_HIP = 3
-};
-
-enum thapi_mpi_buffer_role { THAPI_MPI_ROLE_IN = 0, THAPI_MPI_ROLE_OUT = 1 };
-
-/* --- Level Zero ----------------------------------------------------------- */
-/* Opaque handle types and the minimal enums/structs needed to call
- * zeMemGetAllocProperties / zeMemGetAddressRange without including ze_api.h. */
-typedef int _thapi_ze_result_t; /* ze_result_t */
-typedef struct _thapi_ze_driver *_thapi_ze_driver_handle_t;
-typedef struct _thapi_ze_context *_thapi_ze_context_handle_t;
-typedef struct _thapi_ze_device *_thapi_ze_device_handle_t;
-
-#define _THAPI_ZE_RESULT_SUCCESS 0
-#define _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES 0x000c
-#define _THAPI_ZE_INIT_FLAG_GPU_ONLY 0x01
-
-typedef enum {
-  _THAPI_ZE_MEMORY_TYPE_UNKNOWN = 0,
-  _THAPI_ZE_MEMORY_TYPE_HOST = 1,
-  _THAPI_ZE_MEMORY_TYPE_DEVICE = 2,
-  _THAPI_ZE_MEMORY_TYPE_SHARED = 3
-} _thapi_ze_memory_type_t;
-
-typedef struct {
-  uint32_t stype;
-  const void *pNext;
-  _thapi_ze_memory_type_t type;
-  uint64_t id;
-  uint64_t pageSize;
-} _thapi_ze_memory_allocation_properties_t;
-
-typedef _thapi_ze_result_t (*_thapi_ze_init_fn_t)(uint32_t flags);
-typedef _thapi_ze_result_t (*_thapi_ze_driver_get_fn_t)(uint32_t *pCount,
-                                                        _thapi_ze_driver_handle_t *phDrivers);
-typedef _thapi_ze_result_t (*_thapi_ze_context_create_fn_t)(_thapi_ze_driver_handle_t hDriver,
-                                                            const void *desc,
-                                                            _thapi_ze_context_handle_t *phContext);
-typedef _thapi_ze_result_t (*_thapi_ze_mem_get_alloc_properties_fn_t)(
-    _thapi_ze_context_handle_t hContext,
-    const void *ptr,
-    _thapi_ze_memory_allocation_properties_t *pProps,
-    _thapi_ze_device_handle_t *phDevice);
-typedef _thapi_ze_result_t (*_thapi_ze_mem_get_address_range_fn_t)(
-    _thapi_ze_context_handle_t hContext, const void *ptr, void **pBase, size_t *pSize);
-
-static _thapi_ze_mem_get_alloc_properties_fn_t _ze_mem_get_alloc_properties = NULL;
-static _thapi_ze_mem_get_address_range_fn_t _ze_mem_get_address_range = NULL;
-static _thapi_ze_context_handle_t _ze_introspection_ctx = NULL;
-
-/* --- CUDA ----------------------------------------------------------------- */
-/* Driver API (libcuda.so) is preferred: it's the same on every CUDA install
- * and doesn't drag in libcudart. */
-typedef int _thapi_cu_result_t;                   /* CUresult */
-typedef unsigned int _thapi_cu_attr_t;            /* CUpointer_attribute */
-typedef unsigned long long _thapi_cu_deviceptr_t; /* CUdeviceptr */
-
-#define _THAPI_CU_SUCCESS 0
-#define _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE 2
-#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START 7
-#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE 8
-#define _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL 9
-#define _THAPI_CU_MEMORYTYPE_HOST 1
-#define _THAPI_CU_MEMORYTYPE_DEVICE 2
-#define _THAPI_CU_MEMORYTYPE_ARRAY 3
-#define _THAPI_CU_MEMORYTYPE_UNIFIED 4
-
-typedef _thapi_cu_result_t (*_thapi_cu_init_fn_t)(unsigned int flags);
-typedef _thapi_cu_result_t (*_thapi_cu_pointer_get_attribute_fn_t)(void *data,
-                                                                   _thapi_cu_attr_t attr,
-                                                                   _thapi_cu_deviceptr_t ptr);
-
-static _thapi_cu_pointer_get_attribute_fn_t _cu_pointer_get_attribute = NULL;
-
-/* --- HIP ------------------------------------------------------------------ */
-typedef int _thapi_hip_error_t; /* hipError_t */
-
-#define _THAPI_HIP_SUCCESS 0
-#define _THAPI_HIP_MEMORY_TYPE_HOST 0
-#define _THAPI_HIP_MEMORY_TYPE_DEVICE 1
-#define _THAPI_HIP_MEMORY_TYPE_ARRAY 2
-#define _THAPI_HIP_MEMORY_TYPE_UNIFIED 3
-#define _THAPI_HIP_MEMORY_TYPE_MANAGED 4
-
-typedef struct {
-  unsigned int type;
-  int device;
-  void *devicePointer;
-  void *hostPointer;
-  int isManaged;
-  unsigned int allocationFlags;
-} _thapi_hip_pointer_attribute_t;
-
-typedef _thapi_hip_error_t (*_thapi_hip_pointer_get_attributes_fn_t)(
-    _thapi_hip_pointer_attribute_t *attr, const void *ptr);
-typedef _thapi_hip_error_t (*_thapi_hip_mem_get_address_range_fn_t)(void **pbase,
-                                                                    size_t *psize,
-                                                                    void *dptr);
-
-static _thapi_hip_pointer_get_attributes_fn_t _hip_pointer_get_attributes = NULL;
-static _thapi_hip_mem_get_address_range_fn_t _hip_mem_get_address_range = NULL;
-
-/* --- Per-thread address-range cache --------------------------------------- */
-/* Tiny LRU keyed on allocation base address.  Buffers reused across MPI
- * iterations (the common case in HPC) hit the cache and skip the runtime
- * query entirely.  Sized small enough to live cheaply per-thread. */
-#define _THAPI_MPI_CACHE_SLOTS 16
-
-struct _thapi_mpi_cache_entry {
-  uintptr_t base;
-  size_t size;
-  uint8_t kind;
-  uint8_t backend;
-  uint32_t device_ordinal;
-  uint8_t valid;
-};
-
-static __thread struct _thapi_mpi_cache_entry _thapi_mpi_cache[_THAPI_MPI_CACHE_SLOTS];
-static __thread unsigned int _thapi_mpi_cache_next = 0;
-
-static inline int _thapi_mpi_cache_lookup(const void *ptr, struct _thapi_mpi_cache_entry *out) {
-  uintptr_t p = (uintptr_t)ptr;
-  for (unsigned int i = 0; i < _THAPI_MPI_CACHE_SLOTS; i++) {
-    struct _thapi_mpi_cache_entry *e = &_thapi_mpi_cache[i];
-    if (e->valid && p >= e->base && p < e->base + e->size) {
-      *out = *e;
-      return 1;
-    }
-  }
-  return 0;
-}
-
-static inline void _thapi_mpi_cache_insert(const struct _thapi_mpi_cache_entry *e) {
-  _thapi_mpi_cache[_thapi_mpi_cache_next] = *e;
-  _thapi_mpi_cache_next = (_thapi_mpi_cache_next + 1) % _THAPI_MPI_CACHE_SLOTS;
-}
-
-/* --- Runtime probing ------------------------------------------------------ */
-static int _try_classify_ze(const void *ptr, struct _thapi_mpi_cache_entry *out) {
-  if (!_ze_mem_get_alloc_properties || !_ze_introspection_ctx)
-    return 0;
-
-  _thapi_ze_memory_allocation_properties_t props;
-  props.stype = _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES;
-  props.pNext = NULL;
-  props.type = _THAPI_ZE_MEMORY_TYPE_UNKNOWN;
-  _thapi_ze_device_handle_t hDev = NULL;
-
-  if (_ze_mem_get_alloc_properties(_ze_introspection_ctx, ptr, &props, &hDev) !=
-      _THAPI_ZE_RESULT_SUCCESS)
-    return 0;
-  if (props.type == _THAPI_ZE_MEMORY_TYPE_UNKNOWN)
-    return 0;
-
-  void *base = NULL;
-  size_t size = 0;
-  if (_ze_mem_get_address_range && _ze_mem_get_address_range(_ze_introspection_ctx, ptr, &base,
-                                                             &size) != _THAPI_ZE_RESULT_SUCCESS) {
-    base = (void *)ptr;
-    size = 1;
-  }
-
-  out->base = (uintptr_t)base;
-  out->size = size ? size : 1;
-  out->backend = THAPI_MPI_BACKEND_L0;
-  out->device_ordinal = 0; /* L0 device ordinal lookup would need extra calls */
-  switch (props.type) {
-  case _THAPI_ZE_MEMORY_TYPE_HOST:
-    out->kind = THAPI_MPI_PTR_HOST;
-    break;
-  case _THAPI_ZE_MEMORY_TYPE_DEVICE:
-    out->kind = THAPI_MPI_PTR_DEVICE;
-    break;
-  case _THAPI_ZE_MEMORY_TYPE_SHARED:
-    out->kind = THAPI_MPI_PTR_SHARED;
-    break;
-  default:
-    out->kind = THAPI_MPI_PTR_UNKNOWN;
-    break;
-  }
-  out->valid = 1;
-  return 1;
-}
-
-static int _try_classify_cuda(const void *ptr, struct _thapi_mpi_cache_entry *out) {
-  if (!_cu_pointer_get_attribute)
-    return 0;
-
-  unsigned int mem_type = 0;
-  _thapi_cu_deviceptr_t range_start = 0;
-  size_t range_size = 0;
-  int device_ordinal = 0;
-  _thapi_cu_deviceptr_t dptr = (_thapi_cu_deviceptr_t)(uintptr_t)ptr;
-
-  if (_cu_pointer_get_attribute(&mem_type, _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dptr) !=
-          _THAPI_CU_SUCCESS ||
-      mem_type == 0)
-    return 0;
-
-  (void)_cu_pointer_get_attribute(&range_start, _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START, dptr);
-  (void)_cu_pointer_get_attribute(&range_size, _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE, dptr);
-  (void)_cu_pointer_get_attribute(&device_ordinal, _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
-                                  dptr);
-
-  out->base = range_start ? (uintptr_t)range_start : (uintptr_t)ptr;
-  out->size = range_size ? range_size : 1;
-  out->backend = THAPI_MPI_BACKEND_CUDA;
-  out->device_ordinal = (uint32_t)device_ordinal;
-  switch (mem_type) {
-  case _THAPI_CU_MEMORYTYPE_HOST:
-    out->kind = THAPI_MPI_PTR_HOST;
-    break;
-  case _THAPI_CU_MEMORYTYPE_DEVICE:
-    out->kind = THAPI_MPI_PTR_DEVICE;
-    break;
-  case _THAPI_CU_MEMORYTYPE_UNIFIED:
-    out->kind = THAPI_MPI_PTR_SHARED;
-    break;
-  default:
-    out->kind = THAPI_MPI_PTR_UNKNOWN;
-    break;
-  }
-  out->valid = 1;
-  return 1;
-}
-
-static int _try_classify_hip(const void *ptr, struct _thapi_mpi_cache_entry *out) {
-  if (!_hip_pointer_get_attributes)
-    return 0;
-
-  _thapi_hip_pointer_attribute_t attr;
-  memset(&attr, 0, sizeof(attr));
-  if (_hip_pointer_get_attributes(&attr, ptr) != _THAPI_HIP_SUCCESS)
-    return 0;
-
-  void *base = (void *)ptr;
-  size_t size = 1;
-  if (_hip_mem_get_address_range)
-    (void)_hip_mem_get_address_range(&base, &size, (void *)ptr);
-
-  out->base = (uintptr_t)base;
-  out->size = size ? size : 1;
-  out->backend = THAPI_MPI_BACKEND_HIP;
-  out->device_ordinal = (uint32_t)attr.device;
-  if (attr.isManaged || attr.type == _THAPI_HIP_MEMORY_TYPE_UNIFIED ||
-      attr.type == _THAPI_HIP_MEMORY_TYPE_MANAGED)
-    out->kind = THAPI_MPI_PTR_SHARED;
-  else if (attr.type == _THAPI_HIP_MEMORY_TYPE_DEVICE)
-    out->kind = THAPI_MPI_PTR_DEVICE;
-  else if (attr.type == _THAPI_HIP_MEMORY_TYPE_HOST)
-    out->kind = THAPI_MPI_PTR_HOST;
-  else
-    out->kind = THAPI_MPI_PTR_UNKNOWN;
-  out->valid = 1;
-  return 1;
-}
-
-/* Lazy initialisation of the GPU introspection loaders.  Deferred until the
- * first buffer classification so that the application has finished its own
- * runtime setup (zeInit / cuInit / ZE_AFFINITY_MASK / etc.) by the time we
- * peek at the GPU stack.  Doing this from _load_tracer() instead would create
- * an L0 context before the app has set its environment, which is bad
- * hygiene and can in rare cases trigger reentrancy through the L0 loader. */
-static void _load_gpu_introspection(void);
-static pthread_once_t _gpu_introspection_once = PTHREAD_ONCE_INIT;
-static void _load_gpu_introspection_once(void) { _load_gpu_introspection(); }
-
-static inline void _dump_buffer_info(const void *ptr, int role) {
-  if (!ptr)
-    return;
-  /* MPI_IN_PLACE / MPI_BOTTOM are sentinel values, not real pointers. */
-  if (ptr == MPI_IN_PLACE || ptr == MPI_BOTTOM)
-    return;
-  if (!tracepoint_enabled(lttng_ust_mpi_properties, buffer_info))
-    return;
-
-  pthread_once(&_gpu_introspection_once, _load_gpu_introspection_once);
-
-  struct _thapi_mpi_cache_entry e;
-  if (!_thapi_mpi_cache_lookup(ptr, &e)) {
-    e.valid = 0;
-    if (!_try_classify_ze(ptr, &e) && !_try_classify_cuda(ptr, &e) && !_try_classify_hip(ptr, &e)) {
-      /* Nobody claims it: treat as host. Cache a 1-byte range so we still
-       * answer in O(1) on the next call, but don't pollute the cache with
-       * adjacent host allocations. */
-      e.base = (uintptr_t)ptr;
-      e.size = 1;
-      e.kind = THAPI_MPI_PTR_HOST;
-      e.backend = THAPI_MPI_BACKEND_NONE;
-      e.device_ordinal = 0;
-      e.valid = 1;
-    }
-    _thapi_mpi_cache_insert(&e);
-  }
-
-  do_tracepoint(lttng_ust_mpi_properties, buffer_info, ptr, role, (int)e.kind, (int)e.backend,
-                e.base, (uint64_t)e.size, e.device_ordinal);
-}
-
-/* --- Loader -------------------------------------------------------------- */
-static void _load_gpu_introspection(void) {
-  /* Each loader is optional: if the runtime isn't present on the system, the
-   * corresponding classifier stays NULL and is silently skipped. */
-  void *h_ze = dlopen("libze_loader.so.1", RTLD_LAZY | RTLD_LOCAL);
-  if (!h_ze)
-    h_ze = dlopen("libze_loader.so", RTLD_LAZY | RTLD_LOCAL);
-  if (h_ze) {
-    _thapi_ze_init_fn_t ze_init = (_thapi_ze_init_fn_t)dlsym(h_ze, "zeInit");
-    _thapi_ze_driver_get_fn_t ze_driver = (_thapi_ze_driver_get_fn_t)dlsym(h_ze, "zeDriverGet");
-    _thapi_ze_context_create_fn_t ze_ctx_new =
-        (_thapi_ze_context_create_fn_t)dlsym(h_ze, "zeContextCreate");
-    _ze_mem_get_alloc_properties =
-        (_thapi_ze_mem_get_alloc_properties_fn_t)dlsym(h_ze, "zeMemGetAllocProperties");
-    _ze_mem_get_address_range =
-        (_thapi_ze_mem_get_address_range_fn_t)dlsym(h_ze, "zeMemGetAddressRange");
-
-    /* zeMemGetAllocProperties needs a context.  Create a private one against
-     * the first driver so introspection never touches application contexts. */
-    uint32_t n = 1;
-    _thapi_ze_driver_handle_t hDrv = NULL;
-    struct {
-      uint32_t stype;
-      const void *pNext;
-      uint32_t flags;
-    } desc = {0x000d /* ZE_STRUCTURE_TYPE_CONTEXT_DESC */, NULL, 0};
-    if (!(ze_init && ze_driver && ze_ctx_new &&
-          ze_init(_THAPI_ZE_INIT_FLAG_GPU_ONLY) == _THAPI_ZE_RESULT_SUCCESS &&
-          ze_driver(&n, &hDrv) == _THAPI_ZE_RESULT_SUCCESS && hDrv &&
-          ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx) == _THAPI_ZE_RESULT_SUCCESS)) {
-      _ze_mem_get_alloc_properties = NULL;
-      _ze_mem_get_address_range = NULL;
-      _ze_introspection_ctx = NULL;
-    }
-  }
-
-  void *h_cu = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_LOCAL);
-  if (!h_cu)
-    h_cu = dlopen("libcuda.so", RTLD_LAZY | RTLD_LOCAL);
-  if (h_cu) {
-    _thapi_cu_init_fn_t cu_init = (_thapi_cu_init_fn_t)dlsym(h_cu, "cuInit");
-    _cu_pointer_get_attribute =
-        (_thapi_cu_pointer_get_attribute_fn_t)dlsym(h_cu, "cuPointerGetAttribute");
-    if (!(cu_init && _cu_pointer_get_attribute && cu_init(0) == _THAPI_CU_SUCCESS))
-      _cu_pointer_get_attribute = NULL;
-  }
-
-  void *h_hip = dlopen("libamdhip64.so.5", RTLD_LAZY | RTLD_LOCAL);
-  if (!h_hip)
-    h_hip = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_LOCAL);
-  if (h_hip) {
-    _hip_pointer_get_attributes =
-        (_thapi_hip_pointer_get_attributes_fn_t)dlsym(h_hip, "hipPointerGetAttributes");
-    _hip_mem_get_address_range =
-        (_thapi_hip_mem_get_address_range_fn_t)dlsym(h_hip, "hipMemGetAddressRange");
-  }
-}
-
 static void _load_tracer(void) {
   char *s = NULL;
   void *handle = NULL;
@@ -416,9 +30,6 @@ static void _load_tracer(void) {
     verbose = 1;
 
   find_mpi_symbols(handle, verbose);
-  /* GPU pointer introspection is loaded lazily on first use (see
-   * _load_gpu_introspection_once), so the application's runtime setup
-   * completes before we touch L0/CUDA/HIP. */
 }
 
 static inline void _init_tracer(void) {
diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in
index 553312e38..9b4804dd6 100755
--- a/xprof/xprof.rb.in
+++ b/xprof/xprof.rb.in
@@ -639,7 +639,6 @@ def enable_events_mpi(channel_name, tracing_mode: 'default', profiling: true)
   end
   LOGGER.debug('Profiling is ignored for mpi') if profiling
   exec("#{lttng_enable} lttng_ust_mpi_type:*")
-  exec("#{lttng_enable} lttng_ust_mpi_properties:*")
 end
 
 def enable_events_cxi(channel_name, tracing_mode: 'default', profiling: true); end

From c8cde68ee624d9973008fa569c619a559b0957f0 Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 17:54:34 -0500
Subject: [PATCH 09/14] mpi: consumer-side GPU-aware tagging via ze allocation
 events

Implements Thomas's review suggestion: instead of querying the GPU
runtime from the MPI tracer, classify MPI buffer pointers post-mortem
in btx_mpiinterval_callbacks.cpp by joining the ze backend's
zeMemAlloc/zeMemFree events into the MPI filter's upstream model.

Changes:

  backends/mpi/Makefile.am:
    Union backends/ze/btx_ze_model.yaml into the MPI filter's -u so
    metababel routes ze events to btx_filter_mpi. Adds a build-order
    dependency mpi -> ze.

  backends/mpi/btx_mpimatching_model.yaml:
    - Anchor the existing entries/exits categories to the lttng_ust_mpi
      provider so ze events do not accidentally enter them.
    - Add mpi_1buf_entry / mpi_2buf_entry categories that capture
      payload buffer pointer(s) of buffer-bearing MPI calls.
    - Add ze_alloc_entry / ze_alloc_exit / ze_free_entry categories
      that capture ze USM allocation events.

  backends/mpi/btx_mpiinterval_callbacks.cpp:
    - Maintain per-process address-range maps rangeset_memory_device
      and rangeset_memory_shared, mirroring the pattern in
      backends/ze/btx_zeinterval_callbacks.cpp.
    - Populate the maps from ze_alloc_*/ze_free_entry callbacks (host
      USM is intentionally not classified as GPU-aware).
    - On every buffer-bearing MPI entry event, look up each buffer
      pointer; if it falls in a tracked GPU range, record the call's
      {hostname,vpid,vtid} in gpu_aware_calls.
    - In send_host_message, prepend '*' to the MPI call name when the
      flag is set, so the tally aggregates GPU-aware and CPU-only
      variants as distinct rows.

When the user runs --backend mpi without --backend ze, no ze events
appear in the trace, the range-map stays empty, and the output is
unchanged from upstream (no '*' ever appears). No producer-side
changes; the MPI tracer hot path is unaffected.
---
 backends/mpi/Makefile.am                   |  11 +-
 backends/mpi/btx_mpiinterval_callbacks.cpp | 179 ++++++++++++++++++++-
 backends/mpi/btx_mpimatching_model.yaml    |  67 +++++++-
 3 files changed, 251 insertions(+), 6 deletions(-)

diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am
index f3d0ea28c..c115cf3ff 100644
--- a/backends/mpi/Makefile.am
+++ b/backends/mpi/Makefile.am
@@ -189,8 +189,15 @@ EXTRA_DIST += \
 	$(top_srcdir)/xprof/btx_interval_model.yaml \
 	btx_mpimatching_model.yaml
 
-$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_model.yaml
-	$(METABABEL) -u btx_mpi_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include
+# The MPI interval filter unions the ze upstream model so that
+# btx_mpiinterval_callbacks.cpp can subscribe to zeMemAlloc/zeMemFree
+# events and tag GPU-aware MPI calls via address-range lookup. This
+# creates a build-order dependency mpi -> ze; the ze backend must be
+# configured/built first when changing only the MPI backend.
+BTX_ZE_MODEL = $(top_builddir)/backends/ze/btx_ze_model.yaml
+
+$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_model.yaml $(BTX_ZE_MODEL)
+	$(METABABEL) -u btx_mpi_model.yaml,$(BTX_ZE_MODEL) -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include
 
 CLEANFILES += \
 	$(BTX_MPI_GENERATED) \
diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp
index 65f1cedd2..efd807071 100644
--- a/backends/mpi/btx_mpiinterval_callbacks.cpp
+++ b/backends/mpi/btx_mpiinterval_callbacks.cpp
@@ -1,17 +1,69 @@
 #include "xprof_utils.hpp"
 #include <iostream>
+#include <map>
 #include <metababel/metababel.h>
 #include <mpi.h.include>
 #include <sstream>
 #include <string>
 #include <tuple>
+#include <unordered_set>
+
+// ---------------------------------------------------------------------------
+// GPU-aware MPI tagging
+//
+// We maintain a per-process address-range map populated from
+// lttng_ust_ze:zeMemAlloc*_exit / zeMemFree_entry events emitted by the ze
+// backend (joined into our upstream model via backends/mpi/Makefile.am).
+// Each MPI buffer-bearing entry event looks up its buffer pointer in that
+// map; if the pointer falls inside a known GPU allocation, the
+// {hostname,vpid,vtid} key is recorded in gpu_aware_calls. The matching
+// exit callback consumes the key and prepends '*' to the MPI call name
+// before pushing it to the aggregator, producing a separate '*MPI_Foo'
+// row in the tally.
+//
+// When the user enables only --backend mpi, no ze events flow in, the
+// range-map stays empty, and no '*' ever appears (CPU-only profile).
+// ---------------------------------------------------------------------------
+
+// Per-process address-range map: base -> (base + size).
+typedef std::map<uintptr_t, uintptr_t> memory_interval_t;
 
 struct data_s {
   EntryState entry_state;
+  // GPU allocation ranges, keyed on {hostname, vpid}. Mirrors the
+  // rangeset_memory_{host,device,shared} pattern in
+  // backends/ze/btx_zeinterval_callbacks.cpp.
+  std::unordered_map<hp_t, memory_interval_t> rangeset_memory_device;
+  std::unordered_map<hp_t, memory_interval_t> rangeset_memory_shared;
+  // {hostname, vpid, vtid} of MPI calls observed to carry at least one
+  // device/shared GPU buffer. Populated in mpi_*_buf_entry_callback,
+  // consumed (and cleared) in send_host_message.
+  std::unordered_set<hpt_t> gpu_aware_calls;
 };
 
 typedef struct data_s data_t;
 
+// True iff `ptr` lies inside any allocation tracked in `m`.
+static bool memory_includes(const memory_interval_t &m, uintptr_t ptr) {
+  const auto it = m.upper_bound(ptr);
+  if (it == m.cbegin())
+    return false;
+  return ptr < std::prev(it)->second;
+}
+
+// True iff `ptr` lies inside a tracked GPU (device or shared) allocation
+// for the given {hostname, vpid}. Host USM is intentionally NOT tagged
+// as GPU-aware here (it behaves like host memory from MPI's perspective).
+static bool is_gpu_pointer(const data_t *data, const hp_t &hp, uintptr_t ptr) {
+  auto itd = data->rangeset_memory_device.find(hp);
+  if (itd != data->rangeset_memory_device.end() && memory_includes(itd->second, ptr))
+    return true;
+  auto its = data->rangeset_memory_shared.find(hp);
+  if (its != data->rangeset_memory_shared.end() && memory_includes(its->second, ptr))
+    return true;
+  return false;
+}
+
 static void send_host_message(void *btx_handle,
                               void *usr_data,
                               int64_t ts,
@@ -21,9 +73,16 @@ static void send_host_message(void *btx_handle,
                               uint64_t vtid,
                               bool err) {
 
+  auto *data = static_cast<data_t *>(usr_data);
+  const hpt_t hpt{hostname, vpid, vtid};
+
   std::string event_class_name_striped = strip_event_class_name_exit(event_class_name);
-  const int64_t entry_ts =
-      static_cast<data_t *>(usr_data)->entry_state.get_ts({hostname, vpid, vtid});
+  // GPU-aware MPI calls get a leading '*' so they sort and aggregate as a
+  // distinct row from their CPU-only counterparts in the tally output.
+  if (data->gpu_aware_calls.erase(hpt) > 0)
+    event_class_name_striped.insert(0, 1, '*');
+
+  const int64_t entry_ts = data->entry_state.get_ts(hpt);
 
   btx_push_message_lttng_host(btx_handle, hostname, vpid, vtid, entry_ts, BACKEND_MPI,
                               event_class_name_striped.c_str(), (ts - entry_ts), err);
@@ -198,6 +257,114 @@ static void traffic_int_entry_callback(void *btx_handle,
                                  count * size, oss.str().c_str());
 }
 
+// --- ze allocation / free tracking ----------------------------------------
+// Bind allocation size at entry; the matching exit callback uses it together
+// with the freshly-returned pointer to populate the range-map. The
+// event_class_name suffix (Device vs Shared vs Host) selects which rangeset
+// the allocation lands in.
+static void ze_alloc_entry_callback(void *btx_handle,
+                                    void *usr_data,
+                                    int64_t ts,
+                                    const char *event_class_name,
+                                    const char *hostname,
+                                    int64_t vpid,
+                                    uint64_t vtid,
+                                    size_t size) {
+  static_cast<data_t *>(usr_data)->entry_state.set_data({hostname, vpid, vtid}, size);
+}
+
+static void ze_alloc_exit_callback(void *btx_handle,
+                                   void *usr_data,
+                                   int64_t ts,
+                                   const char *event_class_name,
+                                   const char *hostname,
+                                   int64_t vpid,
+                                   uint64_t vtid,
+                                   int zeResult,
+                                   void *pptr_val) {
+  auto *data = static_cast<data_t *>(usr_data);
+  auto size = data->entry_state.get_data<size_t>({hostname, vpid, vtid});
+  if (zeResult != 0 || pptr_val == nullptr)
+    return;
+
+  const std::string ev{event_class_name};
+  // Host USM is reachable from the host CPU so MPI sees it as a normal host
+  // pointer; classifying it as GPU-aware would produce false positives.
+  // Device and Shared (managed) USM live (at least partly) on the device,
+  // so MPI calls on them are GPU-aware.
+  std::unordered_map<hp_t, memory_interval_t> *mi = nullptr;
+  if (ev.find("zeMemAllocDevice") != std::string::npos)
+    mi = &data->rangeset_memory_device;
+  else if (ev.find("zeMemAllocShared") != std::string::npos)
+    mi = &data->rangeset_memory_shared;
+  else
+    return; // host USM, ignored
+
+  const uintptr_t base = reinterpret_cast<uintptr_t>(pptr_val);
+  (*mi)[{hostname, vpid}][base] = base + size;
+}
+
+static void ze_free_entry_callback(void *btx_handle,
+                                   void *usr_data,
+                                   int64_t ts,
+                                   const char *event_class_name,
+                                   const char *hostname,
+                                   int64_t vpid,
+                                   uint64_t vtid,
+                                   void *ptr) {
+  auto *data = static_cast<data_t *>(usr_data);
+  if (ptr == nullptr)
+    return;
+  const uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
+  const hp_t hp{hostname, vpid};
+  // The freed pointer may have been classified as device OR shared at
+  // allocation; try both.
+  auto itd = data->rangeset_memory_device.find(hp);
+  if (itd != data->rangeset_memory_device.end())
+    itd->second.erase(base);
+  auto its = data->rangeset_memory_shared.find(hp);
+  if (its != data->rangeset_memory_shared.end())
+    its->second.erase(base);
+}
+
+// --- MPI buffer classification --------------------------------------------
+static void
+tag_if_gpu(data_t *data, const char *hostname, int64_t vpid, uint64_t vtid, const void *ptr) {
+  if (ptr == nullptr)
+    return;
+  // MPICH sentinels: MPI_IN_PLACE / MPI_BOTTOM. These would never match a
+  // real allocation but checking explicitly keeps the intent clear.
+  if (ptr == MPI_IN_PLACE || ptr == MPI_BOTTOM)
+    return;
+  if (is_gpu_pointer(data, {hostname, vpid}, reinterpret_cast<uintptr_t>(ptr)))
+    data->gpu_aware_calls.insert({hostname, vpid, vtid});
+}
+
+static void mpi_1buf_entry_callback(void *btx_handle,
+                                    void *usr_data,
+                                    int64_t ts,
+                                    const char *event_class_name,
+                                    const char *hostname,
+                                    int64_t vpid,
+                                    uint64_t vtid,
+                                    const void *buf) {
+  tag_if_gpu(static_cast<data_t *>(usr_data), hostname, vpid, vtid, buf);
+}
+
+static void mpi_2buf_entry_callback(void *btx_handle,
+                                    void *usr_data,
+                                    int64_t ts,
+                                    const char *event_class_name,
+                                    const char *hostname,
+                                    int64_t vpid,
+                                    uint64_t vtid,
+                                    const void *sendbuf,
+                                    const void *recvbuf) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, sendbuf);
+  tag_if_gpu(data, hostname, vpid, vtid, recvbuf);
+}
+
 void btx_register_usr_callbacks(void *btx_handle) {
   btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component);
   btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component);
@@ -209,4 +376,12 @@ void btx_register_usr_callbacks(void *btx_handle) {
   btx_register_callbacks_traffic_int_entry(btx_handle, &traffic_int_entry_callback);
 
   btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback);
+
+  // GPU-aware MPI: ze alloc/free updates the range-map, MPI buffer entries
+  // look up against it.
+  btx_register_callbacks_ze_alloc_entry(btx_handle, &ze_alloc_entry_callback);
+  btx_register_callbacks_ze_alloc_exit(btx_handle, &ze_alloc_exit_callback);
+  btx_register_callbacks_ze_free_entry(btx_handle, &ze_free_entry_callback);
+  btx_register_callbacks_mpi_1buf_entry(btx_handle, &mpi_1buf_entry_callback);
+  btx_register_callbacks_mpi_2buf_entry(btx_handle, &mpi_2buf_entry_callback);
 }
diff --git a/backends/mpi/btx_mpimatching_model.yaml b/backends/mpi/btx_mpimatching_model.yaml
index 829bbedc8..7183c1d00 100644
--- a/backends/mpi/btx_mpimatching_model.yaml
+++ b/backends/mpi/btx_mpimatching_model.yaml
@@ -14,11 +14,14 @@
       :field_class:
         :type: integer_unsigned
   :event_classes:
+  # Restrict the bare entry/exit patterns to the MPI provider so that
+  # ze allocation/free events sourced from the ze backend (see below)
+  # do not accidentally match them.
   - :set_id: entries
-    :name: "_entry$"
+    :name: "^lttng_ust_mpi:.*_entry$"
   - :set_id: exits
     :register: false
-    :name: "_exit$"
+    :name: "^lttng_ust_mpi:.*_exit$"
   - :set_id: exits_mpiError_present
     :domain: exits
     :payload_field_class:
@@ -52,3 +55,63 @@
       - :name: ^(datatype|origin_count)$
         :field_class:
           :cast_type: MPI_Datatype
+  # ------------------------------------------------------------------
+  # GPU-aware MPI tagging
+  #
+  # Categories below feed btx_mpiinterval_callbacks.cpp's range-map
+  # lookup. The MPI buffer categories capture the payload pointer(s)
+  # of every buffer-bearing MPI call; the ze alloc/free categories
+  # maintain a {hostname,vpid} -> address-range map. An MPI call whose
+  # buffer falls in the range-map is tagged with a leading '*' in the
+  # tally output. This is only effective when the user enables both
+  # --backend mpi and --backend ze; with only --backend mpi the
+  # range-map stays empty and no '*' ever appears.
+  # ------------------------------------------------------------------
+  # Two-buffer MPI calls (Sendrecv, Allreduce, Reduce, Bcast variants
+  # with sendbuf+recvbuf, Get_accumulate with origin_addr+result_addr,
+  # etc.).
+  - :set_id: mpi_2buf_entry
+    :domain: entries
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^(sendbuf|buf|buffer|inbuf|origin_addr|inoutbuf)$
+        :field_class:
+          :cast_type: const void \*
+      - :name: ^(recvbuf|outbuf|result_addr)$
+        :field_class:
+          :cast_type: const void \*
+  # Single-buffer MPI calls (Send, Recv, Bcast, etc.).
+  - :set_id: mpi_1buf_entry
+    :domain: entries - mpi_2buf_entry
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^(sendbuf|buf|buffer|inbuf|outbuf|inoutbuf|origin_addr|result_addr|compare_addr)$
+        :field_class:
+          :cast_type: const void \*
+  # Level Zero USM allocation / free events sourced from the ze
+  # backend (see backends/ze/btx_zematching_model.yaml for matching
+  # parents). We bind size at entry and the allocated pointer at exit,
+  # mirroring btx_zeinterval_callbacks.cpp:entries_alloc / exits_alloc.
+  - :set_id: ze_alloc_entry
+    :name: "^lttng_ust_ze:zeMemAlloc.*_entry$"
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^size$
+  - :set_id: ze_alloc_exit
+    :name: "^lttng_ust_ze:zeMemAlloc.*_exit$"
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^zeResult$
+        :field_class:
+          :cast_type: ^int$
+      - :name: ^pptr_val$
+  - :set_id: ze_free_entry
+    :name: "^lttng_ust_ze:zeMemFree_entry$"
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^ptr$

From c40a24c8758bdfaafd5862551a3f82bf01e7b69a Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 21:51:57 -0500
Subject: [PATCH 10/14] mpi: synthesise merged upstream model instead of
 metababel -u union

metababel's bare `-u model_a,model_b` upstream union fails when both
models declare the same environment field, raising:

  Match expression '^hostname$' must match only one member,
  '2' matched ["hostname", "hostname"]

Every backend model generated by utils/gen_babeltrace_model_helper.rb
declares :environment: :entries: [{name: hostname, type: string}],
so any two backend models clash. metababel itself owns the rule and
isn't ours to change.

Side-step it: generate a single merged upstream model
(btx_mpi_with_ze_model.yaml) that starts from btx_mpi_model.yaml and
copies only the ze allocation/free event classes we actually need
into MPI's stream class. One environment declaration, one stream
class, no duplicates.

  - backends/mpi/gen_btx_mpi_with_ze_model.rb: the generator.
  - backends/mpi/Makefile.am: builds the merged YAML, feeds it to
    metababel as the sole `-u` source.
---
 backends/mpi/Makefile.am                  | 22 ++++++----
 backends/mpi/gen_btx_mpi_with_ze_model.rb | 51 +++++++++++++++++++++++
 2 files changed, 66 insertions(+), 7 deletions(-)
 create mode 100644 backends/mpi/gen_btx_mpi_with_ze_model.rb

diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am
index c115cf3ff..681bed984 100644
--- a/backends/mpi/Makefile.am
+++ b/backends/mpi/Makefile.am
@@ -189,15 +189,23 @@ EXTRA_DIST += \
 	$(top_srcdir)/xprof/btx_interval_model.yaml \
 	btx_mpimatching_model.yaml
 
-# The MPI interval filter unions the ze upstream model so that
-# btx_mpiinterval_callbacks.cpp can subscribe to zeMemAlloc/zeMemFree
-# events and tag GPU-aware MPI calls via address-range lookup. This
-# creates a build-order dependency mpi -> ze; the ze backend must be
-# configured/built first when changing only the MPI backend.
+# The MPI interval filter needs to receive Level Zero allocation/free
+# events in addition to MPI events so that btx_mpiinterval_callbacks.cpp
+# can tag GPU-aware MPI calls via address-range lookup. metababel's bare
+# `-u model_a,model_b` union fails when both backend models declare the
+# same environment field (hostname), so we generate a merged upstream
+# model that copies only the ze events we need into MPI's stream class.
+# This adds a build-order dependency mpi -> ze.
 BTX_ZE_MODEL = $(top_builddir)/backends/ze/btx_ze_model.yaml
 
-$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_model.yaml $(BTX_ZE_MODEL)
-	$(METABABEL) -u btx_mpi_model.yaml,$(BTX_ZE_MODEL) -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include
+btx_mpi_with_ze_model.yaml: $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL)
+	$(RUBY) $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL) > $@
+
+CLEANFILES += btx_mpi_with_ze_model.yaml
+EXTRA_DIST += gen_btx_mpi_with_ze_model.rb
+
+$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_with_ze_model.yaml
+	$(METABABEL) -u btx_mpi_with_ze_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include
 
 CLEANFILES += \
 	$(BTX_MPI_GENERATED) \
diff --git a/backends/mpi/gen_btx_mpi_with_ze_model.rb b/backends/mpi/gen_btx_mpi_with_ze_model.rb
new file mode 100644
index 000000000..1365a593c
--- /dev/null
+++ b/backends/mpi/gen_btx_mpi_with_ze_model.rb
@@ -0,0 +1,51 @@
+#!/usr/bin/env ruby
+# Generate a merged upstream YAML model for the MPI interval filter.
+#
+# btx_filter_mpi needs to receive Level Zero allocation/free events in
+# addition to MPI events so that btx_mpiinterval_callbacks.cpp can
+# maintain an address-range map and tag GPU-aware MPI calls. metababel's
+# -u union of two backend models fails because both declare
+# :environment: :entries: [{name: hostname}], which the matching
+# engine treats as an ambiguity.
+#
+# This generator side-steps the issue by emitting a single model file
+# that:
+#   - starts from btx_mpi_model.yaml,
+#   - copies only the ze allocation/free event classes we actually
+#     need into the same stream class.
+#
+# Usage: ruby gen_btx_mpi_with_ze_model.rb MPI_MODEL.yaml ZE_MODEL.yaml > OUT.yaml
+require 'yaml'
+
+mpi_path = ARGV[0]
+ze_path  = ARGV[1]
+raise 'usage: gen_btx_mpi_with_ze_model.rb MPI_MODEL ZE_MODEL' unless mpi_path && ze_path
+
+ZE_WANTED_EVENT_NAMES = %w[
+  lttng_ust_ze:zeMemAllocHost_entry
+  lttng_ust_ze:zeMemAllocHost_exit
+  lttng_ust_ze:zeMemAllocDevice_entry
+  lttng_ust_ze:zeMemAllocDevice_exit
+  lttng_ust_ze:zeMemAllocShared_entry
+  lttng_ust_ze:zeMemAllocShared_exit
+  lttng_ust_ze:zeMemFree_entry
+  lttng_ust_ze:zeMemFree_exit
+].freeze
+
+mpi_model = YAML.load_file(mpi_path)
+ze_model  = YAML.load_file(ze_path)
+
+ze_events = ze_model.fetch(:stream_classes, []).flat_map do |sc|
+  sc.fetch(:event_classes, []).select { |ec| ZE_WANTED_EVENT_NAMES.include?(ec[:name]) }
+end
+
+if ze_events.empty?
+  warn "WARNING: no ze allocation events found in #{ze_path}; " \
+       'GPU-aware MPI tagging will be inactive.'
+end
+
+# Inject the ze events into the MPI stream class. There is only one
+# stream class per backend model (see utils/gen_babeltrace_model_helper.rb).
+mpi_model[:stream_classes].first[:event_classes].concat(ze_events)
+
+puts mpi_model.to_yaml

From c5094b395007bc7d8f2b6e1d22a6cc33da000843 Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Wed, 3 Jun 2026 21:56:40 -0500
Subject: [PATCH 11/14] mpi: relax matching-model cast_types to match upstream

Two matching failures surfaced when metababel walked the merged
upstream model:

1. MPI buffer parameters are declared with two different cast_types
   in btx_mpi_model.yaml: 'const void *' for input buffers (sendbuf,
   buf on _Send variants, ...) and plain 'void *' for output buffers
   (recvbuf, buf on _Recv variants, ...). The matching model required
   'const void *' on both, so mpi_2buf_entry matched zero events:
   'No event matched mpi_2buf_entry, at least one matching event
   required.'

   Relax to '^(const )?void \\*$' on both buffer slots.

2. zeResult is typed ze_result_t in btx_ze_model.yaml. Matching
   against '^int$' fails. Use ze_result_t directly would force
   btx_mpiinterval_callbacks.cpp to include ze_api.h (heavy build
   dep on the MPI consumer just to spell an enum).

   Drop zeResult from the match entirely; a failed allocation has
   pptr_val == NULL, which the callback already guards against.
---
 backends/mpi/btx_mpiinterval_callbacks.cpp |  5 +++--
 backends/mpi/btx_mpimatching_model.yaml    | 16 +++++++++-------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp
index efd807071..62fd83852 100644
--- a/backends/mpi/btx_mpiinterval_callbacks.cpp
+++ b/backends/mpi/btx_mpiinterval_callbacks.cpp
@@ -280,11 +280,12 @@ static void ze_alloc_exit_callback(void *btx_handle,
                                    const char *hostname,
                                    int64_t vpid,
                                    uint64_t vtid,
-                                   int zeResult,
                                    void *pptr_val) {
   auto *data = static_cast<data_t *>(usr_data);
   auto size = data->entry_state.get_data<size_t>({hostname, vpid, vtid});
-  if (zeResult != 0 || pptr_val == nullptr)
+  // We don't see zeResult in the matching model to avoid pulling in
+  // ze_api.h; a failed allocation has pptr_val == NULL.
+  if (pptr_val == nullptr)
     return;
 
   const std::string ev{event_class_name};
diff --git a/backends/mpi/btx_mpimatching_model.yaml b/backends/mpi/btx_mpimatching_model.yaml
index 7183c1d00..f8d8d7b8a 100644
--- a/backends/mpi/btx_mpimatching_model.yaml
+++ b/backends/mpi/btx_mpimatching_model.yaml
@@ -69,7 +69,9 @@
   # ------------------------------------------------------------------
   # Two-buffer MPI calls (Sendrecv, Allreduce, Reduce, Bcast variants
   # with sendbuf+recvbuf, Get_accumulate with origin_addr+result_addr,
-  # etc.).
+  # etc.). The cast_type intentionally accepts both `const void *`
+  # (input buffers) and plain `void *` (output buffers) since MPI
+  # prototypes use both.
   - :set_id: mpi_2buf_entry
     :domain: entries
     :payload_field_class:
@@ -77,10 +79,10 @@
       :members:
       - :name: ^(sendbuf|buf|buffer|inbuf|origin_addr|inoutbuf)$
         :field_class:
-          :cast_type: const void \*
+          :cast_type: ^(const )?void \*$
       - :name: ^(recvbuf|outbuf|result_addr)$
         :field_class:
-          :cast_type: const void \*
+          :cast_type: ^(const )?void \*$
   # Single-buffer MPI calls (Send, Recv, Bcast, etc.).
   - :set_id: mpi_1buf_entry
     :domain: entries - mpi_2buf_entry
@@ -89,7 +91,7 @@
       :members:
       - :name: ^(sendbuf|buf|buffer|inbuf|outbuf|inoutbuf|origin_addr|result_addr|compare_addr)$
         :field_class:
-          :cast_type: const void \*
+          :cast_type: ^(const )?void \*$
   # Level Zero USM allocation / free events sourced from the ze
   # backend (see backends/ze/btx_zematching_model.yaml for matching
   # parents). We bind size at entry and the allocated pointer at exit,
@@ -100,14 +102,14 @@
       :type: structure
       :members:
       - :name: ^size$
+  # We deliberately omit zeResult here so that the consumer callback
+  # signature does not need to depend on ze_api.h. A failed allocation
+  # has pptr_val == NULL, which the callback already guards against.
   - :set_id: ze_alloc_exit
     :name: "^lttng_ust_ze:zeMemAlloc.*_exit$"
     :payload_field_class:
       :type: structure
       :members:
-      - :name: ^zeResult$
-        :field_class:
-          :cast_type: ^int$
       - :name: ^pptr_val$
   - :set_id: ze_free_entry
     :name: "^lttng_ust_ze:zeMemFree_entry$"

From e4ca7a4f0101a668768a781f8afbabbc9b7f016a Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Thu, 4 Jun 2026 08:37:16 -0500
Subject: [PATCH 12/14] mpi: split buffer categories per MPI shape to satisfy
 metababel

metababel's matching engine requires each :name: regex to bind to
EXACTLY ONE field per matched event, so the broad regex

  ^(sendbuf|buf|buffer|inbuf|origin_addr|inoutbuf)$

fails on MPI_Reduce_local (has both inbuf and inoutbuf) with:

  Match expression '...' must match only one member,
  '2' matched ["inbuf", "inoutbuf"]

Enumerate one category per distinct MPI buffer-shape so member regexes
never overlap within an event:

  - mpi_cas_entry          : origin_addr + compare_addr + result_addr
                             (MPI_Compare_and_swap)
  - mpi_sendrecv_entry     : sendbuf + recvbuf
                             (Sendrecv, Allreduce, Reduce, Gather, ...)
  - mpi_reduce_local_entry : inbuf + inoutbuf
                             (MPI_Reduce_local)
  - mpi_rma_fetch_entry    : origin_addr + result_addr
                             (MPI_Get_accumulate, MPI_Fetch_and_op)
  - mpi_1buf_entry         : everything else with a single buffer field

Each category subtracts the prior set_ids from its domain so an event
falls into exactly one bucket. Register one consumer-side callback per
shape; all delegate to the same tag_if_gpu helper.
---
 backends/mpi/btx_mpiinterval_callbacks.cpp | 74 ++++++++++++++++++----
 backends/mpi/btx_mpimatching_model.yaml    | 66 +++++++++++++++----
 2 files changed, 118 insertions(+), 22 deletions(-)

diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp
index 62fd83852..7b00c52bf 100644
--- a/backends/mpi/btx_mpiinterval_callbacks.cpp
+++ b/backends/mpi/btx_mpiinterval_callbacks.cpp
@@ -352,20 +352,68 @@ static void mpi_1buf_entry_callback(void *btx_handle,
   tag_if_gpu(static_cast<data_t *>(usr_data), hostname, vpid, vtid, buf);
 }
 
-static void mpi_2buf_entry_callback(void *btx_handle,
-                                    void *usr_data,
-                                    int64_t ts,
-                                    const char *event_class_name,
-                                    const char *hostname,
-                                    int64_t vpid,
-                                    uint64_t vtid,
-                                    const void *sendbuf,
-                                    const void *recvbuf) {
+// MPI_Sendrecv, MPI_Allreduce, MPI_Reduce, MPI_Gather, ... — the bulk.
+static void mpi_sendrecv_entry_callback(void *btx_handle,
+                                        void *usr_data,
+                                        int64_t ts,
+                                        const char *event_class_name,
+                                        const char *hostname,
+                                        int64_t vpid,
+                                        uint64_t vtid,
+                                        const void *sendbuf,
+                                        const void *recvbuf) {
   auto *data = static_cast<data_t *>(usr_data);
   tag_if_gpu(data, hostname, vpid, vtid, sendbuf);
   tag_if_gpu(data, hostname, vpid, vtid, recvbuf);
 }
 
+// MPI_Reduce_local(inbuf, inoutbuf, ...)
+static void mpi_reduce_local_entry_callback(void *btx_handle,
+                                            void *usr_data,
+                                            int64_t ts,
+                                            const char *event_class_name,
+                                            const char *hostname,
+                                            int64_t vpid,
+                                            uint64_t vtid,
+                                            const void *inbuf,
+                                            const void *inoutbuf) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, inbuf);
+  tag_if_gpu(data, hostname, vpid, vtid, inoutbuf);
+}
+
+// MPI_Get_accumulate, MPI_Fetch_and_op (origin_addr + result_addr).
+static void mpi_rma_fetch_entry_callback(void *btx_handle,
+                                         void *usr_data,
+                                         int64_t ts,
+                                         const char *event_class_name,
+                                         const char *hostname,
+                                         int64_t vpid,
+                                         uint64_t vtid,
+                                         const void *origin_addr,
+                                         const void *result_addr) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, origin_addr);
+  tag_if_gpu(data, hostname, vpid, vtid, result_addr);
+}
+
+// MPI_Compare_and_swap(origin_addr, compare_addr, result_addr, ...)
+static void mpi_cas_entry_callback(void *btx_handle,
+                                   void *usr_data,
+                                   int64_t ts,
+                                   const char *event_class_name,
+                                   const char *hostname,
+                                   int64_t vpid,
+                                   uint64_t vtid,
+                                   const void *origin_addr,
+                                   const void *compare_addr,
+                                   const void *result_addr) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, origin_addr);
+  tag_if_gpu(data, hostname, vpid, vtid, compare_addr);
+  tag_if_gpu(data, hostname, vpid, vtid, result_addr);
+}
+
 void btx_register_usr_callbacks(void *btx_handle) {
   btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component);
   btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component);
@@ -379,10 +427,14 @@ void btx_register_usr_callbacks(void *btx_handle) {
   btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback);
 
   // GPU-aware MPI: ze alloc/free updates the range-map, MPI buffer entries
-  // look up against it.
+  // look up against it. One callback per MPI buffer-shape (see the matching
+  // model for why we cannot lump them into a single regex).
   btx_register_callbacks_ze_alloc_entry(btx_handle, &ze_alloc_entry_callback);
   btx_register_callbacks_ze_alloc_exit(btx_handle, &ze_alloc_exit_callback);
   btx_register_callbacks_ze_free_entry(btx_handle, &ze_free_entry_callback);
   btx_register_callbacks_mpi_1buf_entry(btx_handle, &mpi_1buf_entry_callback);
-  btx_register_callbacks_mpi_2buf_entry(btx_handle, &mpi_2buf_entry_callback);
+  btx_register_callbacks_mpi_sendrecv_entry(btx_handle, &mpi_sendrecv_entry_callback);
+  btx_register_callbacks_mpi_reduce_local_entry(btx_handle, &mpi_reduce_local_entry_callback);
+  btx_register_callbacks_mpi_rma_fetch_entry(btx_handle, &mpi_rma_fetch_entry_callback);
+  btx_register_callbacks_mpi_cas_entry(btx_handle, &mpi_cas_entry_callback);
 }
diff --git a/backends/mpi/btx_mpimatching_model.yaml b/backends/mpi/btx_mpimatching_model.yaml
index f8d8d7b8a..b9ca7babe 100644
--- a/backends/mpi/btx_mpimatching_model.yaml
+++ b/backends/mpi/btx_mpimatching_model.yaml
@@ -67,29 +67,73 @@
   # --backend mpi and --backend ze; with only --backend mpi the
   # range-map stays empty and no '*' ever appears.
   # ------------------------------------------------------------------
-  # Two-buffer MPI calls (Sendrecv, Allreduce, Reduce, Bcast variants
-  # with sendbuf+recvbuf, Get_accumulate with origin_addr+result_addr,
-  # etc.). The cast_type intentionally accepts both `const void *`
-  # (input buffers) and plain `void *` (output buffers) since MPI
-  # prototypes use both.
-  - :set_id: mpi_2buf_entry
+  # MPI buffer categories. metababel requires each :name: regex to bind
+  # to EXACTLY ONE field per matched event, so we cannot lump all buffer
+  # names into a single regex slot: an event like MPI_Reduce_local has
+  # both `inbuf` and `inoutbuf`, and MPI_Compare_and_swap has three
+  # buffer fields, which would over-match. Define one category per
+  # distinct MPI buffer-shape, then a catch-all for single-buffer calls.
+  # The cast_type accepts both `const void *` (input buffers) and plain
+  # `void *` (output buffers) since MPI prototypes use both.
+  #
+  # 3-buffer: MPI_Compare_and_swap
+  - :set_id: mpi_cas_entry
     :domain: entries
     :payload_field_class:
       :type: structure
       :members:
-      - :name: ^(sendbuf|buf|buffer|inbuf|origin_addr|inoutbuf)$
+      - :name: ^origin_addr$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+      - :name: ^compare_addr$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+      - :name: ^result_addr$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+  # 2-buffer Send/Recv pair (Sendrecv, Allreduce, Reduce, Gather, ...).
+  - :set_id: mpi_sendrecv_entry
+    :domain: entries - mpi_cas_entry
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^sendbuf$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+      - :name: ^recvbuf$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+  # 2-buffer Reduce_local pair (inbuf + inoutbuf).
+  - :set_id: mpi_reduce_local_entry
+    :domain: entries - mpi_cas_entry - mpi_sendrecv_entry
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^inbuf$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+      - :name: ^inoutbuf$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+  # 2-buffer RMA fetch pair (Get_accumulate, Fetch_and_op).
+  - :set_id: mpi_rma_fetch_entry
+    :domain: entries - mpi_cas_entry - mpi_sendrecv_entry - mpi_reduce_local_entry
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^origin_addr$
         :field_class:
           :cast_type: ^(const )?void \*$
-      - :name: ^(recvbuf|outbuf|result_addr)$
+      - :name: ^result_addr$
         :field_class:
           :cast_type: ^(const )?void \*$
-  # Single-buffer MPI calls (Send, Recv, Bcast, etc.).
+  # 1-buffer everything else (Send/Recv/Bcast/Put/Get/Accumulate/...).
   - :set_id: mpi_1buf_entry
-    :domain: entries - mpi_2buf_entry
+    :domain: entries - mpi_cas_entry - mpi_sendrecv_entry - mpi_reduce_local_entry - mpi_rma_fetch_entry
     :payload_field_class:
       :type: structure
       :members:
-      - :name: ^(sendbuf|buf|buffer|inbuf|outbuf|inoutbuf|origin_addr|result_addr|compare_addr)$
+      - :name: ^(sendbuf|buf|buffer|inbuf|outbuf|inoutbuf|origin_addr|result_addr)$
         :field_class:
           :cast_type: ^(const )?void \*$
   # Level Zero USM allocation / free events sourced from the ze

From babf87d10777c1b24a926d03ee66e644847ef9fb Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Thu, 4 Jun 2026 08:38:46 -0500
Subject: [PATCH 13/14] mpi: tighten mpi_1buf_entry regex to actually unique
 names

After enumerating all MPI calls and excluding the 4 multi-buffer
categories, every remaining buffer-bearing MPI call carries exactly
one of {buf, buffer, origin_addr}. Tightening the 1buf regex to
^(buf|buffer|origin_addr)$ removes the last source of in-event
ambiguity for metababel.

Pack/Unpack/Pack_external/Unpack_external are intentionally NOT
classified: they carry inbuf+outbuf but the buffers never traverse
the network, so GPU-aware tagging would be semantically meaningless.
---
 backends/mpi/btx_mpimatching_model.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/backends/mpi/btx_mpimatching_model.yaml b/backends/mpi/btx_mpimatching_model.yaml
index b9ca7babe..798268888 100644
--- a/backends/mpi/btx_mpimatching_model.yaml
+++ b/backends/mpi/btx_mpimatching_model.yaml
@@ -128,12 +128,17 @@
         :field_class:
           :cast_type: ^(const )?void \*$
   # 1-buffer everything else (Send/Recv/Bcast/Put/Get/Accumulate/...).
+  # After excluding the multi-buffer categories above, every remaining
+  # buffer-bearing MPI call carries exactly one of {buf, buffer,
+  # origin_addr}. Pack/Unpack (inbuf + outbuf) is intentionally NOT
+  # included: the buffers never traverse the network, so GPU-aware
+  # tagging is meaningless there.
   - :set_id: mpi_1buf_entry
     :domain: entries - mpi_cas_entry - mpi_sendrecv_entry - mpi_reduce_local_entry - mpi_rma_fetch_entry
     :payload_field_class:
       :type: structure
       :members:
-      - :name: ^(sendbuf|buf|buffer|inbuf|outbuf|inoutbuf|origin_addr|result_addr)$
+      - :name: ^(buf|buffer|origin_addr)$
         :field_class:
           :cast_type: ^(const )?void \*$
   # Level Zero USM allocation / free events sourced from the ze

From 1179340189ff026c027ef1df298273f47e7ad04b Mon Sep 17 00:00:00 2001
From: abagusetty <abagusetty@anl.gov>
Date: Thu, 4 Jun 2026 08:54:42 -0500
Subject: [PATCH 14/14] mpi: strip ze-specific fields from merged upstream
 model

Two issues surfaced when the consumer compiled:

1. metababel emitted callback typedefs for the full ze allocation
   events (ze_context_handle_t, ze_device_mem_alloc_desc_t *,
   ze_result_t, ...). libMPIInterval is built with only mpi.h
   visible, so btx_upstream.h failed to compile with 'unknown type
   name ze_context_handle_t' (and friends).

   Teach gen_btx_mpi_with_ze_model.rb to keep only the payload
   fields the MPI consumer actually uses (size, pptr_val, ptr).
   After stripping, every generated typedef references only
   size_t / void *, which mpi.h transitively pulls in.

2. metababel strips qualifiers when generating callback typedefs,
   so the matching model's 'const void *' becomes 'void *' in the
   generated mpi_*_entry_callback_f signatures. Drop 'const' from
   each mpi_*_entry_callback parameter so the signatures match.
   (Same fix shape as the earlier buffer_info_callback fix.)

Also replace filter_map with map.compact for Ruby <= 2.6
compatibility, even though Aurora's Ruby (3.3) supports filter_map.
---
 backends/mpi/btx_mpiinterval_callbacks.cpp | 23 ++++----
 backends/mpi/gen_btx_mpi_with_ze_model.rb  | 66 +++++++++++++++-------
 2 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp
index 7b00c52bf..38d39caf0 100644
--- a/backends/mpi/btx_mpiinterval_callbacks.cpp
+++ b/backends/mpi/btx_mpiinterval_callbacks.cpp
@@ -341,6 +341,9 @@ tag_if_gpu(data_t *data, const char *hostname, int64_t vpid, uint64_t vtid, cons
     data->gpu_aware_calls.insert({hostname, vpid, vtid});
 }
 
+// metababel strips qualifiers when generating callback typedefs, so buffer
+// parameters declared `const void *` in the matching model become `void *`
+// in the generated mpi_*_entry_callback_f typedefs. Match exactly.
 static void mpi_1buf_entry_callback(void *btx_handle,
                                     void *usr_data,
                                     int64_t ts,
@@ -348,7 +351,7 @@ static void mpi_1buf_entry_callback(void *btx_handle,
                                     const char *hostname,
                                     int64_t vpid,
                                     uint64_t vtid,
-                                    const void *buf) {
+                                    void *buf) {
   tag_if_gpu(static_cast<data_t *>(usr_data), hostname, vpid, vtid, buf);
 }
 
@@ -360,8 +363,8 @@ static void mpi_sendrecv_entry_callback(void *btx_handle,
                                         const char *hostname,
                                         int64_t vpid,
                                         uint64_t vtid,
-                                        const void *sendbuf,
-                                        const void *recvbuf) {
+                                        void *sendbuf,
+                                        void *recvbuf) {
   auto *data = static_cast<data_t *>(usr_data);
   tag_if_gpu(data, hostname, vpid, vtid, sendbuf);
   tag_if_gpu(data, hostname, vpid, vtid, recvbuf);
@@ -375,8 +378,8 @@ static void mpi_reduce_local_entry_callback(void *btx_handle,
                                             const char *hostname,
                                             int64_t vpid,
                                             uint64_t vtid,
-                                            const void *inbuf,
-                                            const void *inoutbuf) {
+                                            void *inbuf,
+                                            void *inoutbuf) {
   auto *data = static_cast<data_t *>(usr_data);
   tag_if_gpu(data, hostname, vpid, vtid, inbuf);
   tag_if_gpu(data, hostname, vpid, vtid, inoutbuf);
@@ -390,8 +393,8 @@ static void mpi_rma_fetch_entry_callback(void *btx_handle,
                                          const char *hostname,
                                          int64_t vpid,
                                          uint64_t vtid,
-                                         const void *origin_addr,
-                                         const void *result_addr) {
+                                         void *origin_addr,
+                                         void *result_addr) {
   auto *data = static_cast<data_t *>(usr_data);
   tag_if_gpu(data, hostname, vpid, vtid, origin_addr);
   tag_if_gpu(data, hostname, vpid, vtid, result_addr);
@@ -405,9 +408,9 @@ static void mpi_cas_entry_callback(void *btx_handle,
                                    const char *hostname,
                                    int64_t vpid,
                                    uint64_t vtid,
-                                   const void *origin_addr,
-                                   const void *compare_addr,
-                                   const void *result_addr) {
+                                   void *origin_addr,
+                                   void *compare_addr,
+                                   void *result_addr) {
   auto *data = static_cast<data_t *>(usr_data);
   tag_if_gpu(data, hostname, vpid, vtid, origin_addr);
   tag_if_gpu(data, hostname, vpid, vtid, compare_addr);
diff --git a/backends/mpi/gen_btx_mpi_with_ze_model.rb b/backends/mpi/gen_btx_mpi_with_ze_model.rb
index 1365a593c..e68530f0b 100644
--- a/backends/mpi/gen_btx_mpi_with_ze_model.rb
+++ b/backends/mpi/gen_btx_mpi_with_ze_model.rb
@@ -3,16 +3,26 @@
 #
 # btx_filter_mpi needs to receive Level Zero allocation/free events in
 # addition to MPI events so that btx_mpiinterval_callbacks.cpp can
-# maintain an address-range map and tag GPU-aware MPI calls. metababel's
-# -u union of two backend models fails because both declare
-# :environment: :entries: [{name: hostname}], which the matching
-# engine treats as an ambiguity.
+# maintain an address-range map and tag GPU-aware MPI calls. Two
+# constraints conspire to make this non-trivial:
 #
-# This generator side-steps the issue by emitting a single model file
-# that:
-#   - starts from btx_mpi_model.yaml,
-#   - copies only the ze allocation/free event classes we actually
-#     need into the same stream class.
+#  1. metababel's bare `-u model_a,model_b` union fails because each
+#     backend model declares :environment: :entries:[{name: hostname}],
+#     causing the matching engine to see an ambiguous member.
+#
+#  2. The MPI bt2 plugin (libMPIInterval) is built with only mpi.h on
+#     the include path; pulling in the full ze event payloads would
+#     drag in <ze_api.h> types (ze_context_handle_t, ze_result_t,
+#     ze_device_mem_alloc_desc_t *, ...) that the MPI consumer cannot
+#     resolve.
+#
+# Side-step both by emitting a single merged model that:
+#   - starts from btx_mpi_model.yaml (single :environment:),
+#   - copies only the ze allocation/free event classes we need into
+#     MPI's stream class,
+#   - keeps only the payload fields the MPI consumer actually uses,
+#     so all C types referenced in the generated upstream typedefs
+#     remain primitive (size_t / void *) and need no ze headers.
 #
 # Usage: ruby gen_btx_mpi_with_ze_model.rb MPI_MODEL.yaml ZE_MODEL.yaml > OUT.yaml
 require 'yaml'
@@ -21,22 +31,38 @@
 ze_path  = ARGV[1]
 raise 'usage: gen_btx_mpi_with_ze_model.rb MPI_MODEL ZE_MODEL' unless mpi_path && ze_path
 
-ZE_WANTED_EVENT_NAMES = %w[
-  lttng_ust_ze:zeMemAllocHost_entry
-  lttng_ust_ze:zeMemAllocHost_exit
-  lttng_ust_ze:zeMemAllocDevice_entry
-  lttng_ust_ze:zeMemAllocDevice_exit
-  lttng_ust_ze:zeMemAllocShared_entry
-  lttng_ust_ze:zeMemAllocShared_exit
-  lttng_ust_ze:zeMemFree_entry
-  lttng_ust_ze:zeMemFree_exit
-].freeze
+# (event name) -> (allow-list of payload field names the MPI consumer
+# needs to see). Fields not in the allow-list are dropped from the
+# copied event, keeping the generated upstream typedefs free of any
+# ze-specific types.
+ZE_EVENT_FIELDS = {
+  'lttng_ust_ze:zeMemAllocHost_entry'    => %w[size],
+  'lttng_ust_ze:zeMemAllocHost_exit'     => %w[pptr_val],
+  'lttng_ust_ze:zeMemAllocDevice_entry'  => %w[size],
+  'lttng_ust_ze:zeMemAllocDevice_exit'   => %w[pptr_val],
+  'lttng_ust_ze:zeMemAllocShared_entry'  => %w[size],
+  'lttng_ust_ze:zeMemAllocShared_exit'   => %w[pptr_val],
+  'lttng_ust_ze:zeMemFree_entry'         => %w[ptr],
+  'lttng_ust_ze:zeMemFree_exit'          => %w[],
+}.freeze
 
 mpi_model = YAML.load_file(mpi_path)
 ze_model  = YAML.load_file(ze_path)
 
 ze_events = ze_model.fetch(:stream_classes, []).flat_map do |sc|
-  sc.fetch(:event_classes, []).select { |ec| ZE_WANTED_EVENT_NAMES.include?(ec[:name]) }
+  sc.fetch(:event_classes, []).map do |ec|
+    wanted = ZE_EVENT_FIELDS[ec[:name]]
+    next nil unless wanted
+
+    # Deep-copy via Marshal so we can mutate freely without aliasing the
+    # original ze model.
+    ec = Marshal.load(Marshal.dump(ec))
+    if (pfc = ec[:payload_field_class]) && (members = pfc[:members])
+      pfc[:members] = members.select { |m| wanted.include?(m[:name].to_s) }
+      ec.delete(:payload_field_class) if pfc[:members].empty?
+    end
+    ec
+  end.compact
 end
 
 if ze_events.empty?