argonne-lcf · abagusetty · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am
@@ -189,8 +189,23 @@ EXTRA_DIST += \
 	$(top_srcdir)/xprof/btx_interval_model.yaml \
 	btx_mpimatching_model.yaml
 
-$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_model.yaml
-	$(METABABEL) -u btx_mpi_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include
+# The MPI interval filter needs to receive Level Zero allocation/free
+# events in addition to MPI events so that btx_mpiinterval_callbacks.cpp
+# can tag GPU-aware MPI calls via address-range lookup. metababel's bare
+# `-u model_a,model_b` union fails when both backend models declare the
+# same environment field (hostname), so we generate a merged upstream
+# model that copies only the ze events we need into MPI's stream class.
+# This adds a build-order dependency mpi -> ze.
+BTX_ZE_MODEL = $(top_builddir)/backends/ze/btx_ze_model.yaml
+
+btx_mpi_with_ze_model.yaml: $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL)
+	$(RUBY) $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL) > $@
+
+CLEANFILES += btx_mpi_with_ze_model.yaml
+EXTRA_DIST += gen_btx_mpi_with_ze_model.rb
+
+$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_with_ze_model.yaml
+	$(METABABEL) -u btx_mpi_with_ze_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include
 
 CLEANFILES += \
 	$(BTX_MPI_GENERATED) \

diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp
@@ -1,17 +1,69 @@
 #include "xprof_utils.hpp"
 #include <iostream>
+#include <map>
 #include <metababel/metababel.h>
 #include <mpi.h.include>
 #include <sstream>
 #include <string>
 #include <tuple>
+#include <unordered_set>
+
+// ---------------------------------------------------------------------------
+// GPU-aware MPI tagging
+//
+// We maintain a per-process address-range map populated from
+// lttng_ust_ze:zeMemAlloc*_exit / zeMemFree_entry events emitted by the ze
+// backend (joined into our upstream model via backends/mpi/Makefile.am).
+// Each MPI buffer-bearing entry event looks up its buffer pointer in that
+// map; if the pointer falls inside a known GPU allocation, the
+// {hostname,vpid,vtid} key is recorded in gpu_aware_calls. The matching
+// exit callback consumes the key and prepends '*' to the MPI call name
+// before pushing it to the aggregator, producing a separate '*MPI_Foo'
+// row in the tally.
+//
+// When the user enables only --backend mpi, no ze events flow in, the
+// range-map stays empty, and no '*' ever appears (CPU-only profile).
+// ---------------------------------------------------------------------------
+
+// Per-process address-range map: base -> (base + size).
+typedef std::map<uintptr_t, uintptr_t> memory_interval_t;
 
 struct data_s {
   EntryState entry_state;
+  // GPU allocation ranges, keyed on {hostname, vpid}. Mirrors the
+  // rangeset_memory_{host,device,shared} pattern in
+  // backends/ze/btx_zeinterval_callbacks.cpp.
+  std::unordered_map<hp_t, memory_interval_t> rangeset_memory_device;
+  std::unordered_map<hp_t, memory_interval_t> rangeset_memory_shared;
+  // {hostname, vpid, vtid} of MPI calls observed to carry at least one
+  // device/shared GPU buffer. Populated in mpi_*_buf_entry_callback,
+  // consumed (and cleared) in send_host_message.
+  std::unordered_set<hpt_t> gpu_aware_calls;
 };
 
 typedef struct data_s data_t;
 
+// True iff `ptr` lies inside any allocation tracked in `m`.
+static bool memory_includes(const memory_interval_t &m, uintptr_t ptr) {
+  const auto it = m.upper_bound(ptr);
+  if (it == m.cbegin())
+    return false;
+  return ptr < std::prev(it)->second;
+}
+
+// True iff `ptr` lies inside a tracked GPU (device or shared) allocation
+// for the given {hostname, vpid}. Host USM is intentionally NOT tagged
+// as GPU-aware here (it behaves like host memory from MPI's perspective).
+static bool is_gpu_pointer(const data_t *data, const hp_t &hp, uintptr_t ptr) {
+  auto itd = data->rangeset_memory_device.find(hp);
+  if (itd != data->rangeset_memory_device.end() && memory_includes(itd->second, ptr))
+    return true;
+  auto its = data->rangeset_memory_shared.find(hp);
+  if (its != data->rangeset_memory_shared.end() && memory_includes(its->second, ptr))
+    return true;
+  return false;
+}
+
 static void send_host_message(void *btx_handle,
                               void *usr_data,
                               int64_t ts,
@@ -21,9 +73,16 @@ static void send_host_message(void *btx_handle,
                               uint64_t vtid,
                               bool err) {
 
+  auto *data = static_cast<data_t *>(usr_data);
+  const hpt_t hpt{hostname, vpid, vtid};
+
   std::string event_class_name_striped = strip_event_class_name_exit(event_class_name);
-  const int64_t entry_ts =
-      static_cast<data_t *>(usr_data)->entry_state.get_ts({hostname, vpid, vtid});
+  // GPU-aware MPI calls get a leading '*' so they sort and aggregate as a
+  // distinct row from their CPU-only counterparts in the tally output.
+  if (data->gpu_aware_calls.erase(hpt) > 0)
+    event_class_name_striped.insert(0, 1, '*');
+
+  const int64_t entry_ts = data->entry_state.get_ts(hpt);
 
   btx_push_message_lttng_host(btx_handle, hostname, vpid, vtid, entry_ts, BACKEND_MPI,
                               event_class_name_striped.c_str(), (ts - entry_ts), err);
@@ -198,6 +257,166 @@ static void traffic_int_entry_callback(void *btx_handle,
                                  count * size, oss.str().c_str());
 }
 
+// --- ze allocation / free tracking ----------------------------------------
+// Bind allocation size at entry; the matching exit callback uses it together
+// with the freshly-returned pointer to populate the range-map. The
+// event_class_name suffix (Device vs Shared vs Host) selects which rangeset
+// the allocation lands in.
+static void ze_alloc_entry_callback(void *btx_handle,
+                                    void *usr_data,
+                                    int64_t ts,
+                                    const char *event_class_name,
+                                    const char *hostname,
+                                    int64_t vpid,
+                                    uint64_t vtid,
+                                    size_t size) {
+  static_cast<data_t *>(usr_data)->entry_state.set_data({hostname, vpid, vtid}, size);
+}
+
+static void ze_alloc_exit_callback(void *btx_handle,
+                                   void *usr_data,
+                                   int64_t ts,
+                                   const char *event_class_name,
+                                   const char *hostname,
+                                   int64_t vpid,
+                                   uint64_t vtid,
+                                   void *pptr_val) {
+  auto *data = static_cast<data_t *>(usr_data);
+  auto size = data->entry_state.get_data<size_t>({hostname, vpid, vtid});
+  // We don't see zeResult in the matching model to avoid pulling in
+  // ze_api.h; a failed allocation has pptr_val == NULL.
+  if (pptr_val == nullptr)
+    return;
+
+  const std::string ev{event_class_name};
+  // Host USM is reachable from the host CPU so MPI sees it as a normal host
+  // pointer; classifying it as GPU-aware would produce false positives.
+  // Device and Shared (managed) USM live (at least partly) on the device,
+  // so MPI calls on them are GPU-aware.
+  std::unordered_map<hp_t, memory_interval_t> *mi = nullptr;
+  if (ev.find("zeMemAllocDevice") != std::string::npos)
+    mi = &data->rangeset_memory_device;
+  else if (ev.find("zeMemAllocShared") != std::string::npos)
+    mi = &data->rangeset_memory_shared;
+  else
+    return; // host USM, ignored
+
+  const uintptr_t base = reinterpret_cast<uintptr_t>(pptr_val);
+  (*mi)[{hostname, vpid}][base] = base + size;
+}
+
+static void ze_free_entry_callback(void *btx_handle,
+                                   void *usr_data,
+                                   int64_t ts,
+                                   const char *event_class_name,
+                                   const char *hostname,
+                                   int64_t vpid,
+                                   uint64_t vtid,
+                                   void *ptr) {
+  auto *data = static_cast<data_t *>(usr_data);
+  if (ptr == nullptr)
+    return;
+  const uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
+  const hp_t hp{hostname, vpid};
+  // The freed pointer may have been classified as device OR shared at
+  // allocation; try both.
+  auto itd = data->rangeset_memory_device.find(hp);
+  if (itd != data->rangeset_memory_device.end())
+    itd->second.erase(base);
+  auto its = data->rangeset_memory_shared.find(hp);
+  if (its != data->rangeset_memory_shared.end())
+    its->second.erase(base);
+}
+
+// --- MPI buffer classification --------------------------------------------
+static void
+tag_if_gpu(data_t *data, const char *hostname, int64_t vpid, uint64_t vtid, const void *ptr) {
+  if (ptr == nullptr)
+    return;
+  // MPICH sentinels: MPI_IN_PLACE / MPI_BOTTOM. These would never match a
+  // real allocation but checking explicitly keeps the intent clear.
+  if (ptr == MPI_IN_PLACE || ptr == MPI_BOTTOM)
+    return;
+  if (is_gpu_pointer(data, {hostname, vpid}, reinterpret_cast<uintptr_t>(ptr)))
+    data->gpu_aware_calls.insert({hostname, vpid, vtid});
+}
+
+// metababel strips qualifiers when generating callback typedefs, so buffer
+// parameters declared `const void *` in the matching model become `void *`
+// in the generated mpi_*_entry_callback_f typedefs. Match exactly.
+static void mpi_1buf_entry_callback(void *btx_handle,
+                                    void *usr_data,
+                                    int64_t ts,
+                                    const char *event_class_name,
+                                    const char *hostname,
+                                    int64_t vpid,
+                                    uint64_t vtid,
+                                    void *buf) {
+  tag_if_gpu(static_cast<data_t *>(usr_data), hostname, vpid, vtid, buf);
+}
+
+// MPI_Sendrecv, MPI_Allreduce, MPI_Reduce, MPI_Gather, ... — the bulk.
+static void mpi_sendrecv_entry_callback(void *btx_handle,
+                                        void *usr_data,
+                                        int64_t ts,
+                                        const char *event_class_name,
+                                        const char *hostname,
+                                        int64_t vpid,
+                                        uint64_t vtid,
+                                        void *sendbuf,
+                                        void *recvbuf) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, sendbuf);
+  tag_if_gpu(data, hostname, vpid, vtid, recvbuf);
+}
+
+// MPI_Reduce_local(inbuf, inoutbuf, ...)
+static void mpi_reduce_local_entry_callback(void *btx_handle,
+                                            void *usr_data,
+                                            int64_t ts,
+                                            const char *event_class_name,
+                                            const char *hostname,
+                                            int64_t vpid,
+                                            uint64_t vtid,
+                                            void *inbuf,
+                                            void *inoutbuf) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, inbuf);
+  tag_if_gpu(data, hostname, vpid, vtid, inoutbuf);
+}
+
+// MPI_Get_accumulate, MPI_Fetch_and_op (origin_addr + result_addr).
+static void mpi_rma_fetch_entry_callback(void *btx_handle,
+                                         void *usr_data,
+                                         int64_t ts,
+                                         const char *event_class_name,
+                                         const char *hostname,
+                                         int64_t vpid,
+                                         uint64_t vtid,
+                                         void *origin_addr,
+                                         void *result_addr) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, origin_addr);
+  tag_if_gpu(data, hostname, vpid, vtid, result_addr);
+}
+
+// MPI_Compare_and_swap(origin_addr, compare_addr, result_addr, ...)
+static void mpi_cas_entry_callback(void *btx_handle,
+                                   void *usr_data,
+                                   int64_t ts,
+                                   const char *event_class_name,
+                                   const char *hostname,
+                                   int64_t vpid,
+                                   uint64_t vtid,
+                                   void *origin_addr,
+                                   void *compare_addr,
+                                   void *result_addr) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, origin_addr);
+  tag_if_gpu(data, hostname, vpid, vtid, compare_addr);
+  tag_if_gpu(data, hostname, vpid, vtid, result_addr);
+}
+
 void btx_register_usr_callbacks(void *btx_handle) {
   btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component);
   btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component);
@@ -209,4 +428,16 @@ void btx_register_usr_callbacks(void *btx_handle) {
   btx_register_callbacks_traffic_int_entry(btx_handle, &traffic_int_entry_callback);
 
   btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback);
+
+  // GPU-aware MPI: ze alloc/free updates the range-map, MPI buffer entries
+  // look up against it. One callback per MPI buffer-shape (see the matching
+  // model for why we cannot lump them into a single regex).
+  btx_register_callbacks_ze_alloc_entry(btx_handle, &ze_alloc_entry_callback);
+  btx_register_callbacks_ze_alloc_exit(btx_handle, &ze_alloc_exit_callback);
+  btx_register_callbacks_ze_free_entry(btx_handle, &ze_free_entry_callback);
+  btx_register_callbacks_mpi_1buf_entry(btx_handle, &mpi_1buf_entry_callback);
+  btx_register_callbacks_mpi_sendrecv_entry(btx_handle, &mpi_sendrecv_entry_callback);
+  btx_register_callbacks_mpi_reduce_local_entry(btx_handle, &mpi_reduce_local_entry_callback);
+  btx_register_callbacks_mpi_rma_fetch_entry(btx_handle, &mpi_rma_fetch_entry_callback);
+  btx_register_callbacks_mpi_cas_entry(btx_handle, &mpi_cas_entry_callback);
 }