From 125a0d1e21107009a7222acdaa98541db38177d6 Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 12:51:19 -0500 Subject: [PATCH 01/14] mpi: tag GPU-aware MPI calls with leading '*' in profile Introspect L0/CUDA/HIP pointers at runtime (via dlopen, no build deps) for every buffer-bearing MPI call and emit a buffer_info tracepoint. The interval consumer prefixes the call name with '*' when any buffer resides in device or shared/managed memory. --- backends/mpi/Makefile.am | 3 +- backends/mpi/btx_mpiinterval_callbacks.cpp | 49 ++- backends/mpi/gen_mpi.rb | 6 + backends/mpi/mpi.h.include | 3 +- backends/mpi/mpi_events.yaml | 23 ++ backends/mpi/mpi_model.rb | 39 +++ backends/mpi/tracer_mpi.sh.in | 2 + backends/mpi/tracer_mpi_helpers.include.c | 384 +++++++++++++++++++++ 8 files changed, 505 insertions(+), 4 deletions(-) diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am index f3d0ea28c..13400366e 100644 --- a/backends/mpi/Makefile.am +++ b/backends/mpi/Makefile.am @@ -62,7 +62,8 @@ MPI_PROBES_INCL = $(MPI_PROBES:=.h) MPI_PROBES_SRC = $(MPI_PROBES:=.c) MPI_STATIC_PROBES = \ - mpi_type + mpi_type \ + mpi_properties MPI_STATIC_PROBES_TP = $(MPI_STATIC_PROBES:=.tp) MPI_STATIC_PROBES_INCL = $(MPI_STATIC_PROBES:=.h) diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp index 65f1cedd2..1e29ed0ef 100644 --- a/backends/mpi/btx_mpiinterval_callbacks.cpp +++ b/backends/mpi/btx_mpiinterval_callbacks.cpp @@ -5,9 +5,22 @@ #include #include #include +#include + +// Must match the enum in backends/mpi/tracer_mpi_helpers.include.c. +enum { + THAPI_MPI_PTR_UNKNOWN = 0, + THAPI_MPI_PTR_HOST = 1, + THAPI_MPI_PTR_DEVICE = 2, + THAPI_MPI_PTR_SHARED = 3 +}; struct data_s { EntryState entry_state; + // Set of {hostname, vpid, vtid} for which the in-flight MPI call has been + // observed to carry at least one device/shared GPU buffer. Populated by + // buffer_info_callback, consumed (and cleared) by send_host_message. + std::unordered_set gpu_aware_calls; }; typedef struct data_s data_t; @@ -21,9 +34,16 @@ static void send_host_message(void *btx_handle, uint64_t vtid, bool err) { + auto *data = static_cast(usr_data); + const hpt_t hpt{hostname, vpid, vtid}; + std::string event_class_name_striped = strip_event_class_name_exit(event_class_name); - const int64_t entry_ts = - static_cast(usr_data)->entry_state.get_ts({hostname, vpid, vtid}); + // GPU-aware MPI calls are prefixed with '*' so they stand out in the + // profile. The flag is set by buffer_info_callback during this call. + if (data->gpu_aware_calls.erase(hpt) > 0) + event_class_name_striped.insert(0, 1, '*'); + + const int64_t entry_ts = data->entry_state.get_ts(hpt); btx_push_message_lttng_host(btx_handle, hostname, vpid, vtid, entry_ts, BACKEND_MPI, event_class_name_striped.c_str(), (ts - entry_ts), err); @@ -149,6 +169,29 @@ static void type_property_callback(void *btx_handle, mpi_datatype_info[(uint64_t)datatype] = {std::to_string((uint64_t)datatype), size}; } +// Emitted from the prologue of every buffer-bearing MPI call, once per +// buffer argument. We only care whether at least one buffer is on the GPU +// (device or shared/managed); that's enough to tag the call as GPU-aware. +static void buffer_info_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *hostname, + int64_t vpid, + uint64_t vtid, + uintptr_t ptr, + int role, + int kind, + int backend, + uintptr_t base, + uint64_t size, + uint32_t device_ordinal) { + + if (kind == THAPI_MPI_PTR_DEVICE || kind == THAPI_MPI_PTR_SHARED) { + auto *data = static_cast(usr_data); + data->gpu_aware_calls.insert({hostname, vpid, vtid}); + } +} + static void traffic_MPI_Count_entry_callback(void *btx_handle, void *usr_data, int64_t ts, @@ -209,4 +252,6 @@ void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_traffic_int_entry(btx_handle, &traffic_int_entry_callback); btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback); + + btx_register_callbacks_lttng_ust_mpi_properties_buffer_info(btx_handle, &buffer_info_callback); } diff --git a/backends/mpi/gen_mpi.rb b/backends/mpi/gen_mpi.rb index bd939a9bb..3c0e349cd 100644 --- a/backends/mpi/gen_mpi.rb +++ b/backends/mpi/gen_mpi.rb @@ -19,6 +19,10 @@ def common_block(c, provider) tracepoint_params = c.tracepoint_parameters.reject { |p| p.after? }.collect(&:name) puts " tracepoint(#{provider}, #{c.name}_#{START}, #{(tp_params + tracepoint_params).join(', ')});" + c.prologues.each do |p| + puts p + end + params = c.parameters.collect(&:name) if c.has_return_type? puts " #{c.type} _retval;" @@ -75,10 +79,12 @@ def define_and_find_mpi_symbols puts <<~EOF #include + #include #define MPICH_FORTRAN_SYMBOLS_NONABI #include #include "mpi_tracepoints.h" #include "mpi_type.h" + #include "mpi_properties.h" #include #include EOF diff --git a/backends/mpi/mpi.h.include b/backends/mpi/mpi.h.include index aed167967..4fe43271b 100644 --- a/backends/mpi/mpi.h.include +++ b/backends/mpi/mpi.h.include @@ -1,2 +1,3 @@ #include - +#include +#include diff --git a/backends/mpi/mpi_events.yaml b/backends/mpi/mpi_events.yaml index 48c6dcc96..8a8060873 100644 --- a/backends/mpi/mpi_events.yaml +++ b/backends/mpi/mpi_events.yaml @@ -9,3 +9,26 @@ lttng_ust_mpi_type: - [ ctf_integer_hex, uintptr_t, datatype, "(uintptr_t)datatype" ] - [ ctf_integer, int, size, size ] +lttng_ust_mpi_properties: + events: + # Emitted once per buffer argument of a buffer-bearing MPI call. + # `kind` : 0 unknown, 1 host, 2 device, 3 shared/managed + # `backend` : 0 none, 1 level-zero, 2 cuda, 3 hip + # `role` : 0 input (send-like), 1 output (recv-like) + - name: buffer_info + args: + - [ const void *, ptr ] + - [ int, role ] + - [ int, kind ] + - [ int, backend ] + - [ uintptr_t, base ] + - [ uint64_t, size ] + - [ uint32_t, device_ordinal ] + fields: + - [ ctf_integer_hex, uintptr_t, ptr, "(uintptr_t)ptr" ] + - [ ctf_integer, int, role, role ] + - [ ctf_integer, int, kind, kind ] + - [ ctf_integer, int, backend, backend ] + - [ ctf_integer_hex, uintptr_t, base, base ] + - [ ctf_integer, uint64_t, size, size ] + - [ ctf_integer, uint32_t, device_ordinal, device_ordinal ] diff --git a/backends/mpi/mpi_model.rb b/backends/mpi/mpi_model.rb index c8f3cc56c..6084c4084 100644 --- a/backends/mpi/mpi_model.rb +++ b/backends/mpi/mpi_model.rb @@ -89,3 +89,42 @@ def underscore(camel_cased_word) if (tracepoint_enabled(lttng_ust_mpi_type, property))#{' '} tracepoint(lttng_ust_mpi_type, property, *datatype, size); EOF + +# --------------------------------------------------------------------------- +# GPU-aware MPI introspection +# +# For every MPI call that carries a user payload buffer, register a prologue +# that emits an `lttng_ust_mpi_properties:buffer_info` event per buffer. The +# event records whether the pointer is host / device / shared memory and which +# GPU runtime (Level Zero, CUDA, HIP) owns it. Consumers (extract/, +# btx_mpiinterval_callbacks.cpp) use that to tag each MPI call as "GPU-aware" +# or "CPU-only". +# +# The set of buffer parameters is derived from mpi_meta_parameters.yaml so +# new MPI APIs picked up by codegen are classified automatically: any +# InScalar/OutScalar parameter whose name matches one of the canonical MPI +# payload-buffer names below is treated as a user buffer. +# --------------------------------------------------------------------------- +MPI_PAYLOAD_BUFFER_NAMES = %w[ + buf buffer sendbuf recvbuf inbuf outbuf inoutbuf + origin_addr result_addr compare_addr +].freeze + +$mpi_meta_parameters.fetch('meta_parameters', []).each do |func, list| + buffers = list.select do |type, name| + %w[InScalar OutScalar].include?(type) && + MPI_PAYLOAD_BUFFER_NAMES.include?(name) + end + next if buffers.empty? + + body = buffers.map do |type, name| + role = type == 'InScalar' ? 'THAPI_MPI_ROLE_IN' : 'THAPI_MPI_ROLE_OUT' + " _dump_buffer_info((const void *)#{name}, #{role});" + end.join("\n") + + register_prologue func, <<~EOF + if (tracepoint_enabled(lttng_ust_mpi_properties, buffer_info)) { + #{body} + } + EOF +end diff --git a/backends/mpi/tracer_mpi.sh.in b/backends/mpi/tracer_mpi.sh.in index 55343f6ca..e841ad94b 100644 --- a/backends/mpi/tracer_mpi.sh.in +++ b/backends/mpi/tracer_mpi.sh.in @@ -81,6 +81,8 @@ fi lttng enable-channel --userspace --blocking-timeout=inf blocking-channel lttng add-context --userspace --channel=blocking-channel -t vpid -t vtid lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi:* +lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi_type:* +lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi_properties:* if [ -z "$LTTNG_UST_MPI_LIBMPI" ]; then LTTNG_UST_MPI_LIBMPI=$(whichlib64_head libmpi.so) if [ -n "$LTTNG_UST_MPI_LIBMPI" ]; then diff --git a/backends/mpi/tracer_mpi_helpers.include.c b/backends/mpi/tracer_mpi_helpers.include.c index a432d2183..874bd7c22 100644 --- a/backends/mpi/tracer_mpi_helpers.include.c +++ b/backends/mpi/tracer_mpi_helpers.include.c @@ -2,6 +2,389 @@ static pthread_once_t _init = PTHREAD_ONCE_INIT; static __thread volatile int in_init = 0; static volatile unsigned int _initialized = 0; +/* --------------------------------------------------------------------------- + * GPU pointer introspection + * + * Goal: when an MPI call carries a buffer pointer, decide at runtime whether + * the pointer refers to host memory or to a GPU allocation, so that downstream + * tools can tag the call as "GPU-aware MPI" vs "CPU-only MPI". + * + * Design constraints (matching the existing MPI backend): + * - No build-time dependency on Level Zero / CUDA / HIP. Each loader is + * dlopen'd lazily, and only the symbols we need are looked up via dlsym + * using locally-declared function-pointer signatures over opaque types. + * - All overhead is gated by tracepoint_enabled() and a small per-thread + * cache keyed on the allocation base address, so a buffer reused across + * iterations is classified only once. + * ------------------------------------------------------------------------- */ + +/* Values reported in the buffer_info tracepoint. Keep in sync with consumers + * (extract/, btx_mpiinterval_callbacks.cpp). */ +enum thapi_mpi_ptr_kind { + THAPI_MPI_PTR_UNKNOWN = 0, + THAPI_MPI_PTR_HOST = 1, + THAPI_MPI_PTR_DEVICE = 2, + THAPI_MPI_PTR_SHARED = 3 +}; + +enum thapi_mpi_ptr_backend { + THAPI_MPI_BACKEND_NONE = 0, + THAPI_MPI_BACKEND_L0 = 1, + THAPI_MPI_BACKEND_CUDA = 2, + THAPI_MPI_BACKEND_HIP = 3 +}; + +enum thapi_mpi_buffer_role { + THAPI_MPI_ROLE_IN = 0, + THAPI_MPI_ROLE_OUT = 1 +}; + +/* --- Level Zero ----------------------------------------------------------- */ +/* Opaque handle types and the minimal enums/structs needed to call + * zeMemGetAllocProperties / zeMemGetAddressRange without including ze_api.h. */ +typedef int _thapi_ze_result_t; /* ze_result_t */ +typedef struct _thapi_ze_driver *_thapi_ze_driver_handle_t; +typedef struct _thapi_ze_context *_thapi_ze_context_handle_t; +typedef struct _thapi_ze_device *_thapi_ze_device_handle_t; + +#define _THAPI_ZE_RESULT_SUCCESS 0 +#define _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES 0x000c +#define _THAPI_ZE_INIT_FLAG_GPU_ONLY 0x01 + +typedef enum { + _THAPI_ZE_MEMORY_TYPE_UNKNOWN = 0, + _THAPI_ZE_MEMORY_TYPE_HOST = 1, + _THAPI_ZE_MEMORY_TYPE_DEVICE = 2, + _THAPI_ZE_MEMORY_TYPE_SHARED = 3 +} _thapi_ze_memory_type_t; + +typedef struct { + uint32_t stype; + const void *pNext; + _thapi_ze_memory_type_t type; + uint64_t id; + uint64_t pageSize; +} _thapi_ze_memory_allocation_properties_t; + +typedef _thapi_ze_result_t (*_thapi_ze_init_fn_t)(uint32_t flags); +typedef _thapi_ze_result_t (*_thapi_ze_driver_get_fn_t)(uint32_t *pCount, + _thapi_ze_driver_handle_t *phDrivers); +typedef _thapi_ze_result_t (*_thapi_ze_context_create_fn_t)(_thapi_ze_driver_handle_t hDriver, + const void *desc, + _thapi_ze_context_handle_t *phContext); +typedef _thapi_ze_result_t (*_thapi_ze_mem_get_alloc_properties_fn_t)( + _thapi_ze_context_handle_t hContext, const void *ptr, + _thapi_ze_memory_allocation_properties_t *pProps, + _thapi_ze_device_handle_t *phDevice); +typedef _thapi_ze_result_t (*_thapi_ze_mem_get_address_range_fn_t)( + _thapi_ze_context_handle_t hContext, const void *ptr, + void **pBase, size_t *pSize); + +static _thapi_ze_mem_get_alloc_properties_fn_t _ze_mem_get_alloc_properties = NULL; +static _thapi_ze_mem_get_address_range_fn_t _ze_mem_get_address_range = NULL; +static _thapi_ze_context_handle_t _ze_introspection_ctx = NULL; + +/* --- CUDA ----------------------------------------------------------------- */ +/* Driver API (libcuda.so) is preferred: it's the same on every CUDA install + * and doesn't drag in libcudart. */ +typedef int _thapi_cu_result_t; /* CUresult */ +typedef unsigned int _thapi_cu_attr_t; /* CUpointer_attribute */ +typedef unsigned long long _thapi_cu_deviceptr_t; /* CUdeviceptr */ + +#define _THAPI_CU_SUCCESS 0 +#define _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE 2 +#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START 7 +#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE 8 +#define _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL 9 +#define _THAPI_CU_MEMORYTYPE_HOST 1 +#define _THAPI_CU_MEMORYTYPE_DEVICE 2 +#define _THAPI_CU_MEMORYTYPE_ARRAY 3 +#define _THAPI_CU_MEMORYTYPE_UNIFIED 4 + +typedef _thapi_cu_result_t (*_thapi_cu_init_fn_t)(unsigned int flags); +typedef _thapi_cu_result_t (*_thapi_cu_pointer_get_attribute_fn_t)(void *data, + _thapi_cu_attr_t attr, + _thapi_cu_deviceptr_t ptr); + +static _thapi_cu_pointer_get_attribute_fn_t _cu_pointer_get_attribute = NULL; + +/* --- HIP ------------------------------------------------------------------ */ +typedef int _thapi_hip_error_t; /* hipError_t */ + +#define _THAPI_HIP_SUCCESS 0 +#define _THAPI_HIP_MEMORY_TYPE_HOST 0 +#define _THAPI_HIP_MEMORY_TYPE_DEVICE 1 +#define _THAPI_HIP_MEMORY_TYPE_ARRAY 2 +#define _THAPI_HIP_MEMORY_TYPE_UNIFIED 3 +#define _THAPI_HIP_MEMORY_TYPE_MANAGED 4 + +typedef struct { + unsigned int type; + int device; + void *devicePointer; + void *hostPointer; + int isManaged; + unsigned int allocationFlags; +} _thapi_hip_pointer_attribute_t; + +typedef _thapi_hip_error_t (*_thapi_hip_pointer_get_attributes_fn_t)( + _thapi_hip_pointer_attribute_t *attr, const void *ptr); +typedef _thapi_hip_error_t (*_thapi_hip_mem_get_address_range_fn_t)( + void **pbase, size_t *psize, void *dptr); + +static _thapi_hip_pointer_get_attributes_fn_t _hip_pointer_get_attributes = NULL; +static _thapi_hip_mem_get_address_range_fn_t _hip_mem_get_address_range = NULL; + +/* --- Per-thread address-range cache --------------------------------------- */ +/* Tiny LRU keyed on allocation base address. Buffers reused across MPI + * iterations (the common case in HPC) hit the cache and skip the runtime + * query entirely. Sized small enough to live cheaply per-thread. */ +#define _THAPI_MPI_CACHE_SLOTS 16 + +struct _thapi_mpi_cache_entry { + uintptr_t base; + size_t size; + uint8_t kind; + uint8_t backend; + uint32_t device_ordinal; + uint8_t valid; +}; + +static __thread struct _thapi_mpi_cache_entry + _thapi_mpi_cache[_THAPI_MPI_CACHE_SLOTS]; +static __thread unsigned int _thapi_mpi_cache_next = 0; + +static inline int _thapi_mpi_cache_lookup(const void *ptr, + struct _thapi_mpi_cache_entry *out) { + uintptr_t p = (uintptr_t)ptr; + for (unsigned int i = 0; i < _THAPI_MPI_CACHE_SLOTS; i++) { + struct _thapi_mpi_cache_entry *e = &_thapi_mpi_cache[i]; + if (e->valid && p >= e->base && p < e->base + e->size) { + *out = *e; + return 1; + } + } + return 0; +} + +static inline void _thapi_mpi_cache_insert(const struct _thapi_mpi_cache_entry *e) { + _thapi_mpi_cache[_thapi_mpi_cache_next] = *e; + _thapi_mpi_cache_next = + (_thapi_mpi_cache_next + 1) % _THAPI_MPI_CACHE_SLOTS; +} + +/* --- Runtime probing ------------------------------------------------------ */ +static int _try_classify_ze(const void *ptr, + struct _thapi_mpi_cache_entry *out) { + if (!_ze_mem_get_alloc_properties || !_ze_introspection_ctx) + return 0; + + _thapi_ze_memory_allocation_properties_t props; + props.stype = _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES; + props.pNext = NULL; + props.type = _THAPI_ZE_MEMORY_TYPE_UNKNOWN; + _thapi_ze_device_handle_t hDev = NULL; + + if (_ze_mem_get_alloc_properties(_ze_introspection_ctx, ptr, &props, &hDev) + != _THAPI_ZE_RESULT_SUCCESS) + return 0; + if (props.type == _THAPI_ZE_MEMORY_TYPE_UNKNOWN) + return 0; + + void *base = NULL; + size_t size = 0; + if (_ze_mem_get_address_range && + _ze_mem_get_address_range(_ze_introspection_ctx, ptr, &base, &size) + != _THAPI_ZE_RESULT_SUCCESS) { + base = (void *)ptr; + size = 1; + } + + out->base = (uintptr_t)base; + out->size = size ? size : 1; + out->backend = THAPI_MPI_BACKEND_L0; + out->device_ordinal = 0; /* L0 device ordinal lookup would need extra calls */ + switch (props.type) { + case _THAPI_ZE_MEMORY_TYPE_HOST: out->kind = THAPI_MPI_PTR_HOST; break; + case _THAPI_ZE_MEMORY_TYPE_DEVICE: out->kind = THAPI_MPI_PTR_DEVICE; break; + case _THAPI_ZE_MEMORY_TYPE_SHARED: out->kind = THAPI_MPI_PTR_SHARED; break; + default: out->kind = THAPI_MPI_PTR_UNKNOWN; break; + } + out->valid = 1; + return 1; +} + +static int _try_classify_cuda(const void *ptr, + struct _thapi_mpi_cache_entry *out) { + if (!_cu_pointer_get_attribute) + return 0; + + unsigned int mem_type = 0; + _thapi_cu_deviceptr_t range_start = 0; + size_t range_size = 0; + int device_ordinal = 0; + _thapi_cu_deviceptr_t dptr = (_thapi_cu_deviceptr_t)(uintptr_t)ptr; + + if (_cu_pointer_get_attribute(&mem_type, + _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dptr) + != _THAPI_CU_SUCCESS || mem_type == 0) + return 0; + + (void)_cu_pointer_get_attribute(&range_start, + _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START, dptr); + (void)_cu_pointer_get_attribute(&range_size, + _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE, dptr); + (void)_cu_pointer_get_attribute(&device_ordinal, + _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dptr); + + out->base = range_start ? (uintptr_t)range_start : (uintptr_t)ptr; + out->size = range_size ? range_size : 1; + out->backend = THAPI_MPI_BACKEND_CUDA; + out->device_ordinal = (uint32_t)device_ordinal; + switch (mem_type) { + case _THAPI_CU_MEMORYTYPE_HOST: out->kind = THAPI_MPI_PTR_HOST; break; + case _THAPI_CU_MEMORYTYPE_DEVICE: out->kind = THAPI_MPI_PTR_DEVICE; break; + case _THAPI_CU_MEMORYTYPE_UNIFIED: out->kind = THAPI_MPI_PTR_SHARED; break; + default: out->kind = THAPI_MPI_PTR_UNKNOWN; break; + } + out->valid = 1; + return 1; +} + +static int _try_classify_hip(const void *ptr, + struct _thapi_mpi_cache_entry *out) { + if (!_hip_pointer_get_attributes) + return 0; + + _thapi_hip_pointer_attribute_t attr; + memset(&attr, 0, sizeof(attr)); + if (_hip_pointer_get_attributes(&attr, ptr) != _THAPI_HIP_SUCCESS) + return 0; + + void *base = (void *)ptr; + size_t size = 1; + if (_hip_mem_get_address_range) + (void)_hip_mem_get_address_range(&base, &size, (void *)ptr); + + out->base = (uintptr_t)base; + out->size = size ? size : 1; + out->backend = THAPI_MPI_BACKEND_HIP; + out->device_ordinal = (uint32_t)attr.device; + if (attr.isManaged || attr.type == _THAPI_HIP_MEMORY_TYPE_UNIFIED || + attr.type == _THAPI_HIP_MEMORY_TYPE_MANAGED) + out->kind = THAPI_MPI_PTR_SHARED; + else if (attr.type == _THAPI_HIP_MEMORY_TYPE_DEVICE) + out->kind = THAPI_MPI_PTR_DEVICE; + else if (attr.type == _THAPI_HIP_MEMORY_TYPE_HOST) + out->kind = THAPI_MPI_PTR_HOST; + else + out->kind = THAPI_MPI_PTR_UNKNOWN; + out->valid = 1; + return 1; +} + +static inline void _dump_buffer_info(const void *ptr, int role) { + if (!ptr) + return; + /* MPI_IN_PLACE / MPI_BOTTOM are sentinel values, not real pointers. */ + if (ptr == MPI_IN_PLACE || ptr == MPI_BOTTOM) + return; + if (!tracepoint_enabled(lttng_ust_mpi_properties, buffer_info)) + return; + + struct _thapi_mpi_cache_entry e; + if (!_thapi_mpi_cache_lookup(ptr, &e)) { + e.valid = 0; + if (!_try_classify_ze(ptr, &e) && + !_try_classify_cuda(ptr, &e) && + !_try_classify_hip(ptr, &e)) { + /* Nobody claims it: treat as host. Cache a 1-byte range so we still + * answer in O(1) on the next call, but don't pollute the cache with + * adjacent host allocations. */ + e.base = (uintptr_t)ptr; + e.size = 1; + e.kind = THAPI_MPI_PTR_HOST; + e.backend = THAPI_MPI_BACKEND_NONE; + e.device_ordinal = 0; + e.valid = 1; + } + _thapi_mpi_cache_insert(&e); + } + + do_tracepoint(lttng_ust_mpi_properties, buffer_info, + ptr, role, (int)e.kind, (int)e.backend, + e.base, (uint64_t)e.size, e.device_ordinal); +} + +/* --- Loader -------------------------------------------------------------- */ +static void _load_gpu_introspection(int verbose) { + /* Each loader is optional: if the runtime isn't present on the system, the + * corresponding classifier stays NULL and is silently skipped. */ + void *h_ze = dlopen("libze_loader.so.1", RTLD_LAZY | RTLD_LOCAL); + if (!h_ze) + h_ze = dlopen("libze_loader.so", RTLD_LAZY | RTLD_LOCAL); + if (h_ze) { + _thapi_ze_init_fn_t ze_init = (_thapi_ze_init_fn_t) + dlsym(h_ze, "zeInit"); + _thapi_ze_driver_get_fn_t ze_driver = (_thapi_ze_driver_get_fn_t) + dlsym(h_ze, "zeDriverGet"); + _thapi_ze_context_create_fn_t ze_ctx_new = (_thapi_ze_context_create_fn_t) + dlsym(h_ze, "zeContextCreate"); + _ze_mem_get_alloc_properties = (_thapi_ze_mem_get_alloc_properties_fn_t) + dlsym(h_ze, "zeMemGetAllocProperties"); + _ze_mem_get_address_range = (_thapi_ze_mem_get_address_range_fn_t) + dlsym(h_ze, "zeMemGetAddressRange"); + + /* zeMemGetAllocProperties needs a context. Create a private one against + * the first driver so introspection never touches application contexts. */ + uint32_t n = 1; + _thapi_ze_driver_handle_t hDrv = NULL; + struct { uint32_t stype; const void *pNext; uint32_t flags; } desc = { + 0x000d /* ZE_STRUCTURE_TYPE_CONTEXT_DESC */, NULL, 0 }; + if (ze_init && ze_driver && ze_ctx_new && + ze_init(_THAPI_ZE_INIT_FLAG_GPU_ONLY) == _THAPI_ZE_RESULT_SUCCESS && + ze_driver(&n, &hDrv) == _THAPI_ZE_RESULT_SUCCESS && hDrv && + ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx) + == _THAPI_ZE_RESULT_SUCCESS) { + if (verbose) + fprintf(stderr, "THAPI/MPI: Level Zero pointer introspection enabled\n"); + } else { + _ze_mem_get_alloc_properties = NULL; + _ze_mem_get_address_range = NULL; + _ze_introspection_ctx = NULL; + } + } + + void *h_cu = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_LOCAL); + if (!h_cu) + h_cu = dlopen("libcuda.so", RTLD_LAZY | RTLD_LOCAL); + if (h_cu) { + _thapi_cu_init_fn_t cu_init = + (_thapi_cu_init_fn_t)dlsym(h_cu, "cuInit"); + _cu_pointer_get_attribute = (_thapi_cu_pointer_get_attribute_fn_t) + dlsym(h_cu, "cuPointerGetAttribute"); + if (cu_init && _cu_pointer_get_attribute && + cu_init(0) == _THAPI_CU_SUCCESS) { + if (verbose) + fprintf(stderr, "THAPI/MPI: CUDA pointer introspection enabled\n"); + } else { + _cu_pointer_get_attribute = NULL; + } + } + + void *h_hip = dlopen("libamdhip64.so.5", RTLD_LAZY | RTLD_LOCAL); + if (!h_hip) + h_hip = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_LOCAL); + if (h_hip) { + _hip_pointer_get_attributes = (_thapi_hip_pointer_get_attributes_fn_t) + dlsym(h_hip, "hipPointerGetAttributes"); + _hip_mem_get_address_range = (_thapi_hip_mem_get_address_range_fn_t) + dlsym(h_hip, "hipMemGetAddressRange"); + if (_hip_pointer_get_attributes && verbose) + fprintf(stderr, "THAPI/MPI: HIP pointer introspection enabled\n"); + } +} + static void _load_tracer(void) { char *s = NULL; void *handle = NULL; @@ -30,6 +413,7 @@ static void _load_tracer(void) { verbose = 1; find_mpi_symbols(handle, verbose); + _load_gpu_introspection(verbose); } static inline void _init_tracer(void) { From 6daf724c5c8918320a2e1f624fbf19129fa4c7df Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 13:15:42 -0500 Subject: [PATCH 02/14] mpi: defer GPU introspection loader to first buffer query Avoid creating an L0 context inside MPI_Init's pthread_once. The application's runtime setup (zeInit, ZE_AFFINITY_MASK, etc.) is allowed to complete before we touch the GPU stack. --- backends/mpi/tracer_mpi_helpers.include.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/backends/mpi/tracer_mpi_helpers.include.c b/backends/mpi/tracer_mpi_helpers.include.c index 874bd7c22..03cd65712 100644 --- a/backends/mpi/tracer_mpi_helpers.include.c +++ b/backends/mpi/tracer_mpi_helpers.include.c @@ -283,6 +283,18 @@ static int _try_classify_hip(const void *ptr, return 1; } +/* Lazy initialisation of the GPU introspection loaders. Deferred until the + * first buffer classification so that the application has finished its own + * runtime setup (zeInit / cuInit / ZE_AFFINITY_MASK / etc.) by the time we + * peek at the GPU stack. Doing this from _load_tracer() instead would create + * an L0 context before the app has set its environment, which is bad + * hygiene and can in rare cases trigger reentrancy through the L0 loader. */ +static void _load_gpu_introspection(int verbose); +static pthread_once_t _gpu_introspection_once = PTHREAD_ONCE_INIT; +static void _load_gpu_introspection_once(void) { + _load_gpu_introspection(getenv("LTTNG_UST_MPI_VERBOSE") ? 1 : 0); +} + static inline void _dump_buffer_info(const void *ptr, int role) { if (!ptr) return; @@ -292,6 +304,8 @@ static inline void _dump_buffer_info(const void *ptr, int role) { if (!tracepoint_enabled(lttng_ust_mpi_properties, buffer_info)) return; + pthread_once(&_gpu_introspection_once, _load_gpu_introspection_once); + struct _thapi_mpi_cache_entry e; if (!_thapi_mpi_cache_lookup(ptr, &e)) { e.valid = 0; @@ -413,7 +427,9 @@ static void _load_tracer(void) { verbose = 1; find_mpi_symbols(handle, verbose); - _load_gpu_introspection(verbose); + /* GPU pointer introspection is loaded lazily on first use (see + * _load_gpu_introspection_once), so the application's runtime setup + * completes before we touch L0/CUDA/HIP. */ } static inline void _init_tracer(void) { From 3a48873ec7893065a97fce8ebb383da7c3f21ace Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 13:25:35 -0500 Subject: [PATCH 03/14] mpi: drop GPU introspection diagnostic prints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make pointer introspection a silent default. Removes the only new getenv on the branch — the existing LTTNG_UST_MPI_VERBOSE no longer gates anything we added. --- backends/mpi/tracer_mpi_helpers.include.c | 29 ++++++++--------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/backends/mpi/tracer_mpi_helpers.include.c b/backends/mpi/tracer_mpi_helpers.include.c index 03cd65712..725a6712d 100644 --- a/backends/mpi/tracer_mpi_helpers.include.c +++ b/backends/mpi/tracer_mpi_helpers.include.c @@ -289,10 +289,10 @@ static int _try_classify_hip(const void *ptr, * peek at the GPU stack. Doing this from _load_tracer() instead would create * an L0 context before the app has set its environment, which is bad * hygiene and can in rare cases trigger reentrancy through the L0 loader. */ -static void _load_gpu_introspection(int verbose); +static void _load_gpu_introspection(void); static pthread_once_t _gpu_introspection_once = PTHREAD_ONCE_INIT; static void _load_gpu_introspection_once(void) { - _load_gpu_introspection(getenv("LTTNG_UST_MPI_VERBOSE") ? 1 : 0); + _load_gpu_introspection(); } static inline void _dump_buffer_info(const void *ptr, int role) { @@ -331,7 +331,7 @@ static inline void _dump_buffer_info(const void *ptr, int role) { } /* --- Loader -------------------------------------------------------------- */ -static void _load_gpu_introspection(int verbose) { +static void _load_gpu_introspection(void) { /* Each loader is optional: if the runtime isn't present on the system, the * corresponding classifier stays NULL and is silently skipped. */ void *h_ze = dlopen("libze_loader.so.1", RTLD_LAZY | RTLD_LOCAL); @@ -355,14 +355,11 @@ static void _load_gpu_introspection(int verbose) { _thapi_ze_driver_handle_t hDrv = NULL; struct { uint32_t stype; const void *pNext; uint32_t flags; } desc = { 0x000d /* ZE_STRUCTURE_TYPE_CONTEXT_DESC */, NULL, 0 }; - if (ze_init && ze_driver && ze_ctx_new && - ze_init(_THAPI_ZE_INIT_FLAG_GPU_ONLY) == _THAPI_ZE_RESULT_SUCCESS && - ze_driver(&n, &hDrv) == _THAPI_ZE_RESULT_SUCCESS && hDrv && - ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx) - == _THAPI_ZE_RESULT_SUCCESS) { - if (verbose) - fprintf(stderr, "THAPI/MPI: Level Zero pointer introspection enabled\n"); - } else { + if (!(ze_init && ze_driver && ze_ctx_new && + ze_init(_THAPI_ZE_INIT_FLAG_GPU_ONLY) == _THAPI_ZE_RESULT_SUCCESS && + ze_driver(&n, &hDrv) == _THAPI_ZE_RESULT_SUCCESS && hDrv && + ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx) + == _THAPI_ZE_RESULT_SUCCESS)) { _ze_mem_get_alloc_properties = NULL; _ze_mem_get_address_range = NULL; _ze_introspection_ctx = NULL; @@ -377,13 +374,9 @@ static void _load_gpu_introspection(int verbose) { (_thapi_cu_init_fn_t)dlsym(h_cu, "cuInit"); _cu_pointer_get_attribute = (_thapi_cu_pointer_get_attribute_fn_t) dlsym(h_cu, "cuPointerGetAttribute"); - if (cu_init && _cu_pointer_get_attribute && - cu_init(0) == _THAPI_CU_SUCCESS) { - if (verbose) - fprintf(stderr, "THAPI/MPI: CUDA pointer introspection enabled\n"); - } else { + if (!(cu_init && _cu_pointer_get_attribute && + cu_init(0) == _THAPI_CU_SUCCESS)) _cu_pointer_get_attribute = NULL; - } } void *h_hip = dlopen("libamdhip64.so.5", RTLD_LAZY | RTLD_LOCAL); @@ -394,8 +387,6 @@ static void _load_gpu_introspection(int verbose) { dlsym(h_hip, "hipPointerGetAttributes"); _hip_mem_get_address_range = (_thapi_hip_mem_get_address_range_fn_t) dlsym(h_hip, "hipMemGetAddressRange"); - if (_hip_pointer_get_attributes && verbose) - fprintf(stderr, "THAPI/MPI: HIP pointer introspection enabled\n"); } } From 1f6a85d6888175e07ef1c617f734c1a3c9e2ddb3 Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 15:50:21 -0500 Subject: [PATCH 04/14] mpi: fix buffer_info_callback ptr type to match generated typedef metababel strips qualifiers when emitting the callback typedef, so the 'const void *' field in mpi_events.yaml becomes 'void *' in the generated lttng_ust_mpi_properties_buffer_info_callback_f signature. Match it in the callback. --- backends/mpi/btx_mpiinterval_callbacks.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp index 1e29ed0ef..cada09ff6 100644 --- a/backends/mpi/btx_mpiinterval_callbacks.cpp +++ b/backends/mpi/btx_mpiinterval_callbacks.cpp @@ -178,7 +178,7 @@ static void buffer_info_callback(void *btx_handle, const char *hostname, int64_t vpid, uint64_t vtid, - uintptr_t ptr, + void *ptr, int role, int kind, int backend, From 3f2360a79d07545cd34049b9ffca663d20f2887d Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 15:54:01 -0500 Subject: [PATCH 05/14] mpi: use symbol key when reading mpi_meta_parameters.yaml The YAML top-level key is the Ruby symbol :meta_parameters (serialised as ':meta_parameters:'), so fetch('meta_parameters', ...) returns the empty default. Switch to fetch(:meta_parameters, {}) so prologues are actually registered for all 145 buffer-bearing MPI calls. Without this, _dump_buffer_info was unreferenced in the generated tracer_mpi.c and the build failed under -Werror. --- backends/mpi/mpi_model.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/mpi/mpi_model.rb b/backends/mpi/mpi_model.rb index 6084c4084..eda8c8459 100644 --- a/backends/mpi/mpi_model.rb +++ b/backends/mpi/mpi_model.rb @@ -110,7 +110,9 @@ def underscore(camel_cased_word) origin_addr result_addr compare_addr ].freeze -$mpi_meta_parameters.fetch('meta_parameters', []).each do |func, list| +# Note: the YAML top-level key is the Ruby symbol :meta_parameters (it's +# serialised as ":meta_parameters:"), not the string "meta_parameters". +$mpi_meta_parameters.fetch(:meta_parameters, {}).each do |func, list| buffers = list.select do |type, name| %w[InScalar OutScalar].include?(type) && MPI_PAYLOAD_BUFFER_NAMES.include?(name) From f2dfaf3a65d20a385015aafdd96578162e9bb9ec Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 16:27:53 -0500 Subject: [PATCH 06/14] xprof: enable lttng_ust_mpi_properties:* events in iprof sessions iprof manages its own LTTng session and ignores tracer_mpi.sh, so the enable-event line previously added to tracer_mpi.sh.in never took effect under iprof. Add the matching line in enable_events_mpi() so buffer_info events make it into the trace under both invocation paths. --- xprof/xprof.rb.in | 1 + 1 file changed, 1 insertion(+) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 9b4804dd6..553312e38 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -639,6 +639,7 @@ def enable_events_mpi(channel_name, tracing_mode: 'default', profiling: true) end LOGGER.debug('Profiling is ignored for mpi') if profiling exec("#{lttng_enable} lttng_ust_mpi_type:*") + exec("#{lttng_enable} lttng_ust_mpi_properties:*") end def enable_events_cxi(channel_name, tracing_mode: 'default', profiling: true); end From a1dadb67fb4f09bd5199bab6a5b5cde530fbccd5 Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 17:08:41 -0500 Subject: [PATCH 07/14] mpi: apply clang-format, yamlfmt, and rubocop fixes - clang-format-18 on btx_mpiinterval_callbacks.cpp and tracer_mpi_helpers.include.c (strip vertical alignment, collapse multi-statement switch cases). - yamlfmt on mpi_events.yaml (drop --- marker, strip inner spaces from flow sequences). - rubocop: 'Note:' -> 'NOTE:' in mpi_model.rb (Style/CommentAnnotation). --- backends/mpi/btx_mpiinterval_callbacks.cpp | 6 +- backends/mpi/mpi_events.yaml | 38 ++- backends/mpi/mpi_model.rb | 2 +- backends/mpi/tracer_mpi_helpers.include.c | 270 ++++++++++----------- 4 files changed, 156 insertions(+), 160 deletions(-) diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp index cada09ff6..43dca7878 100644 --- a/backends/mpi/btx_mpiinterval_callbacks.cpp +++ b/backends/mpi/btx_mpiinterval_callbacks.cpp @@ -10,9 +10,9 @@ // Must match the enum in backends/mpi/tracer_mpi_helpers.include.c. enum { THAPI_MPI_PTR_UNKNOWN = 0, - THAPI_MPI_PTR_HOST = 1, - THAPI_MPI_PTR_DEVICE = 2, - THAPI_MPI_PTR_SHARED = 3 + THAPI_MPI_PTR_HOST = 1, + THAPI_MPI_PTR_DEVICE = 2, + THAPI_MPI_PTR_SHARED = 3 }; struct data_s { diff --git a/backends/mpi/mpi_events.yaml b/backends/mpi/mpi_events.yaml index 8a8060873..51a03be66 100644 --- a/backends/mpi/mpi_events.yaml +++ b/backends/mpi/mpi_events.yaml @@ -1,14 +1,12 @@ ---- lttng_ust_mpi_type: events: - name: property args: - - [ MPI_Datatype, datatype] - - [ int, size] + - [MPI_Datatype, datatype] + - [int, size] fields: - - [ ctf_integer_hex, uintptr_t, datatype, "(uintptr_t)datatype" ] - - [ ctf_integer, int, size, size ] - + - [ctf_integer_hex, uintptr_t, datatype, "(uintptr_t)datatype"] + - [ctf_integer, int, size, size] lttng_ust_mpi_properties: events: # Emitted once per buffer argument of a buffer-bearing MPI call. @@ -17,18 +15,18 @@ lttng_ust_mpi_properties: # `role` : 0 input (send-like), 1 output (recv-like) - name: buffer_info args: - - [ const void *, ptr ] - - [ int, role ] - - [ int, kind ] - - [ int, backend ] - - [ uintptr_t, base ] - - [ uint64_t, size ] - - [ uint32_t, device_ordinal ] + - [const void *, ptr] + - [int, role] + - [int, kind] + - [int, backend] + - [uintptr_t, base] + - [uint64_t, size] + - [uint32_t, device_ordinal] fields: - - [ ctf_integer_hex, uintptr_t, ptr, "(uintptr_t)ptr" ] - - [ ctf_integer, int, role, role ] - - [ ctf_integer, int, kind, kind ] - - [ ctf_integer, int, backend, backend ] - - [ ctf_integer_hex, uintptr_t, base, base ] - - [ ctf_integer, uint64_t, size, size ] - - [ ctf_integer, uint32_t, device_ordinal, device_ordinal ] + - [ctf_integer_hex, uintptr_t, ptr, "(uintptr_t)ptr"] + - [ctf_integer, int, role, role] + - [ctf_integer, int, kind, kind] + - [ctf_integer, int, backend, backend] + - [ctf_integer_hex, uintptr_t, base, base] + - [ctf_integer, uint64_t, size, size] + - [ctf_integer, uint32_t, device_ordinal, device_ordinal] diff --git a/backends/mpi/mpi_model.rb b/backends/mpi/mpi_model.rb index eda8c8459..0568314df 100644 --- a/backends/mpi/mpi_model.rb +++ b/backends/mpi/mpi_model.rb @@ -110,7 +110,7 @@ def underscore(camel_cased_word) origin_addr result_addr compare_addr ].freeze -# Note: the YAML top-level key is the Ruby symbol :meta_parameters (it's +# NOTE: the YAML top-level key is the Ruby symbol :meta_parameters (it's # serialised as ":meta_parameters:"), not the string "meta_parameters". $mpi_meta_parameters.fetch(:meta_parameters, {}).each do |func, list| buffers = list.select do |type, name| diff --git a/backends/mpi/tracer_mpi_helpers.include.c b/backends/mpi/tracer_mpi_helpers.include.c index 725a6712d..b4fbcf43f 100644 --- a/backends/mpi/tracer_mpi_helpers.include.c +++ b/backends/mpi/tracer_mpi_helpers.include.c @@ -22,84 +22,81 @@ static volatile unsigned int _initialized = 0; * (extract/, btx_mpiinterval_callbacks.cpp). */ enum thapi_mpi_ptr_kind { THAPI_MPI_PTR_UNKNOWN = 0, - THAPI_MPI_PTR_HOST = 1, - THAPI_MPI_PTR_DEVICE = 2, - THAPI_MPI_PTR_SHARED = 3 + THAPI_MPI_PTR_HOST = 1, + THAPI_MPI_PTR_DEVICE = 2, + THAPI_MPI_PTR_SHARED = 3 }; enum thapi_mpi_ptr_backend { THAPI_MPI_BACKEND_NONE = 0, - THAPI_MPI_BACKEND_L0 = 1, + THAPI_MPI_BACKEND_L0 = 1, THAPI_MPI_BACKEND_CUDA = 2, - THAPI_MPI_BACKEND_HIP = 3 + THAPI_MPI_BACKEND_HIP = 3 }; -enum thapi_mpi_buffer_role { - THAPI_MPI_ROLE_IN = 0, - THAPI_MPI_ROLE_OUT = 1 -}; +enum thapi_mpi_buffer_role { THAPI_MPI_ROLE_IN = 0, THAPI_MPI_ROLE_OUT = 1 }; /* --- Level Zero ----------------------------------------------------------- */ /* Opaque handle types and the minimal enums/structs needed to call * zeMemGetAllocProperties / zeMemGetAddressRange without including ze_api.h. */ -typedef int _thapi_ze_result_t; /* ze_result_t */ -typedef struct _thapi_ze_driver *_thapi_ze_driver_handle_t; -typedef struct _thapi_ze_context *_thapi_ze_context_handle_t; -typedef struct _thapi_ze_device *_thapi_ze_device_handle_t; +typedef int _thapi_ze_result_t; /* ze_result_t */ +typedef struct _thapi_ze_driver *_thapi_ze_driver_handle_t; +typedef struct _thapi_ze_context *_thapi_ze_context_handle_t; +typedef struct _thapi_ze_device *_thapi_ze_device_handle_t; -#define _THAPI_ZE_RESULT_SUCCESS 0 +#define _THAPI_ZE_RESULT_SUCCESS 0 #define _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES 0x000c -#define _THAPI_ZE_INIT_FLAG_GPU_ONLY 0x01 +#define _THAPI_ZE_INIT_FLAG_GPU_ONLY 0x01 typedef enum { _THAPI_ZE_MEMORY_TYPE_UNKNOWN = 0, - _THAPI_ZE_MEMORY_TYPE_HOST = 1, - _THAPI_ZE_MEMORY_TYPE_DEVICE = 2, - _THAPI_ZE_MEMORY_TYPE_SHARED = 3 + _THAPI_ZE_MEMORY_TYPE_HOST = 1, + _THAPI_ZE_MEMORY_TYPE_DEVICE = 2, + _THAPI_ZE_MEMORY_TYPE_SHARED = 3 } _thapi_ze_memory_type_t; typedef struct { - uint32_t stype; - const void *pNext; + uint32_t stype; + const void *pNext; _thapi_ze_memory_type_t type; - uint64_t id; - uint64_t pageSize; + uint64_t id; + uint64_t pageSize; } _thapi_ze_memory_allocation_properties_t; typedef _thapi_ze_result_t (*_thapi_ze_init_fn_t)(uint32_t flags); typedef _thapi_ze_result_t (*_thapi_ze_driver_get_fn_t)(uint32_t *pCount, - _thapi_ze_driver_handle_t *phDrivers); + _thapi_ze_driver_handle_t *phDrivers); typedef _thapi_ze_result_t (*_thapi_ze_context_create_fn_t)(_thapi_ze_driver_handle_t hDriver, const void *desc, _thapi_ze_context_handle_t *phContext); typedef _thapi_ze_result_t (*_thapi_ze_mem_get_alloc_properties_fn_t)( - _thapi_ze_context_handle_t hContext, const void *ptr, + _thapi_ze_context_handle_t hContext, + const void *ptr, _thapi_ze_memory_allocation_properties_t *pProps, _thapi_ze_device_handle_t *phDevice); typedef _thapi_ze_result_t (*_thapi_ze_mem_get_address_range_fn_t)( - _thapi_ze_context_handle_t hContext, const void *ptr, - void **pBase, size_t *pSize); + _thapi_ze_context_handle_t hContext, const void *ptr, void **pBase, size_t *pSize); static _thapi_ze_mem_get_alloc_properties_fn_t _ze_mem_get_alloc_properties = NULL; -static _thapi_ze_mem_get_address_range_fn_t _ze_mem_get_address_range = NULL; -static _thapi_ze_context_handle_t _ze_introspection_ctx = NULL; +static _thapi_ze_mem_get_address_range_fn_t _ze_mem_get_address_range = NULL; +static _thapi_ze_context_handle_t _ze_introspection_ctx = NULL; /* --- CUDA ----------------------------------------------------------------- */ /* Driver API (libcuda.so) is preferred: it's the same on every CUDA install * and doesn't drag in libcudart. */ -typedef int _thapi_cu_result_t; /* CUresult */ +typedef int _thapi_cu_result_t; /* CUresult */ typedef unsigned int _thapi_cu_attr_t; /* CUpointer_attribute */ typedef unsigned long long _thapi_cu_deviceptr_t; /* CUdeviceptr */ -#define _THAPI_CU_SUCCESS 0 -#define _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE 2 -#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START 7 -#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE 8 +#define _THAPI_CU_SUCCESS 0 +#define _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE 2 +#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START 7 +#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE 8 #define _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL 9 -#define _THAPI_CU_MEMORYTYPE_HOST 1 -#define _THAPI_CU_MEMORYTYPE_DEVICE 2 -#define _THAPI_CU_MEMORYTYPE_ARRAY 3 -#define _THAPI_CU_MEMORYTYPE_UNIFIED 4 +#define _THAPI_CU_MEMORYTYPE_HOST 1 +#define _THAPI_CU_MEMORYTYPE_DEVICE 2 +#define _THAPI_CU_MEMORYTYPE_ARRAY 3 +#define _THAPI_CU_MEMORYTYPE_UNIFIED 4 typedef _thapi_cu_result_t (*_thapi_cu_init_fn_t)(unsigned int flags); typedef _thapi_cu_result_t (*_thapi_cu_pointer_get_attribute_fn_t)(void *data, @@ -111,29 +108,30 @@ static _thapi_cu_pointer_get_attribute_fn_t _cu_pointer_get_attribute = NULL; /* --- HIP ------------------------------------------------------------------ */ typedef int _thapi_hip_error_t; /* hipError_t */ -#define _THAPI_HIP_SUCCESS 0 -#define _THAPI_HIP_MEMORY_TYPE_HOST 0 -#define _THAPI_HIP_MEMORY_TYPE_DEVICE 1 -#define _THAPI_HIP_MEMORY_TYPE_ARRAY 2 -#define _THAPI_HIP_MEMORY_TYPE_UNIFIED 3 -#define _THAPI_HIP_MEMORY_TYPE_MANAGED 4 +#define _THAPI_HIP_SUCCESS 0 +#define _THAPI_HIP_MEMORY_TYPE_HOST 0 +#define _THAPI_HIP_MEMORY_TYPE_DEVICE 1 +#define _THAPI_HIP_MEMORY_TYPE_ARRAY 2 +#define _THAPI_HIP_MEMORY_TYPE_UNIFIED 3 +#define _THAPI_HIP_MEMORY_TYPE_MANAGED 4 typedef struct { - unsigned int type; - int device; - void *devicePointer; - void *hostPointer; - int isManaged; - unsigned int allocationFlags; + unsigned int type; + int device; + void *devicePointer; + void *hostPointer; + int isManaged; + unsigned int allocationFlags; } _thapi_hip_pointer_attribute_t; typedef _thapi_hip_error_t (*_thapi_hip_pointer_get_attributes_fn_t)( _thapi_hip_pointer_attribute_t *attr, const void *ptr); -typedef _thapi_hip_error_t (*_thapi_hip_mem_get_address_range_fn_t)( - void **pbase, size_t *psize, void *dptr); +typedef _thapi_hip_error_t (*_thapi_hip_mem_get_address_range_fn_t)(void **pbase, + size_t *psize, + void *dptr); static _thapi_hip_pointer_get_attributes_fn_t _hip_pointer_get_attributes = NULL; -static _thapi_hip_mem_get_address_range_fn_t _hip_mem_get_address_range = NULL; +static _thapi_hip_mem_get_address_range_fn_t _hip_mem_get_address_range = NULL; /* --- Per-thread address-range cache --------------------------------------- */ /* Tiny LRU keyed on allocation base address. Buffers reused across MPI @@ -143,19 +141,17 @@ static _thapi_hip_mem_get_address_range_fn_t _hip_mem_get_address_range = NULL struct _thapi_mpi_cache_entry { uintptr_t base; - size_t size; - uint8_t kind; - uint8_t backend; - uint32_t device_ordinal; - uint8_t valid; + size_t size; + uint8_t kind; + uint8_t backend; + uint32_t device_ordinal; + uint8_t valid; }; -static __thread struct _thapi_mpi_cache_entry - _thapi_mpi_cache[_THAPI_MPI_CACHE_SLOTS]; +static __thread struct _thapi_mpi_cache_entry _thapi_mpi_cache[_THAPI_MPI_CACHE_SLOTS]; static __thread unsigned int _thapi_mpi_cache_next = 0; -static inline int _thapi_mpi_cache_lookup(const void *ptr, - struct _thapi_mpi_cache_entry *out) { +static inline int _thapi_mpi_cache_lookup(const void *ptr, struct _thapi_mpi_cache_entry *out) { uintptr_t p = (uintptr_t)ptr; for (unsigned int i = 0; i < _THAPI_MPI_CACHE_SLOTS; i++) { struct _thapi_mpi_cache_entry *e = &_thapi_mpi_cache[i]; @@ -169,33 +165,30 @@ static inline int _thapi_mpi_cache_lookup(const void *ptr, static inline void _thapi_mpi_cache_insert(const struct _thapi_mpi_cache_entry *e) { _thapi_mpi_cache[_thapi_mpi_cache_next] = *e; - _thapi_mpi_cache_next = - (_thapi_mpi_cache_next + 1) % _THAPI_MPI_CACHE_SLOTS; + _thapi_mpi_cache_next = (_thapi_mpi_cache_next + 1) % _THAPI_MPI_CACHE_SLOTS; } /* --- Runtime probing ------------------------------------------------------ */ -static int _try_classify_ze(const void *ptr, - struct _thapi_mpi_cache_entry *out) { +static int _try_classify_ze(const void *ptr, struct _thapi_mpi_cache_entry *out) { if (!_ze_mem_get_alloc_properties || !_ze_introspection_ctx) return 0; _thapi_ze_memory_allocation_properties_t props; props.stype = _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES; props.pNext = NULL; - props.type = _THAPI_ZE_MEMORY_TYPE_UNKNOWN; + props.type = _THAPI_ZE_MEMORY_TYPE_UNKNOWN; _thapi_ze_device_handle_t hDev = NULL; - if (_ze_mem_get_alloc_properties(_ze_introspection_ctx, ptr, &props, &hDev) - != _THAPI_ZE_RESULT_SUCCESS) + if (_ze_mem_get_alloc_properties(_ze_introspection_ctx, ptr, &props, &hDev) != + _THAPI_ZE_RESULT_SUCCESS) return 0; if (props.type == _THAPI_ZE_MEMORY_TYPE_UNKNOWN) return 0; - void *base = NULL; + void *base = NULL; size_t size = 0; - if (_ze_mem_get_address_range && - _ze_mem_get_address_range(_ze_introspection_ctx, ptr, &base, &size) - != _THAPI_ZE_RESULT_SUCCESS) { + if (_ze_mem_get_address_range && _ze_mem_get_address_range(_ze_introspection_ctx, ptr, &base, + &size) != _THAPI_ZE_RESULT_SUCCESS) { base = (void *)ptr; size = 1; } @@ -205,54 +198,66 @@ static int _try_classify_ze(const void *ptr, out->backend = THAPI_MPI_BACKEND_L0; out->device_ordinal = 0; /* L0 device ordinal lookup would need extra calls */ switch (props.type) { - case _THAPI_ZE_MEMORY_TYPE_HOST: out->kind = THAPI_MPI_PTR_HOST; break; - case _THAPI_ZE_MEMORY_TYPE_DEVICE: out->kind = THAPI_MPI_PTR_DEVICE; break; - case _THAPI_ZE_MEMORY_TYPE_SHARED: out->kind = THAPI_MPI_PTR_SHARED; break; - default: out->kind = THAPI_MPI_PTR_UNKNOWN; break; + case _THAPI_ZE_MEMORY_TYPE_HOST: + out->kind = THAPI_MPI_PTR_HOST; + break; + case _THAPI_ZE_MEMORY_TYPE_DEVICE: + out->kind = THAPI_MPI_PTR_DEVICE; + break; + case _THAPI_ZE_MEMORY_TYPE_SHARED: + out->kind = THAPI_MPI_PTR_SHARED; + break; + default: + out->kind = THAPI_MPI_PTR_UNKNOWN; + break; } out->valid = 1; return 1; } -static int _try_classify_cuda(const void *ptr, - struct _thapi_mpi_cache_entry *out) { +static int _try_classify_cuda(const void *ptr, struct _thapi_mpi_cache_entry *out) { if (!_cu_pointer_get_attribute) return 0; - unsigned int mem_type = 0; - _thapi_cu_deviceptr_t range_start = 0; - size_t range_size = 0; - int device_ordinal = 0; - _thapi_cu_deviceptr_t dptr = (_thapi_cu_deviceptr_t)(uintptr_t)ptr; + unsigned int mem_type = 0; + _thapi_cu_deviceptr_t range_start = 0; + size_t range_size = 0; + int device_ordinal = 0; + _thapi_cu_deviceptr_t dptr = (_thapi_cu_deviceptr_t)(uintptr_t)ptr; - if (_cu_pointer_get_attribute(&mem_type, - _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dptr) - != _THAPI_CU_SUCCESS || mem_type == 0) + if (_cu_pointer_get_attribute(&mem_type, _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dptr) != + _THAPI_CU_SUCCESS || + mem_type == 0) return 0; - (void)_cu_pointer_get_attribute(&range_start, - _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START, dptr); - (void)_cu_pointer_get_attribute(&range_size, - _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE, dptr); - (void)_cu_pointer_get_attribute(&device_ordinal, - _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dptr); + (void)_cu_pointer_get_attribute(&range_start, _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START, dptr); + (void)_cu_pointer_get_attribute(&range_size, _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE, dptr); + (void)_cu_pointer_get_attribute(&device_ordinal, _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, + dptr); out->base = range_start ? (uintptr_t)range_start : (uintptr_t)ptr; out->size = range_size ? range_size : 1; out->backend = THAPI_MPI_BACKEND_CUDA; out->device_ordinal = (uint32_t)device_ordinal; switch (mem_type) { - case _THAPI_CU_MEMORYTYPE_HOST: out->kind = THAPI_MPI_PTR_HOST; break; - case _THAPI_CU_MEMORYTYPE_DEVICE: out->kind = THAPI_MPI_PTR_DEVICE; break; - case _THAPI_CU_MEMORYTYPE_UNIFIED: out->kind = THAPI_MPI_PTR_SHARED; break; - default: out->kind = THAPI_MPI_PTR_UNKNOWN; break; + case _THAPI_CU_MEMORYTYPE_HOST: + out->kind = THAPI_MPI_PTR_HOST; + break; + case _THAPI_CU_MEMORYTYPE_DEVICE: + out->kind = THAPI_MPI_PTR_DEVICE; + break; + case _THAPI_CU_MEMORYTYPE_UNIFIED: + out->kind = THAPI_MPI_PTR_SHARED; + break; + default: + out->kind = THAPI_MPI_PTR_UNKNOWN; + break; } out->valid = 1; return 1; } -static int _try_classify_hip(const void *ptr, - struct _thapi_mpi_cache_entry *out) { +static int _try_classify_hip(const void *ptr, struct _thapi_mpi_cache_entry *out) { if (!_hip_pointer_get_attributes) return 0; @@ -261,7 +266,7 @@ static int _try_classify_hip(const void *ptr, if (_hip_pointer_get_attributes(&attr, ptr) != _THAPI_HIP_SUCCESS) return 0; - void *base = (void *)ptr; + void *base = (void *)ptr; size_t size = 1; if (_hip_mem_get_address_range) (void)_hip_mem_get_address_range(&base, &size, (void *)ptr); @@ -291,9 +296,7 @@ static int _try_classify_hip(const void *ptr, * hygiene and can in rare cases trigger reentrancy through the L0 loader. */ static void _load_gpu_introspection(void); static pthread_once_t _gpu_introspection_once = PTHREAD_ONCE_INIT; -static void _load_gpu_introspection_once(void) { - _load_gpu_introspection(); -} +static void _load_gpu_introspection_once(void) { _load_gpu_introspection(); } static inline void _dump_buffer_info(const void *ptr, int role) { if (!ptr) @@ -309,9 +312,7 @@ static inline void _dump_buffer_info(const void *ptr, int role) { struct _thapi_mpi_cache_entry e; if (!_thapi_mpi_cache_lookup(ptr, &e)) { e.valid = 0; - if (!_try_classify_ze(ptr, &e) && - !_try_classify_cuda(ptr, &e) && - !_try_classify_hip(ptr, &e)) { + if (!_try_classify_ze(ptr, &e) && !_try_classify_cuda(ptr, &e) && !_try_classify_hip(ptr, &e)) { /* Nobody claims it: treat as host. Cache a 1-byte range so we still * answer in O(1) on the next call, but don't pollute the cache with * adjacent host allocations. */ @@ -325,8 +326,7 @@ static inline void _dump_buffer_info(const void *ptr, int role) { _thapi_mpi_cache_insert(&e); } - do_tracepoint(lttng_ust_mpi_properties, buffer_info, - ptr, role, (int)e.kind, (int)e.backend, + do_tracepoint(lttng_ust_mpi_properties, buffer_info, ptr, role, (int)e.kind, (int)e.backend, e.base, (uint64_t)e.size, e.device_ordinal); } @@ -336,57 +336,55 @@ static void _load_gpu_introspection(void) { * corresponding classifier stays NULL and is silently skipped. */ void *h_ze = dlopen("libze_loader.so.1", RTLD_LAZY | RTLD_LOCAL); if (!h_ze) - h_ze = dlopen("libze_loader.so", RTLD_LAZY | RTLD_LOCAL); + h_ze = dlopen("libze_loader.so", RTLD_LAZY | RTLD_LOCAL); if (h_ze) { - _thapi_ze_init_fn_t ze_init = (_thapi_ze_init_fn_t) - dlsym(h_ze, "zeInit"); - _thapi_ze_driver_get_fn_t ze_driver = (_thapi_ze_driver_get_fn_t) - dlsym(h_ze, "zeDriverGet"); - _thapi_ze_context_create_fn_t ze_ctx_new = (_thapi_ze_context_create_fn_t) - dlsym(h_ze, "zeContextCreate"); - _ze_mem_get_alloc_properties = (_thapi_ze_mem_get_alloc_properties_fn_t) - dlsym(h_ze, "zeMemGetAllocProperties"); - _ze_mem_get_address_range = (_thapi_ze_mem_get_address_range_fn_t) - dlsym(h_ze, "zeMemGetAddressRange"); + _thapi_ze_init_fn_t ze_init = (_thapi_ze_init_fn_t)dlsym(h_ze, "zeInit"); + _thapi_ze_driver_get_fn_t ze_driver = (_thapi_ze_driver_get_fn_t)dlsym(h_ze, "zeDriverGet"); + _thapi_ze_context_create_fn_t ze_ctx_new = + (_thapi_ze_context_create_fn_t)dlsym(h_ze, "zeContextCreate"); + _ze_mem_get_alloc_properties = + (_thapi_ze_mem_get_alloc_properties_fn_t)dlsym(h_ze, "zeMemGetAllocProperties"); + _ze_mem_get_address_range = + (_thapi_ze_mem_get_address_range_fn_t)dlsym(h_ze, "zeMemGetAddressRange"); /* zeMemGetAllocProperties needs a context. Create a private one against * the first driver so introspection never touches application contexts. */ uint32_t n = 1; _thapi_ze_driver_handle_t hDrv = NULL; - struct { uint32_t stype; const void *pNext; uint32_t flags; } desc = { - 0x000d /* ZE_STRUCTURE_TYPE_CONTEXT_DESC */, NULL, 0 }; + struct { + uint32_t stype; + const void *pNext; + uint32_t flags; + } desc = {0x000d /* ZE_STRUCTURE_TYPE_CONTEXT_DESC */, NULL, 0}; if (!(ze_init && ze_driver && ze_ctx_new && ze_init(_THAPI_ZE_INIT_FLAG_GPU_ONLY) == _THAPI_ZE_RESULT_SUCCESS && ze_driver(&n, &hDrv) == _THAPI_ZE_RESULT_SUCCESS && hDrv && - ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx) - == _THAPI_ZE_RESULT_SUCCESS)) { + ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx) == _THAPI_ZE_RESULT_SUCCESS)) { _ze_mem_get_alloc_properties = NULL; - _ze_mem_get_address_range = NULL; - _ze_introspection_ctx = NULL; + _ze_mem_get_address_range = NULL; + _ze_introspection_ctx = NULL; } } void *h_cu = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_LOCAL); if (!h_cu) - h_cu = dlopen("libcuda.so", RTLD_LAZY | RTLD_LOCAL); + h_cu = dlopen("libcuda.so", RTLD_LAZY | RTLD_LOCAL); if (h_cu) { - _thapi_cu_init_fn_t cu_init = - (_thapi_cu_init_fn_t)dlsym(h_cu, "cuInit"); - _cu_pointer_get_attribute = (_thapi_cu_pointer_get_attribute_fn_t) - dlsym(h_cu, "cuPointerGetAttribute"); - if (!(cu_init && _cu_pointer_get_attribute && - cu_init(0) == _THAPI_CU_SUCCESS)) + _thapi_cu_init_fn_t cu_init = (_thapi_cu_init_fn_t)dlsym(h_cu, "cuInit"); + _cu_pointer_get_attribute = + (_thapi_cu_pointer_get_attribute_fn_t)dlsym(h_cu, "cuPointerGetAttribute"); + if (!(cu_init && _cu_pointer_get_attribute && cu_init(0) == _THAPI_CU_SUCCESS)) _cu_pointer_get_attribute = NULL; } void *h_hip = dlopen("libamdhip64.so.5", RTLD_LAZY | RTLD_LOCAL); if (!h_hip) - h_hip = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_LOCAL); + h_hip = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_LOCAL); if (h_hip) { - _hip_pointer_get_attributes = (_thapi_hip_pointer_get_attributes_fn_t) - dlsym(h_hip, "hipPointerGetAttributes"); - _hip_mem_get_address_range = (_thapi_hip_mem_get_address_range_fn_t) - dlsym(h_hip, "hipMemGetAddressRange"); + _hip_pointer_get_attributes = + (_thapi_hip_pointer_get_attributes_fn_t)dlsym(h_hip, "hipPointerGetAttributes"); + _hip_mem_get_address_range = + (_thapi_hip_mem_get_address_range_fn_t)dlsym(h_hip, "hipMemGetAddressRange"); } } From 7a388deae2c807ff0d8bd3799a2fffb4740fc527 Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 17:23:12 -0500 Subject: [PATCH 08/14] Revert producer-side GPU introspection Per review (Thomas Applencourt): the MPI tracer should not call into the GPU runtime to classify pointers. Specifically: - calling zeInit can break apps that rely on zesInit being first (deprecated API + sysman interaction); THAPI takes care to never call it, - dlopen('libze_loader.so') from inside an LD_PRELOAD'd libmpi.so actually opens the ze backend's own interceptor wrapper, so every zeMemGetAllocProperties query produces a spurious ze event and self-traces, - querying zeMemGetAllocProperties with a private context returns correct results only by accident on the current Intel L0 loader. Revert all producer-side changes (tracer_mpi_helpers.include.c, mpi_events.yaml, mpi_model.rb prologue registration, gen_mpi.rb includes, Makefile.am probe entry, tracer_mpi.sh.in / xprof.rb.in enable-event lines, mpi.h.include). Classification will be done post-mortem in btx_mpiinterval_callbacks.cpp using the existing zeMemAlloc*/zeMemFree events emitted by the ze backend, mirroring the rangeset_memory_{host,device,shared} pattern in btx_zeinterval_callbacks.cpp. --- backends/mpi/Makefile.am | 3 +- backends/mpi/btx_mpiinterval_callbacks.cpp | 49 +-- backends/mpi/gen_mpi.rb | 6 - backends/mpi/mpi.h.include | 3 +- backends/mpi/mpi_events.yaml | 33 +- backends/mpi/mpi_model.rb | 41 --- backends/mpi/tracer_mpi.sh.in | 2 - backends/mpi/tracer_mpi_helpers.include.c | 389 --------------------- xprof/xprof.rb.in | 1 - 9 files changed, 10 insertions(+), 517 deletions(-) diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am index 13400366e..f3d0ea28c 100644 --- a/backends/mpi/Makefile.am +++ b/backends/mpi/Makefile.am @@ -62,8 +62,7 @@ MPI_PROBES_INCL = $(MPI_PROBES:=.h) MPI_PROBES_SRC = $(MPI_PROBES:=.c) MPI_STATIC_PROBES = \ - mpi_type \ - mpi_properties + mpi_type MPI_STATIC_PROBES_TP = $(MPI_STATIC_PROBES:=.tp) MPI_STATIC_PROBES_INCL = $(MPI_STATIC_PROBES:=.h) diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp index 43dca7878..65f1cedd2 100644 --- a/backends/mpi/btx_mpiinterval_callbacks.cpp +++ b/backends/mpi/btx_mpiinterval_callbacks.cpp @@ -5,22 +5,9 @@ #include #include #include -#include - -// Must match the enum in backends/mpi/tracer_mpi_helpers.include.c. -enum { - THAPI_MPI_PTR_UNKNOWN = 0, - THAPI_MPI_PTR_HOST = 1, - THAPI_MPI_PTR_DEVICE = 2, - THAPI_MPI_PTR_SHARED = 3 -}; struct data_s { EntryState entry_state; - // Set of {hostname, vpid, vtid} for which the in-flight MPI call has been - // observed to carry at least one device/shared GPU buffer. Populated by - // buffer_info_callback, consumed (and cleared) by send_host_message. - std::unordered_set gpu_aware_calls; }; typedef struct data_s data_t; @@ -34,16 +21,9 @@ static void send_host_message(void *btx_handle, uint64_t vtid, bool err) { - auto *data = static_cast(usr_data); - const hpt_t hpt{hostname, vpid, vtid}; - std::string event_class_name_striped = strip_event_class_name_exit(event_class_name); - // GPU-aware MPI calls are prefixed with '*' so they stand out in the - // profile. The flag is set by buffer_info_callback during this call. - if (data->gpu_aware_calls.erase(hpt) > 0) - event_class_name_striped.insert(0, 1, '*'); - - const int64_t entry_ts = data->entry_state.get_ts(hpt); + const int64_t entry_ts = + static_cast(usr_data)->entry_state.get_ts({hostname, vpid, vtid}); btx_push_message_lttng_host(btx_handle, hostname, vpid, vtid, entry_ts, BACKEND_MPI, event_class_name_striped.c_str(), (ts - entry_ts), err); @@ -169,29 +149,6 @@ static void type_property_callback(void *btx_handle, mpi_datatype_info[(uint64_t)datatype] = {std::to_string((uint64_t)datatype), size}; } -// Emitted from the prologue of every buffer-bearing MPI call, once per -// buffer argument. We only care whether at least one buffer is on the GPU -// (device or shared/managed); that's enough to tag the call as GPU-aware. -static void buffer_info_callback(void *btx_handle, - void *usr_data, - int64_t ts, - const char *hostname, - int64_t vpid, - uint64_t vtid, - void *ptr, - int role, - int kind, - int backend, - uintptr_t base, - uint64_t size, - uint32_t device_ordinal) { - - if (kind == THAPI_MPI_PTR_DEVICE || kind == THAPI_MPI_PTR_SHARED) { - auto *data = static_cast(usr_data); - data->gpu_aware_calls.insert({hostname, vpid, vtid}); - } -} - static void traffic_MPI_Count_entry_callback(void *btx_handle, void *usr_data, int64_t ts, @@ -252,6 +209,4 @@ void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_traffic_int_entry(btx_handle, &traffic_int_entry_callback); btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback); - - btx_register_callbacks_lttng_ust_mpi_properties_buffer_info(btx_handle, &buffer_info_callback); } diff --git a/backends/mpi/gen_mpi.rb b/backends/mpi/gen_mpi.rb index 3c0e349cd..bd939a9bb 100644 --- a/backends/mpi/gen_mpi.rb +++ b/backends/mpi/gen_mpi.rb @@ -19,10 +19,6 @@ def common_block(c, provider) tracepoint_params = c.tracepoint_parameters.reject { |p| p.after? }.collect(&:name) puts " tracepoint(#{provider}, #{c.name}_#{START}, #{(tp_params + tracepoint_params).join(', ')});" - c.prologues.each do |p| - puts p - end - params = c.parameters.collect(&:name) if c.has_return_type? puts " #{c.type} _retval;" @@ -79,12 +75,10 @@ def define_and_find_mpi_symbols puts <<~EOF #include - #include #define MPICH_FORTRAN_SYMBOLS_NONABI #include #include "mpi_tracepoints.h" #include "mpi_type.h" - #include "mpi_properties.h" #include #include EOF diff --git a/backends/mpi/mpi.h.include b/backends/mpi/mpi.h.include index 4fe43271b..aed167967 100644 --- a/backends/mpi/mpi.h.include +++ b/backends/mpi/mpi.h.include @@ -1,3 +1,2 @@ #include -#include -#include + diff --git a/backends/mpi/mpi_events.yaml b/backends/mpi/mpi_events.yaml index 51a03be66..48c6dcc96 100644 --- a/backends/mpi/mpi_events.yaml +++ b/backends/mpi/mpi_events.yaml @@ -1,32 +1,11 @@ +--- lttng_ust_mpi_type: events: - name: property args: - - [MPI_Datatype, datatype] - - [int, size] + - [ MPI_Datatype, datatype] + - [ int, size] fields: - - [ctf_integer_hex, uintptr_t, datatype, "(uintptr_t)datatype"] - - [ctf_integer, int, size, size] -lttng_ust_mpi_properties: - events: - # Emitted once per buffer argument of a buffer-bearing MPI call. - # `kind` : 0 unknown, 1 host, 2 device, 3 shared/managed - # `backend` : 0 none, 1 level-zero, 2 cuda, 3 hip - # `role` : 0 input (send-like), 1 output (recv-like) - - name: buffer_info - args: - - [const void *, ptr] - - [int, role] - - [int, kind] - - [int, backend] - - [uintptr_t, base] - - [uint64_t, size] - - [uint32_t, device_ordinal] - fields: - - [ctf_integer_hex, uintptr_t, ptr, "(uintptr_t)ptr"] - - [ctf_integer, int, role, role] - - [ctf_integer, int, kind, kind] - - [ctf_integer, int, backend, backend] - - [ctf_integer_hex, uintptr_t, base, base] - - [ctf_integer, uint64_t, size, size] - - [ctf_integer, uint32_t, device_ordinal, device_ordinal] + - [ ctf_integer_hex, uintptr_t, datatype, "(uintptr_t)datatype" ] + - [ ctf_integer, int, size, size ] + diff --git a/backends/mpi/mpi_model.rb b/backends/mpi/mpi_model.rb index 0568314df..c8f3cc56c 100644 --- a/backends/mpi/mpi_model.rb +++ b/backends/mpi/mpi_model.rb @@ -89,44 +89,3 @@ def underscore(camel_cased_word) if (tracepoint_enabled(lttng_ust_mpi_type, property))#{' '} tracepoint(lttng_ust_mpi_type, property, *datatype, size); EOF - -# --------------------------------------------------------------------------- -# GPU-aware MPI introspection -# -# For every MPI call that carries a user payload buffer, register a prologue -# that emits an `lttng_ust_mpi_properties:buffer_info` event per buffer. The -# event records whether the pointer is host / device / shared memory and which -# GPU runtime (Level Zero, CUDA, HIP) owns it. Consumers (extract/, -# btx_mpiinterval_callbacks.cpp) use that to tag each MPI call as "GPU-aware" -# or "CPU-only". -# -# The set of buffer parameters is derived from mpi_meta_parameters.yaml so -# new MPI APIs picked up by codegen are classified automatically: any -# InScalar/OutScalar parameter whose name matches one of the canonical MPI -# payload-buffer names below is treated as a user buffer. -# --------------------------------------------------------------------------- -MPI_PAYLOAD_BUFFER_NAMES = %w[ - buf buffer sendbuf recvbuf inbuf outbuf inoutbuf - origin_addr result_addr compare_addr -].freeze - -# NOTE: the YAML top-level key is the Ruby symbol :meta_parameters (it's -# serialised as ":meta_parameters:"), not the string "meta_parameters". -$mpi_meta_parameters.fetch(:meta_parameters, {}).each do |func, list| - buffers = list.select do |type, name| - %w[InScalar OutScalar].include?(type) && - MPI_PAYLOAD_BUFFER_NAMES.include?(name) - end - next if buffers.empty? - - body = buffers.map do |type, name| - role = type == 'InScalar' ? 'THAPI_MPI_ROLE_IN' : 'THAPI_MPI_ROLE_OUT' - " _dump_buffer_info((const void *)#{name}, #{role});" - end.join("\n") - - register_prologue func, <<~EOF - if (tracepoint_enabled(lttng_ust_mpi_properties, buffer_info)) { - #{body} - } - EOF -end diff --git a/backends/mpi/tracer_mpi.sh.in b/backends/mpi/tracer_mpi.sh.in index e841ad94b..55343f6ca 100644 --- a/backends/mpi/tracer_mpi.sh.in +++ b/backends/mpi/tracer_mpi.sh.in @@ -81,8 +81,6 @@ fi lttng enable-channel --userspace --blocking-timeout=inf blocking-channel lttng add-context --userspace --channel=blocking-channel -t vpid -t vtid lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi:* -lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi_type:* -lttng enable-event --channel=blocking-channel --userspace lttng_ust_mpi_properties:* if [ -z "$LTTNG_UST_MPI_LIBMPI" ]; then LTTNG_UST_MPI_LIBMPI=$(whichlib64_head libmpi.so) if [ -n "$LTTNG_UST_MPI_LIBMPI" ]; then diff --git a/backends/mpi/tracer_mpi_helpers.include.c b/backends/mpi/tracer_mpi_helpers.include.c index b4fbcf43f..a432d2183 100644 --- a/backends/mpi/tracer_mpi_helpers.include.c +++ b/backends/mpi/tracer_mpi_helpers.include.c @@ -2,392 +2,6 @@ static pthread_once_t _init = PTHREAD_ONCE_INIT; static __thread volatile int in_init = 0; static volatile unsigned int _initialized = 0; -/* --------------------------------------------------------------------------- - * GPU pointer introspection - * - * Goal: when an MPI call carries a buffer pointer, decide at runtime whether - * the pointer refers to host memory or to a GPU allocation, so that downstream - * tools can tag the call as "GPU-aware MPI" vs "CPU-only MPI". - * - * Design constraints (matching the existing MPI backend): - * - No build-time dependency on Level Zero / CUDA / HIP. Each loader is - * dlopen'd lazily, and only the symbols we need are looked up via dlsym - * using locally-declared function-pointer signatures over opaque types. - * - All overhead is gated by tracepoint_enabled() and a small per-thread - * cache keyed on the allocation base address, so a buffer reused across - * iterations is classified only once. - * ------------------------------------------------------------------------- */ - -/* Values reported in the buffer_info tracepoint. Keep in sync with consumers - * (extract/, btx_mpiinterval_callbacks.cpp). */ -enum thapi_mpi_ptr_kind { - THAPI_MPI_PTR_UNKNOWN = 0, - THAPI_MPI_PTR_HOST = 1, - THAPI_MPI_PTR_DEVICE = 2, - THAPI_MPI_PTR_SHARED = 3 -}; - -enum thapi_mpi_ptr_backend { - THAPI_MPI_BACKEND_NONE = 0, - THAPI_MPI_BACKEND_L0 = 1, - THAPI_MPI_BACKEND_CUDA = 2, - THAPI_MPI_BACKEND_HIP = 3 -}; - -enum thapi_mpi_buffer_role { THAPI_MPI_ROLE_IN = 0, THAPI_MPI_ROLE_OUT = 1 }; - -/* --- Level Zero ----------------------------------------------------------- */ -/* Opaque handle types and the minimal enums/structs needed to call - * zeMemGetAllocProperties / zeMemGetAddressRange without including ze_api.h. */ -typedef int _thapi_ze_result_t; /* ze_result_t */ -typedef struct _thapi_ze_driver *_thapi_ze_driver_handle_t; -typedef struct _thapi_ze_context *_thapi_ze_context_handle_t; -typedef struct _thapi_ze_device *_thapi_ze_device_handle_t; - -#define _THAPI_ZE_RESULT_SUCCESS 0 -#define _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES 0x000c -#define _THAPI_ZE_INIT_FLAG_GPU_ONLY 0x01 - -typedef enum { - _THAPI_ZE_MEMORY_TYPE_UNKNOWN = 0, - _THAPI_ZE_MEMORY_TYPE_HOST = 1, - _THAPI_ZE_MEMORY_TYPE_DEVICE = 2, - _THAPI_ZE_MEMORY_TYPE_SHARED = 3 -} _thapi_ze_memory_type_t; - -typedef struct { - uint32_t stype; - const void *pNext; - _thapi_ze_memory_type_t type; - uint64_t id; - uint64_t pageSize; -} _thapi_ze_memory_allocation_properties_t; - -typedef _thapi_ze_result_t (*_thapi_ze_init_fn_t)(uint32_t flags); -typedef _thapi_ze_result_t (*_thapi_ze_driver_get_fn_t)(uint32_t *pCount, - _thapi_ze_driver_handle_t *phDrivers); -typedef _thapi_ze_result_t (*_thapi_ze_context_create_fn_t)(_thapi_ze_driver_handle_t hDriver, - const void *desc, - _thapi_ze_context_handle_t *phContext); -typedef _thapi_ze_result_t (*_thapi_ze_mem_get_alloc_properties_fn_t)( - _thapi_ze_context_handle_t hContext, - const void *ptr, - _thapi_ze_memory_allocation_properties_t *pProps, - _thapi_ze_device_handle_t *phDevice); -typedef _thapi_ze_result_t (*_thapi_ze_mem_get_address_range_fn_t)( - _thapi_ze_context_handle_t hContext, const void *ptr, void **pBase, size_t *pSize); - -static _thapi_ze_mem_get_alloc_properties_fn_t _ze_mem_get_alloc_properties = NULL; -static _thapi_ze_mem_get_address_range_fn_t _ze_mem_get_address_range = NULL; -static _thapi_ze_context_handle_t _ze_introspection_ctx = NULL; - -/* --- CUDA ----------------------------------------------------------------- */ -/* Driver API (libcuda.so) is preferred: it's the same on every CUDA install - * and doesn't drag in libcudart. */ -typedef int _thapi_cu_result_t; /* CUresult */ -typedef unsigned int _thapi_cu_attr_t; /* CUpointer_attribute */ -typedef unsigned long long _thapi_cu_deviceptr_t; /* CUdeviceptr */ - -#define _THAPI_CU_SUCCESS 0 -#define _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE 2 -#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START 7 -#define _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE 8 -#define _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL 9 -#define _THAPI_CU_MEMORYTYPE_HOST 1 -#define _THAPI_CU_MEMORYTYPE_DEVICE 2 -#define _THAPI_CU_MEMORYTYPE_ARRAY 3 -#define _THAPI_CU_MEMORYTYPE_UNIFIED 4 - -typedef _thapi_cu_result_t (*_thapi_cu_init_fn_t)(unsigned int flags); -typedef _thapi_cu_result_t (*_thapi_cu_pointer_get_attribute_fn_t)(void *data, - _thapi_cu_attr_t attr, - _thapi_cu_deviceptr_t ptr); - -static _thapi_cu_pointer_get_attribute_fn_t _cu_pointer_get_attribute = NULL; - -/* --- HIP ------------------------------------------------------------------ */ -typedef int _thapi_hip_error_t; /* hipError_t */ - -#define _THAPI_HIP_SUCCESS 0 -#define _THAPI_HIP_MEMORY_TYPE_HOST 0 -#define _THAPI_HIP_MEMORY_TYPE_DEVICE 1 -#define _THAPI_HIP_MEMORY_TYPE_ARRAY 2 -#define _THAPI_HIP_MEMORY_TYPE_UNIFIED 3 -#define _THAPI_HIP_MEMORY_TYPE_MANAGED 4 - -typedef struct { - unsigned int type; - int device; - void *devicePointer; - void *hostPointer; - int isManaged; - unsigned int allocationFlags; -} _thapi_hip_pointer_attribute_t; - -typedef _thapi_hip_error_t (*_thapi_hip_pointer_get_attributes_fn_t)( - _thapi_hip_pointer_attribute_t *attr, const void *ptr); -typedef _thapi_hip_error_t (*_thapi_hip_mem_get_address_range_fn_t)(void **pbase, - size_t *psize, - void *dptr); - -static _thapi_hip_pointer_get_attributes_fn_t _hip_pointer_get_attributes = NULL; -static _thapi_hip_mem_get_address_range_fn_t _hip_mem_get_address_range = NULL; - -/* --- Per-thread address-range cache --------------------------------------- */ -/* Tiny LRU keyed on allocation base address. Buffers reused across MPI - * iterations (the common case in HPC) hit the cache and skip the runtime - * query entirely. Sized small enough to live cheaply per-thread. */ -#define _THAPI_MPI_CACHE_SLOTS 16 - -struct _thapi_mpi_cache_entry { - uintptr_t base; - size_t size; - uint8_t kind; - uint8_t backend; - uint32_t device_ordinal; - uint8_t valid; -}; - -static __thread struct _thapi_mpi_cache_entry _thapi_mpi_cache[_THAPI_MPI_CACHE_SLOTS]; -static __thread unsigned int _thapi_mpi_cache_next = 0; - -static inline int _thapi_mpi_cache_lookup(const void *ptr, struct _thapi_mpi_cache_entry *out) { - uintptr_t p = (uintptr_t)ptr; - for (unsigned int i = 0; i < _THAPI_MPI_CACHE_SLOTS; i++) { - struct _thapi_mpi_cache_entry *e = &_thapi_mpi_cache[i]; - if (e->valid && p >= e->base && p < e->base + e->size) { - *out = *e; - return 1; - } - } - return 0; -} - -static inline void _thapi_mpi_cache_insert(const struct _thapi_mpi_cache_entry *e) { - _thapi_mpi_cache[_thapi_mpi_cache_next] = *e; - _thapi_mpi_cache_next = (_thapi_mpi_cache_next + 1) % _THAPI_MPI_CACHE_SLOTS; -} - -/* --- Runtime probing ------------------------------------------------------ */ -static int _try_classify_ze(const void *ptr, struct _thapi_mpi_cache_entry *out) { - if (!_ze_mem_get_alloc_properties || !_ze_introspection_ctx) - return 0; - - _thapi_ze_memory_allocation_properties_t props; - props.stype = _THAPI_ZE_STRUCTURE_TYPE_MEM_ALLOC_PROPERTIES; - props.pNext = NULL; - props.type = _THAPI_ZE_MEMORY_TYPE_UNKNOWN; - _thapi_ze_device_handle_t hDev = NULL; - - if (_ze_mem_get_alloc_properties(_ze_introspection_ctx, ptr, &props, &hDev) != - _THAPI_ZE_RESULT_SUCCESS) - return 0; - if (props.type == _THAPI_ZE_MEMORY_TYPE_UNKNOWN) - return 0; - - void *base = NULL; - size_t size = 0; - if (_ze_mem_get_address_range && _ze_mem_get_address_range(_ze_introspection_ctx, ptr, &base, - &size) != _THAPI_ZE_RESULT_SUCCESS) { - base = (void *)ptr; - size = 1; - } - - out->base = (uintptr_t)base; - out->size = size ? size : 1; - out->backend = THAPI_MPI_BACKEND_L0; - out->device_ordinal = 0; /* L0 device ordinal lookup would need extra calls */ - switch (props.type) { - case _THAPI_ZE_MEMORY_TYPE_HOST: - out->kind = THAPI_MPI_PTR_HOST; - break; - case _THAPI_ZE_MEMORY_TYPE_DEVICE: - out->kind = THAPI_MPI_PTR_DEVICE; - break; - case _THAPI_ZE_MEMORY_TYPE_SHARED: - out->kind = THAPI_MPI_PTR_SHARED; - break; - default: - out->kind = THAPI_MPI_PTR_UNKNOWN; - break; - } - out->valid = 1; - return 1; -} - -static int _try_classify_cuda(const void *ptr, struct _thapi_mpi_cache_entry *out) { - if (!_cu_pointer_get_attribute) - return 0; - - unsigned int mem_type = 0; - _thapi_cu_deviceptr_t range_start = 0; - size_t range_size = 0; - int device_ordinal = 0; - _thapi_cu_deviceptr_t dptr = (_thapi_cu_deviceptr_t)(uintptr_t)ptr; - - if (_cu_pointer_get_attribute(&mem_type, _THAPI_CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dptr) != - _THAPI_CU_SUCCESS || - mem_type == 0) - return 0; - - (void)_cu_pointer_get_attribute(&range_start, _THAPI_CU_POINTER_ATTRIBUTE_RANGE_START, dptr); - (void)_cu_pointer_get_attribute(&range_size, _THAPI_CU_POINTER_ATTRIBUTE_RANGE_SIZE, dptr); - (void)_cu_pointer_get_attribute(&device_ordinal, _THAPI_CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, - dptr); - - out->base = range_start ? (uintptr_t)range_start : (uintptr_t)ptr; - out->size = range_size ? range_size : 1; - out->backend = THAPI_MPI_BACKEND_CUDA; - out->device_ordinal = (uint32_t)device_ordinal; - switch (mem_type) { - case _THAPI_CU_MEMORYTYPE_HOST: - out->kind = THAPI_MPI_PTR_HOST; - break; - case _THAPI_CU_MEMORYTYPE_DEVICE: - out->kind = THAPI_MPI_PTR_DEVICE; - break; - case _THAPI_CU_MEMORYTYPE_UNIFIED: - out->kind = THAPI_MPI_PTR_SHARED; - break; - default: - out->kind = THAPI_MPI_PTR_UNKNOWN; - break; - } - out->valid = 1; - return 1; -} - -static int _try_classify_hip(const void *ptr, struct _thapi_mpi_cache_entry *out) { - if (!_hip_pointer_get_attributes) - return 0; - - _thapi_hip_pointer_attribute_t attr; - memset(&attr, 0, sizeof(attr)); - if (_hip_pointer_get_attributes(&attr, ptr) != _THAPI_HIP_SUCCESS) - return 0; - - void *base = (void *)ptr; - size_t size = 1; - if (_hip_mem_get_address_range) - (void)_hip_mem_get_address_range(&base, &size, (void *)ptr); - - out->base = (uintptr_t)base; - out->size = size ? size : 1; - out->backend = THAPI_MPI_BACKEND_HIP; - out->device_ordinal = (uint32_t)attr.device; - if (attr.isManaged || attr.type == _THAPI_HIP_MEMORY_TYPE_UNIFIED || - attr.type == _THAPI_HIP_MEMORY_TYPE_MANAGED) - out->kind = THAPI_MPI_PTR_SHARED; - else if (attr.type == _THAPI_HIP_MEMORY_TYPE_DEVICE) - out->kind = THAPI_MPI_PTR_DEVICE; - else if (attr.type == _THAPI_HIP_MEMORY_TYPE_HOST) - out->kind = THAPI_MPI_PTR_HOST; - else - out->kind = THAPI_MPI_PTR_UNKNOWN; - out->valid = 1; - return 1; -} - -/* Lazy initialisation of the GPU introspection loaders. Deferred until the - * first buffer classification so that the application has finished its own - * runtime setup (zeInit / cuInit / ZE_AFFINITY_MASK / etc.) by the time we - * peek at the GPU stack. Doing this from _load_tracer() instead would create - * an L0 context before the app has set its environment, which is bad - * hygiene and can in rare cases trigger reentrancy through the L0 loader. */ -static void _load_gpu_introspection(void); -static pthread_once_t _gpu_introspection_once = PTHREAD_ONCE_INIT; -static void _load_gpu_introspection_once(void) { _load_gpu_introspection(); } - -static inline void _dump_buffer_info(const void *ptr, int role) { - if (!ptr) - return; - /* MPI_IN_PLACE / MPI_BOTTOM are sentinel values, not real pointers. */ - if (ptr == MPI_IN_PLACE || ptr == MPI_BOTTOM) - return; - if (!tracepoint_enabled(lttng_ust_mpi_properties, buffer_info)) - return; - - pthread_once(&_gpu_introspection_once, _load_gpu_introspection_once); - - struct _thapi_mpi_cache_entry e; - if (!_thapi_mpi_cache_lookup(ptr, &e)) { - e.valid = 0; - if (!_try_classify_ze(ptr, &e) && !_try_classify_cuda(ptr, &e) && !_try_classify_hip(ptr, &e)) { - /* Nobody claims it: treat as host. Cache a 1-byte range so we still - * answer in O(1) on the next call, but don't pollute the cache with - * adjacent host allocations. */ - e.base = (uintptr_t)ptr; - e.size = 1; - e.kind = THAPI_MPI_PTR_HOST; - e.backend = THAPI_MPI_BACKEND_NONE; - e.device_ordinal = 0; - e.valid = 1; - } - _thapi_mpi_cache_insert(&e); - } - - do_tracepoint(lttng_ust_mpi_properties, buffer_info, ptr, role, (int)e.kind, (int)e.backend, - e.base, (uint64_t)e.size, e.device_ordinal); -} - -/* --- Loader -------------------------------------------------------------- */ -static void _load_gpu_introspection(void) { - /* Each loader is optional: if the runtime isn't present on the system, the - * corresponding classifier stays NULL and is silently skipped. */ - void *h_ze = dlopen("libze_loader.so.1", RTLD_LAZY | RTLD_LOCAL); - if (!h_ze) - h_ze = dlopen("libze_loader.so", RTLD_LAZY | RTLD_LOCAL); - if (h_ze) { - _thapi_ze_init_fn_t ze_init = (_thapi_ze_init_fn_t)dlsym(h_ze, "zeInit"); - _thapi_ze_driver_get_fn_t ze_driver = (_thapi_ze_driver_get_fn_t)dlsym(h_ze, "zeDriverGet"); - _thapi_ze_context_create_fn_t ze_ctx_new = - (_thapi_ze_context_create_fn_t)dlsym(h_ze, "zeContextCreate"); - _ze_mem_get_alloc_properties = - (_thapi_ze_mem_get_alloc_properties_fn_t)dlsym(h_ze, "zeMemGetAllocProperties"); - _ze_mem_get_address_range = - (_thapi_ze_mem_get_address_range_fn_t)dlsym(h_ze, "zeMemGetAddressRange"); - - /* zeMemGetAllocProperties needs a context. Create a private one against - * the first driver so introspection never touches application contexts. */ - uint32_t n = 1; - _thapi_ze_driver_handle_t hDrv = NULL; - struct { - uint32_t stype; - const void *pNext; - uint32_t flags; - } desc = {0x000d /* ZE_STRUCTURE_TYPE_CONTEXT_DESC */, NULL, 0}; - if (!(ze_init && ze_driver && ze_ctx_new && - ze_init(_THAPI_ZE_INIT_FLAG_GPU_ONLY) == _THAPI_ZE_RESULT_SUCCESS && - ze_driver(&n, &hDrv) == _THAPI_ZE_RESULT_SUCCESS && hDrv && - ze_ctx_new(hDrv, &desc, &_ze_introspection_ctx) == _THAPI_ZE_RESULT_SUCCESS)) { - _ze_mem_get_alloc_properties = NULL; - _ze_mem_get_address_range = NULL; - _ze_introspection_ctx = NULL; - } - } - - void *h_cu = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_LOCAL); - if (!h_cu) - h_cu = dlopen("libcuda.so", RTLD_LAZY | RTLD_LOCAL); - if (h_cu) { - _thapi_cu_init_fn_t cu_init = (_thapi_cu_init_fn_t)dlsym(h_cu, "cuInit"); - _cu_pointer_get_attribute = - (_thapi_cu_pointer_get_attribute_fn_t)dlsym(h_cu, "cuPointerGetAttribute"); - if (!(cu_init && _cu_pointer_get_attribute && cu_init(0) == _THAPI_CU_SUCCESS)) - _cu_pointer_get_attribute = NULL; - } - - void *h_hip = dlopen("libamdhip64.so.5", RTLD_LAZY | RTLD_LOCAL); - if (!h_hip) - h_hip = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_LOCAL); - if (h_hip) { - _hip_pointer_get_attributes = - (_thapi_hip_pointer_get_attributes_fn_t)dlsym(h_hip, "hipPointerGetAttributes"); - _hip_mem_get_address_range = - (_thapi_hip_mem_get_address_range_fn_t)dlsym(h_hip, "hipMemGetAddressRange"); - } -} - static void _load_tracer(void) { char *s = NULL; void *handle = NULL; @@ -416,9 +30,6 @@ static void _load_tracer(void) { verbose = 1; find_mpi_symbols(handle, verbose); - /* GPU pointer introspection is loaded lazily on first use (see - * _load_gpu_introspection_once), so the application's runtime setup - * completes before we touch L0/CUDA/HIP. */ } static inline void _init_tracer(void) { diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 553312e38..9b4804dd6 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -639,7 +639,6 @@ def enable_events_mpi(channel_name, tracing_mode: 'default', profiling: true) end LOGGER.debug('Profiling is ignored for mpi') if profiling exec("#{lttng_enable} lttng_ust_mpi_type:*") - exec("#{lttng_enable} lttng_ust_mpi_properties:*") end def enable_events_cxi(channel_name, tracing_mode: 'default', profiling: true); end From c8cde68ee624d9973008fa569c619a559b0957f0 Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 17:54:34 -0500 Subject: [PATCH 09/14] mpi: consumer-side GPU-aware tagging via ze allocation events Implements Thomas's review suggestion: instead of querying the GPU runtime from the MPI tracer, classify MPI buffer pointers post-mortem in btx_mpiinterval_callbacks.cpp by joining the ze backend's zeMemAlloc/zeMemFree events into the MPI filter's upstream model. Changes: backends/mpi/Makefile.am: Union backends/ze/btx_ze_model.yaml into the MPI filter's -u so metababel routes ze events to btx_filter_mpi. Adds a build-order dependency mpi -> ze. backends/mpi/btx_mpimatching_model.yaml: - Anchor the existing entries/exits categories to the lttng_ust_mpi provider so ze events do not accidentally enter them. - Add mpi_1buf_entry / mpi_2buf_entry categories that capture payload buffer pointer(s) of buffer-bearing MPI calls. - Add ze_alloc_entry / ze_alloc_exit / ze_free_entry categories that capture ze USM allocation events. backends/mpi/btx_mpiinterval_callbacks.cpp: - Maintain per-process address-range maps rangeset_memory_device and rangeset_memory_shared, mirroring the pattern in backends/ze/btx_zeinterval_callbacks.cpp. - Populate the maps from ze_alloc_*/ze_free_entry callbacks (host USM is intentionally not classified as GPU-aware). - On every buffer-bearing MPI entry event, look up each buffer pointer; if it falls in a tracked GPU range, record the call's {hostname,vpid,vtid} in gpu_aware_calls. - In send_host_message, prepend '*' to the MPI call name when the flag is set, so the tally aggregates GPU-aware and CPU-only variants as distinct rows. When the user runs --backend mpi without --backend ze, no ze events appear in the trace, the range-map stays empty, and the output is unchanged from upstream (no '*' ever appears). No producer-side changes; the MPI tracer hot path is unaffected. --- backends/mpi/Makefile.am | 11 +- backends/mpi/btx_mpiinterval_callbacks.cpp | 179 ++++++++++++++++++++- backends/mpi/btx_mpimatching_model.yaml | 67 +++++++- 3 files changed, 251 insertions(+), 6 deletions(-) diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am index f3d0ea28c..c115cf3ff 100644 --- a/backends/mpi/Makefile.am +++ b/backends/mpi/Makefile.am @@ -189,8 +189,15 @@ EXTRA_DIST += \ $(top_srcdir)/xprof/btx_interval_model.yaml \ btx_mpimatching_model.yaml -$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_model.yaml - $(METABABEL) -u btx_mpi_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include +# The MPI interval filter unions the ze upstream model so that +# btx_mpiinterval_callbacks.cpp can subscribe to zeMemAlloc/zeMemFree +# events and tag GPU-aware MPI calls via address-range lookup. This +# creates a build-order dependency mpi -> ze; the ze backend must be +# configured/built first when changing only the MPI backend. +BTX_ZE_MODEL = $(top_builddir)/backends/ze/btx_ze_model.yaml + +$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_model.yaml $(BTX_ZE_MODEL) + $(METABABEL) -u btx_mpi_model.yaml,$(BTX_ZE_MODEL) -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include CLEANFILES += \ $(BTX_MPI_GENERATED) \ diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp index 65f1cedd2..efd807071 100644 --- a/backends/mpi/btx_mpiinterval_callbacks.cpp +++ b/backends/mpi/btx_mpiinterval_callbacks.cpp @@ -1,17 +1,69 @@ #include "xprof_utils.hpp" #include +#include #include #include #include #include #include +#include + +// --------------------------------------------------------------------------- +// GPU-aware MPI tagging +// +// We maintain a per-process address-range map populated from +// lttng_ust_ze:zeMemAlloc*_exit / zeMemFree_entry events emitted by the ze +// backend (joined into our upstream model via backends/mpi/Makefile.am). +// Each MPI buffer-bearing entry event looks up its buffer pointer in that +// map; if the pointer falls inside a known GPU allocation, the +// {hostname,vpid,vtid} key is recorded in gpu_aware_calls. The matching +// exit callback consumes the key and prepends '*' to the MPI call name +// before pushing it to the aggregator, producing a separate '*MPI_Foo' +// row in the tally. +// +// When the user enables only --backend mpi, no ze events flow in, the +// range-map stays empty, and no '*' ever appears (CPU-only profile). +// --------------------------------------------------------------------------- + +// Per-process address-range map: base -> (base + size). +typedef std::map memory_interval_t; struct data_s { EntryState entry_state; + // GPU allocation ranges, keyed on {hostname, vpid}. Mirrors the + // rangeset_memory_{host,device,shared} pattern in + // backends/ze/btx_zeinterval_callbacks.cpp. + std::unordered_map rangeset_memory_device; + std::unordered_map rangeset_memory_shared; + // {hostname, vpid, vtid} of MPI calls observed to carry at least one + // device/shared GPU buffer. Populated in mpi_*_buf_entry_callback, + // consumed (and cleared) in send_host_message. + std::unordered_set gpu_aware_calls; }; typedef struct data_s data_t; +// True iff `ptr` lies inside any allocation tracked in `m`. +static bool memory_includes(const memory_interval_t &m, uintptr_t ptr) { + const auto it = m.upper_bound(ptr); + if (it == m.cbegin()) + return false; + return ptr < std::prev(it)->second; +} + +// True iff `ptr` lies inside a tracked GPU (device or shared) allocation +// for the given {hostname, vpid}. Host USM is intentionally NOT tagged +// as GPU-aware here (it behaves like host memory from MPI's perspective). +static bool is_gpu_pointer(const data_t *data, const hp_t &hp, uintptr_t ptr) { + auto itd = data->rangeset_memory_device.find(hp); + if (itd != data->rangeset_memory_device.end() && memory_includes(itd->second, ptr)) + return true; + auto its = data->rangeset_memory_shared.find(hp); + if (its != data->rangeset_memory_shared.end() && memory_includes(its->second, ptr)) + return true; + return false; +} + static void send_host_message(void *btx_handle, void *usr_data, int64_t ts, @@ -21,9 +73,16 @@ static void send_host_message(void *btx_handle, uint64_t vtid, bool err) { + auto *data = static_cast(usr_data); + const hpt_t hpt{hostname, vpid, vtid}; + std::string event_class_name_striped = strip_event_class_name_exit(event_class_name); - const int64_t entry_ts = - static_cast(usr_data)->entry_state.get_ts({hostname, vpid, vtid}); + // GPU-aware MPI calls get a leading '*' so they sort and aggregate as a + // distinct row from their CPU-only counterparts in the tally output. + if (data->gpu_aware_calls.erase(hpt) > 0) + event_class_name_striped.insert(0, 1, '*'); + + const int64_t entry_ts = data->entry_state.get_ts(hpt); btx_push_message_lttng_host(btx_handle, hostname, vpid, vtid, entry_ts, BACKEND_MPI, event_class_name_striped.c_str(), (ts - entry_ts), err); @@ -198,6 +257,114 @@ static void traffic_int_entry_callback(void *btx_handle, count * size, oss.str().c_str()); } +// --- ze allocation / free tracking ---------------------------------------- +// Bind allocation size at entry; the matching exit callback uses it together +// with the freshly-returned pointer to populate the range-map. The +// event_class_name suffix (Device vs Shared vs Host) selects which rangeset +// the allocation lands in. +static void ze_alloc_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + size_t size) { + static_cast(usr_data)->entry_state.set_data({hostname, vpid, vtid}, size); +} + +static void ze_alloc_exit_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + int zeResult, + void *pptr_val) { + auto *data = static_cast(usr_data); + auto size = data->entry_state.get_data({hostname, vpid, vtid}); + if (zeResult != 0 || pptr_val == nullptr) + return; + + const std::string ev{event_class_name}; + // Host USM is reachable from the host CPU so MPI sees it as a normal host + // pointer; classifying it as GPU-aware would produce false positives. + // Device and Shared (managed) USM live (at least partly) on the device, + // so MPI calls on them are GPU-aware. + std::unordered_map *mi = nullptr; + if (ev.find("zeMemAllocDevice") != std::string::npos) + mi = &data->rangeset_memory_device; + else if (ev.find("zeMemAllocShared") != std::string::npos) + mi = &data->rangeset_memory_shared; + else + return; // host USM, ignored + + const uintptr_t base = reinterpret_cast(pptr_val); + (*mi)[{hostname, vpid}][base] = base + size; +} + +static void ze_free_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + void *ptr) { + auto *data = static_cast(usr_data); + if (ptr == nullptr) + return; + const uintptr_t base = reinterpret_cast(ptr); + const hp_t hp{hostname, vpid}; + // The freed pointer may have been classified as device OR shared at + // allocation; try both. + auto itd = data->rangeset_memory_device.find(hp); + if (itd != data->rangeset_memory_device.end()) + itd->second.erase(base); + auto its = data->rangeset_memory_shared.find(hp); + if (its != data->rangeset_memory_shared.end()) + its->second.erase(base); +} + +// --- MPI buffer classification -------------------------------------------- +static void +tag_if_gpu(data_t *data, const char *hostname, int64_t vpid, uint64_t vtid, const void *ptr) { + if (ptr == nullptr) + return; + // MPICH sentinels: MPI_IN_PLACE / MPI_BOTTOM. These would never match a + // real allocation but checking explicitly keeps the intent clear. + if (ptr == MPI_IN_PLACE || ptr == MPI_BOTTOM) + return; + if (is_gpu_pointer(data, {hostname, vpid}, reinterpret_cast(ptr))) + data->gpu_aware_calls.insert({hostname, vpid, vtid}); +} + +static void mpi_1buf_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + const void *buf) { + tag_if_gpu(static_cast(usr_data), hostname, vpid, vtid, buf); +} + +static void mpi_2buf_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + const void *sendbuf, + const void *recvbuf) { + auto *data = static_cast(usr_data); + tag_if_gpu(data, hostname, vpid, vtid, sendbuf); + tag_if_gpu(data, hostname, vpid, vtid, recvbuf); +} + void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component); btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component); @@ -209,4 +376,12 @@ void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_traffic_int_entry(btx_handle, &traffic_int_entry_callback); btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback); + + // GPU-aware MPI: ze alloc/free updates the range-map, MPI buffer entries + // look up against it. + btx_register_callbacks_ze_alloc_entry(btx_handle, &ze_alloc_entry_callback); + btx_register_callbacks_ze_alloc_exit(btx_handle, &ze_alloc_exit_callback); + btx_register_callbacks_ze_free_entry(btx_handle, &ze_free_entry_callback); + btx_register_callbacks_mpi_1buf_entry(btx_handle, &mpi_1buf_entry_callback); + btx_register_callbacks_mpi_2buf_entry(btx_handle, &mpi_2buf_entry_callback); } diff --git a/backends/mpi/btx_mpimatching_model.yaml b/backends/mpi/btx_mpimatching_model.yaml index 829bbedc8..7183c1d00 100644 --- a/backends/mpi/btx_mpimatching_model.yaml +++ b/backends/mpi/btx_mpimatching_model.yaml @@ -14,11 +14,14 @@ :field_class: :type: integer_unsigned :event_classes: + # Restrict the bare entry/exit patterns to the MPI provider so that + # ze allocation/free events sourced from the ze backend (see below) + # do not accidentally match them. - :set_id: entries - :name: "_entry$" + :name: "^lttng_ust_mpi:.*_entry$" - :set_id: exits :register: false - :name: "_exit$" + :name: "^lttng_ust_mpi:.*_exit$" - :set_id: exits_mpiError_present :domain: exits :payload_field_class: @@ -52,3 +55,63 @@ - :name: ^(datatype|origin_count)$ :field_class: :cast_type: MPI_Datatype + # ------------------------------------------------------------------ + # GPU-aware MPI tagging + # + # Categories below feed btx_mpiinterval_callbacks.cpp's range-map + # lookup. The MPI buffer categories capture the payload pointer(s) + # of every buffer-bearing MPI call; the ze alloc/free categories + # maintain a {hostname,vpid} -> address-range map. An MPI call whose + # buffer falls in the range-map is tagged with a leading '*' in the + # tally output. This is only effective when the user enables both + # --backend mpi and --backend ze; with only --backend mpi the + # range-map stays empty and no '*' ever appears. + # ------------------------------------------------------------------ + # Two-buffer MPI calls (Sendrecv, Allreduce, Reduce, Bcast variants + # with sendbuf+recvbuf, Get_accumulate with origin_addr+result_addr, + # etc.). + - :set_id: mpi_2buf_entry + :domain: entries + :payload_field_class: + :type: structure + :members: + - :name: ^(sendbuf|buf|buffer|inbuf|origin_addr|inoutbuf)$ + :field_class: + :cast_type: const void \* + - :name: ^(recvbuf|outbuf|result_addr)$ + :field_class: + :cast_type: const void \* + # Single-buffer MPI calls (Send, Recv, Bcast, etc.). + - :set_id: mpi_1buf_entry + :domain: entries - mpi_2buf_entry + :payload_field_class: + :type: structure + :members: + - :name: ^(sendbuf|buf|buffer|inbuf|outbuf|inoutbuf|origin_addr|result_addr|compare_addr)$ + :field_class: + :cast_type: const void \* + # Level Zero USM allocation / free events sourced from the ze + # backend (see backends/ze/btx_zematching_model.yaml for matching + # parents). We bind size at entry and the allocated pointer at exit, + # mirroring btx_zeinterval_callbacks.cpp:entries_alloc / exits_alloc. + - :set_id: ze_alloc_entry + :name: "^lttng_ust_ze:zeMemAlloc.*_entry$" + :payload_field_class: + :type: structure + :members: + - :name: ^size$ + - :set_id: ze_alloc_exit + :name: "^lttng_ust_ze:zeMemAlloc.*_exit$" + :payload_field_class: + :type: structure + :members: + - :name: ^zeResult$ + :field_class: + :cast_type: ^int$ + - :name: ^pptr_val$ + - :set_id: ze_free_entry + :name: "^lttng_ust_ze:zeMemFree_entry$" + :payload_field_class: + :type: structure + :members: + - :name: ^ptr$ From c40a24c8758bdfaafd5862551a3f82bf01e7b69a Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 21:51:57 -0500 Subject: [PATCH 10/14] mpi: synthesise merged upstream model instead of metababel -u union metababel's bare `-u model_a,model_b` upstream union fails when both models declare the same environment field, raising: Match expression '^hostname$' must match only one member, '2' matched ["hostname", "hostname"] Every backend model generated by utils/gen_babeltrace_model_helper.rb declares :environment: :entries: [{name: hostname, type: string}], so any two backend models clash. metababel itself owns the rule and isn't ours to change. Side-step it: generate a single merged upstream model (btx_mpi_with_ze_model.yaml) that starts from btx_mpi_model.yaml and copies only the ze allocation/free event classes we actually need into MPI's stream class. One environment declaration, one stream class, no duplicates. - backends/mpi/gen_btx_mpi_with_ze_model.rb: the generator. - backends/mpi/Makefile.am: builds the merged YAML, feeds it to metababel as the sole `-u` source. --- backends/mpi/Makefile.am | 22 ++++++---- backends/mpi/gen_btx_mpi_with_ze_model.rb | 51 +++++++++++++++++++++++ 2 files changed, 66 insertions(+), 7 deletions(-) create mode 100644 backends/mpi/gen_btx_mpi_with_ze_model.rb diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am index c115cf3ff..681bed984 100644 --- a/backends/mpi/Makefile.am +++ b/backends/mpi/Makefile.am @@ -189,15 +189,23 @@ EXTRA_DIST += \ $(top_srcdir)/xprof/btx_interval_model.yaml \ btx_mpimatching_model.yaml -# The MPI interval filter unions the ze upstream model so that -# btx_mpiinterval_callbacks.cpp can subscribe to zeMemAlloc/zeMemFree -# events and tag GPU-aware MPI calls via address-range lookup. This -# creates a build-order dependency mpi -> ze; the ze backend must be -# configured/built first when changing only the MPI backend. +# The MPI interval filter needs to receive Level Zero allocation/free +# events in addition to MPI events so that btx_mpiinterval_callbacks.cpp +# can tag GPU-aware MPI calls via address-range lookup. metababel's bare +# `-u model_a,model_b` union fails when both backend models declare the +# same environment field (hostname), so we generate a merged upstream +# model that copies only the ze events we need into MPI's stream class. +# This adds a build-order dependency mpi -> ze. BTX_ZE_MODEL = $(top_builddir)/backends/ze/btx_ze_model.yaml -$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_model.yaml $(BTX_ZE_MODEL) - $(METABABEL) -u btx_mpi_model.yaml,$(BTX_ZE_MODEL) -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include +btx_mpi_with_ze_model.yaml: $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL) + $(RUBY) $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL) > $@ + +CLEANFILES += btx_mpi_with_ze_model.yaml +EXTRA_DIST += gen_btx_mpi_with_ze_model.rb + +$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_with_ze_model.yaml + $(METABABEL) -u btx_mpi_with_ze_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include CLEANFILES += \ $(BTX_MPI_GENERATED) \ diff --git a/backends/mpi/gen_btx_mpi_with_ze_model.rb b/backends/mpi/gen_btx_mpi_with_ze_model.rb new file mode 100644 index 000000000..1365a593c --- /dev/null +++ b/backends/mpi/gen_btx_mpi_with_ze_model.rb @@ -0,0 +1,51 @@ +#!/usr/bin/env ruby +# Generate a merged upstream YAML model for the MPI interval filter. +# +# btx_filter_mpi needs to receive Level Zero allocation/free events in +# addition to MPI events so that btx_mpiinterval_callbacks.cpp can +# maintain an address-range map and tag GPU-aware MPI calls. metababel's +# -u union of two backend models fails because both declare +# :environment: :entries: [{name: hostname}], which the matching +# engine treats as an ambiguity. +# +# This generator side-steps the issue by emitting a single model file +# that: +# - starts from btx_mpi_model.yaml, +# - copies only the ze allocation/free event classes we actually +# need into the same stream class. +# +# Usage: ruby gen_btx_mpi_with_ze_model.rb MPI_MODEL.yaml ZE_MODEL.yaml > OUT.yaml +require 'yaml' + +mpi_path = ARGV[0] +ze_path = ARGV[1] +raise 'usage: gen_btx_mpi_with_ze_model.rb MPI_MODEL ZE_MODEL' unless mpi_path && ze_path + +ZE_WANTED_EVENT_NAMES = %w[ + lttng_ust_ze:zeMemAllocHost_entry + lttng_ust_ze:zeMemAllocHost_exit + lttng_ust_ze:zeMemAllocDevice_entry + lttng_ust_ze:zeMemAllocDevice_exit + lttng_ust_ze:zeMemAllocShared_entry + lttng_ust_ze:zeMemAllocShared_exit + lttng_ust_ze:zeMemFree_entry + lttng_ust_ze:zeMemFree_exit +].freeze + +mpi_model = YAML.load_file(mpi_path) +ze_model = YAML.load_file(ze_path) + +ze_events = ze_model.fetch(:stream_classes, []).flat_map do |sc| + sc.fetch(:event_classes, []).select { |ec| ZE_WANTED_EVENT_NAMES.include?(ec[:name]) } +end + +if ze_events.empty? + warn "WARNING: no ze allocation events found in #{ze_path}; " \ + 'GPU-aware MPI tagging will be inactive.' +end + +# Inject the ze events into the MPI stream class. There is only one +# stream class per backend model (see utils/gen_babeltrace_model_helper.rb). +mpi_model[:stream_classes].first[:event_classes].concat(ze_events) + +puts mpi_model.to_yaml From c5094b395007bc7d8f2b6e1d22a6cc33da000843 Mon Sep 17 00:00:00 2001 From: abagusetty Date: Wed, 3 Jun 2026 21:56:40 -0500 Subject: [PATCH 11/14] mpi: relax matching-model cast_types to match upstream Two matching failures surfaced when metababel walked the merged upstream model: 1. MPI buffer parameters are declared with two different cast_types in btx_mpi_model.yaml: 'const void *' for input buffers (sendbuf, buf on _Send variants, ...) and plain 'void *' for output buffers (recvbuf, buf on _Recv variants, ...). The matching model required 'const void *' on both, so mpi_2buf_entry matched zero events: 'No event matched mpi_2buf_entry, at least one matching event required.' Relax to '^(const )?void \\*$' on both buffer slots. 2. zeResult is typed ze_result_t in btx_ze_model.yaml. Matching against '^int$' fails. Use ze_result_t directly would force btx_mpiinterval_callbacks.cpp to include ze_api.h (heavy build dep on the MPI consumer just to spell an enum). Drop zeResult from the match entirely; a failed allocation has pptr_val == NULL, which the callback already guards against. --- backends/mpi/btx_mpiinterval_callbacks.cpp | 5 +++-- backends/mpi/btx_mpimatching_model.yaml | 16 +++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp index efd807071..62fd83852 100644 --- a/backends/mpi/btx_mpiinterval_callbacks.cpp +++ b/backends/mpi/btx_mpiinterval_callbacks.cpp @@ -280,11 +280,12 @@ static void ze_alloc_exit_callback(void *btx_handle, const char *hostname, int64_t vpid, uint64_t vtid, - int zeResult, void *pptr_val) { auto *data = static_cast(usr_data); auto size = data->entry_state.get_data({hostname, vpid, vtid}); - if (zeResult != 0 || pptr_val == nullptr) + // We don't see zeResult in the matching model to avoid pulling in + // ze_api.h; a failed allocation has pptr_val == NULL. + if (pptr_val == nullptr) return; const std::string ev{event_class_name}; diff --git a/backends/mpi/btx_mpimatching_model.yaml b/backends/mpi/btx_mpimatching_model.yaml index 7183c1d00..f8d8d7b8a 100644 --- a/backends/mpi/btx_mpimatching_model.yaml +++ b/backends/mpi/btx_mpimatching_model.yaml @@ -69,7 +69,9 @@ # ------------------------------------------------------------------ # Two-buffer MPI calls (Sendrecv, Allreduce, Reduce, Bcast variants # with sendbuf+recvbuf, Get_accumulate with origin_addr+result_addr, - # etc.). + # etc.). The cast_type intentionally accepts both `const void *` + # (input buffers) and plain `void *` (output buffers) since MPI + # prototypes use both. - :set_id: mpi_2buf_entry :domain: entries :payload_field_class: @@ -77,10 +79,10 @@ :members: - :name: ^(sendbuf|buf|buffer|inbuf|origin_addr|inoutbuf)$ :field_class: - :cast_type: const void \* + :cast_type: ^(const )?void \*$ - :name: ^(recvbuf|outbuf|result_addr)$ :field_class: - :cast_type: const void \* + :cast_type: ^(const )?void \*$ # Single-buffer MPI calls (Send, Recv, Bcast, etc.). - :set_id: mpi_1buf_entry :domain: entries - mpi_2buf_entry @@ -89,7 +91,7 @@ :members: - :name: ^(sendbuf|buf|buffer|inbuf|outbuf|inoutbuf|origin_addr|result_addr|compare_addr)$ :field_class: - :cast_type: const void \* + :cast_type: ^(const )?void \*$ # Level Zero USM allocation / free events sourced from the ze # backend (see backends/ze/btx_zematching_model.yaml for matching # parents). We bind size at entry and the allocated pointer at exit, @@ -100,14 +102,14 @@ :type: structure :members: - :name: ^size$ + # We deliberately omit zeResult here so that the consumer callback + # signature does not need to depend on ze_api.h. A failed allocation + # has pptr_val == NULL, which the callback already guards against. - :set_id: ze_alloc_exit :name: "^lttng_ust_ze:zeMemAlloc.*_exit$" :payload_field_class: :type: structure :members: - - :name: ^zeResult$ - :field_class: - :cast_type: ^int$ - :name: ^pptr_val$ - :set_id: ze_free_entry :name: "^lttng_ust_ze:zeMemFree_entry$" From e4ca7a4f0101a668768a781f8afbabbc9b7f016a Mon Sep 17 00:00:00 2001 From: abagusetty Date: Thu, 4 Jun 2026 08:37:16 -0500 Subject: [PATCH 12/14] mpi: split buffer categories per MPI shape to satisfy metababel metababel's matching engine requires each :name: regex to bind to EXACTLY ONE field per matched event, so the broad regex ^(sendbuf|buf|buffer|inbuf|origin_addr|inoutbuf)$ fails on MPI_Reduce_local (has both inbuf and inoutbuf) with: Match expression '...' must match only one member, '2' matched ["inbuf", "inoutbuf"] Enumerate one category per distinct MPI buffer-shape so member regexes never overlap within an event: - mpi_cas_entry : origin_addr + compare_addr + result_addr (MPI_Compare_and_swap) - mpi_sendrecv_entry : sendbuf + recvbuf (Sendrecv, Allreduce, Reduce, Gather, ...) - mpi_reduce_local_entry : inbuf + inoutbuf (MPI_Reduce_local) - mpi_rma_fetch_entry : origin_addr + result_addr (MPI_Get_accumulate, MPI_Fetch_and_op) - mpi_1buf_entry : everything else with a single buffer field Each category subtracts the prior set_ids from its domain so an event falls into exactly one bucket. Register one consumer-side callback per shape; all delegate to the same tag_if_gpu helper. --- backends/mpi/btx_mpiinterval_callbacks.cpp | 74 ++++++++++++++++++---- backends/mpi/btx_mpimatching_model.yaml | 66 +++++++++++++++---- 2 files changed, 118 insertions(+), 22 deletions(-) diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp index 62fd83852..7b00c52bf 100644 --- a/backends/mpi/btx_mpiinterval_callbacks.cpp +++ b/backends/mpi/btx_mpiinterval_callbacks.cpp @@ -352,20 +352,68 @@ static void mpi_1buf_entry_callback(void *btx_handle, tag_if_gpu(static_cast(usr_data), hostname, vpid, vtid, buf); } -static void mpi_2buf_entry_callback(void *btx_handle, - void *usr_data, - int64_t ts, - const char *event_class_name, - const char *hostname, - int64_t vpid, - uint64_t vtid, - const void *sendbuf, - const void *recvbuf) { +// MPI_Sendrecv, MPI_Allreduce, MPI_Reduce, MPI_Gather, ... — the bulk. +static void mpi_sendrecv_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + const void *sendbuf, + const void *recvbuf) { auto *data = static_cast(usr_data); tag_if_gpu(data, hostname, vpid, vtid, sendbuf); tag_if_gpu(data, hostname, vpid, vtid, recvbuf); } +// MPI_Reduce_local(inbuf, inoutbuf, ...) +static void mpi_reduce_local_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + const void *inbuf, + const void *inoutbuf) { + auto *data = static_cast(usr_data); + tag_if_gpu(data, hostname, vpid, vtid, inbuf); + tag_if_gpu(data, hostname, vpid, vtid, inoutbuf); +} + +// MPI_Get_accumulate, MPI_Fetch_and_op (origin_addr + result_addr). +static void mpi_rma_fetch_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + const void *origin_addr, + const void *result_addr) { + auto *data = static_cast(usr_data); + tag_if_gpu(data, hostname, vpid, vtid, origin_addr); + tag_if_gpu(data, hostname, vpid, vtid, result_addr); +} + +// MPI_Compare_and_swap(origin_addr, compare_addr, result_addr, ...) +static void mpi_cas_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + const void *origin_addr, + const void *compare_addr, + const void *result_addr) { + auto *data = static_cast(usr_data); + tag_if_gpu(data, hostname, vpid, vtid, origin_addr); + tag_if_gpu(data, hostname, vpid, vtid, compare_addr); + tag_if_gpu(data, hostname, vpid, vtid, result_addr); +} + void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component); btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component); @@ -379,10 +427,14 @@ void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback); // GPU-aware MPI: ze alloc/free updates the range-map, MPI buffer entries - // look up against it. + // look up against it. One callback per MPI buffer-shape (see the matching + // model for why we cannot lump them into a single regex). btx_register_callbacks_ze_alloc_entry(btx_handle, &ze_alloc_entry_callback); btx_register_callbacks_ze_alloc_exit(btx_handle, &ze_alloc_exit_callback); btx_register_callbacks_ze_free_entry(btx_handle, &ze_free_entry_callback); btx_register_callbacks_mpi_1buf_entry(btx_handle, &mpi_1buf_entry_callback); - btx_register_callbacks_mpi_2buf_entry(btx_handle, &mpi_2buf_entry_callback); + btx_register_callbacks_mpi_sendrecv_entry(btx_handle, &mpi_sendrecv_entry_callback); + btx_register_callbacks_mpi_reduce_local_entry(btx_handle, &mpi_reduce_local_entry_callback); + btx_register_callbacks_mpi_rma_fetch_entry(btx_handle, &mpi_rma_fetch_entry_callback); + btx_register_callbacks_mpi_cas_entry(btx_handle, &mpi_cas_entry_callback); } diff --git a/backends/mpi/btx_mpimatching_model.yaml b/backends/mpi/btx_mpimatching_model.yaml index f8d8d7b8a..b9ca7babe 100644 --- a/backends/mpi/btx_mpimatching_model.yaml +++ b/backends/mpi/btx_mpimatching_model.yaml @@ -67,29 +67,73 @@ # --backend mpi and --backend ze; with only --backend mpi the # range-map stays empty and no '*' ever appears. # ------------------------------------------------------------------ - # Two-buffer MPI calls (Sendrecv, Allreduce, Reduce, Bcast variants - # with sendbuf+recvbuf, Get_accumulate with origin_addr+result_addr, - # etc.). The cast_type intentionally accepts both `const void *` - # (input buffers) and plain `void *` (output buffers) since MPI - # prototypes use both. - - :set_id: mpi_2buf_entry + # MPI buffer categories. metababel requires each :name: regex to bind + # to EXACTLY ONE field per matched event, so we cannot lump all buffer + # names into a single regex slot: an event like MPI_Reduce_local has + # both `inbuf` and `inoutbuf`, and MPI_Compare_and_swap has three + # buffer fields, which would over-match. Define one category per + # distinct MPI buffer-shape, then a catch-all for single-buffer calls. + # The cast_type accepts both `const void *` (input buffers) and plain + # `void *` (output buffers) since MPI prototypes use both. + # + # 3-buffer: MPI_Compare_and_swap + - :set_id: mpi_cas_entry :domain: entries :payload_field_class: :type: structure :members: - - :name: ^(sendbuf|buf|buffer|inbuf|origin_addr|inoutbuf)$ + - :name: ^origin_addr$ + :field_class: + :cast_type: ^(const )?void \*$ + - :name: ^compare_addr$ + :field_class: + :cast_type: ^(const )?void \*$ + - :name: ^result_addr$ + :field_class: + :cast_type: ^(const )?void \*$ + # 2-buffer Send/Recv pair (Sendrecv, Allreduce, Reduce, Gather, ...). + - :set_id: mpi_sendrecv_entry + :domain: entries - mpi_cas_entry + :payload_field_class: + :type: structure + :members: + - :name: ^sendbuf$ + :field_class: + :cast_type: ^(const )?void \*$ + - :name: ^recvbuf$ + :field_class: + :cast_type: ^(const )?void \*$ + # 2-buffer Reduce_local pair (inbuf + inoutbuf). + - :set_id: mpi_reduce_local_entry + :domain: entries - mpi_cas_entry - mpi_sendrecv_entry + :payload_field_class: + :type: structure + :members: + - :name: ^inbuf$ + :field_class: + :cast_type: ^(const )?void \*$ + - :name: ^inoutbuf$ + :field_class: + :cast_type: ^(const )?void \*$ + # 2-buffer RMA fetch pair (Get_accumulate, Fetch_and_op). + - :set_id: mpi_rma_fetch_entry + :domain: entries - mpi_cas_entry - mpi_sendrecv_entry - mpi_reduce_local_entry + :payload_field_class: + :type: structure + :members: + - :name: ^origin_addr$ :field_class: :cast_type: ^(const )?void \*$ - - :name: ^(recvbuf|outbuf|result_addr)$ + - :name: ^result_addr$ :field_class: :cast_type: ^(const )?void \*$ - # Single-buffer MPI calls (Send, Recv, Bcast, etc.). + # 1-buffer everything else (Send/Recv/Bcast/Put/Get/Accumulate/...). - :set_id: mpi_1buf_entry - :domain: entries - mpi_2buf_entry + :domain: entries - mpi_cas_entry - mpi_sendrecv_entry - mpi_reduce_local_entry - mpi_rma_fetch_entry :payload_field_class: :type: structure :members: - - :name: ^(sendbuf|buf|buffer|inbuf|outbuf|inoutbuf|origin_addr|result_addr|compare_addr)$ + - :name: ^(sendbuf|buf|buffer|inbuf|outbuf|inoutbuf|origin_addr|result_addr)$ :field_class: :cast_type: ^(const )?void \*$ # Level Zero USM allocation / free events sourced from the ze From babf87d10777c1b24a926d03ee66e644847ef9fb Mon Sep 17 00:00:00 2001 From: abagusetty Date: Thu, 4 Jun 2026 08:38:46 -0500 Subject: [PATCH 13/14] mpi: tighten mpi_1buf_entry regex to actually unique names After enumerating all MPI calls and excluding the 4 multi-buffer categories, every remaining buffer-bearing MPI call carries exactly one of {buf, buffer, origin_addr}. Tightening the 1buf regex to ^(buf|buffer|origin_addr)$ removes the last source of in-event ambiguity for metababel. Pack/Unpack/Pack_external/Unpack_external are intentionally NOT classified: they carry inbuf+outbuf but the buffers never traverse the network, so GPU-aware tagging would be semantically meaningless. --- backends/mpi/btx_mpimatching_model.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backends/mpi/btx_mpimatching_model.yaml b/backends/mpi/btx_mpimatching_model.yaml index b9ca7babe..798268888 100644 --- a/backends/mpi/btx_mpimatching_model.yaml +++ b/backends/mpi/btx_mpimatching_model.yaml @@ -128,12 +128,17 @@ :field_class: :cast_type: ^(const )?void \*$ # 1-buffer everything else (Send/Recv/Bcast/Put/Get/Accumulate/...). + # After excluding the multi-buffer categories above, every remaining + # buffer-bearing MPI call carries exactly one of {buf, buffer, + # origin_addr}. Pack/Unpack (inbuf + outbuf) is intentionally NOT + # included: the buffers never traverse the network, so GPU-aware + # tagging is meaningless there. - :set_id: mpi_1buf_entry :domain: entries - mpi_cas_entry - mpi_sendrecv_entry - mpi_reduce_local_entry - mpi_rma_fetch_entry :payload_field_class: :type: structure :members: - - :name: ^(sendbuf|buf|buffer|inbuf|outbuf|inoutbuf|origin_addr|result_addr)$ + - :name: ^(buf|buffer|origin_addr)$ :field_class: :cast_type: ^(const )?void \*$ # Level Zero USM allocation / free events sourced from the ze From 1179340189ff026c027ef1df298273f47e7ad04b Mon Sep 17 00:00:00 2001 From: abagusetty Date: Thu, 4 Jun 2026 08:54:42 -0500 Subject: [PATCH 14/14] mpi: strip ze-specific fields from merged upstream model Two issues surfaced when the consumer compiled: 1. metababel emitted callback typedefs for the full ze allocation events (ze_context_handle_t, ze_device_mem_alloc_desc_t *, ze_result_t, ...). libMPIInterval is built with only mpi.h visible, so btx_upstream.h failed to compile with 'unknown type name ze_context_handle_t' (and friends). Teach gen_btx_mpi_with_ze_model.rb to keep only the payload fields the MPI consumer actually uses (size, pptr_val, ptr). After stripping, every generated typedef references only size_t / void *, which mpi.h transitively pulls in. 2. metababel strips qualifiers when generating callback typedefs, so the matching model's 'const void *' becomes 'void *' in the generated mpi_*_entry_callback_f signatures. Drop 'const' from each mpi_*_entry_callback parameter so the signatures match. (Same fix shape as the earlier buffer_info_callback fix.) Also replace filter_map with map.compact for Ruby <= 2.6 compatibility, even though Aurora's Ruby (3.3) supports filter_map. --- backends/mpi/btx_mpiinterval_callbacks.cpp | 23 ++++---- backends/mpi/gen_btx_mpi_with_ze_model.rb | 66 +++++++++++++++------- 2 files changed, 59 insertions(+), 30 deletions(-) diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp index 7b00c52bf..38d39caf0 100644 --- a/backends/mpi/btx_mpiinterval_callbacks.cpp +++ b/backends/mpi/btx_mpiinterval_callbacks.cpp @@ -341,6 +341,9 @@ tag_if_gpu(data_t *data, const char *hostname, int64_t vpid, uint64_t vtid, cons data->gpu_aware_calls.insert({hostname, vpid, vtid}); } +// metababel strips qualifiers when generating callback typedefs, so buffer +// parameters declared `const void *` in the matching model become `void *` +// in the generated mpi_*_entry_callback_f typedefs. Match exactly. static void mpi_1buf_entry_callback(void *btx_handle, void *usr_data, int64_t ts, @@ -348,7 +351,7 @@ static void mpi_1buf_entry_callback(void *btx_handle, const char *hostname, int64_t vpid, uint64_t vtid, - const void *buf) { + void *buf) { tag_if_gpu(static_cast(usr_data), hostname, vpid, vtid, buf); } @@ -360,8 +363,8 @@ static void mpi_sendrecv_entry_callback(void *btx_handle, const char *hostname, int64_t vpid, uint64_t vtid, - const void *sendbuf, - const void *recvbuf) { + void *sendbuf, + void *recvbuf) { auto *data = static_cast(usr_data); tag_if_gpu(data, hostname, vpid, vtid, sendbuf); tag_if_gpu(data, hostname, vpid, vtid, recvbuf); @@ -375,8 +378,8 @@ static void mpi_reduce_local_entry_callback(void *btx_handle, const char *hostname, int64_t vpid, uint64_t vtid, - const void *inbuf, - const void *inoutbuf) { + void *inbuf, + void *inoutbuf) { auto *data = static_cast(usr_data); tag_if_gpu(data, hostname, vpid, vtid, inbuf); tag_if_gpu(data, hostname, vpid, vtid, inoutbuf); @@ -390,8 +393,8 @@ static void mpi_rma_fetch_entry_callback(void *btx_handle, const char *hostname, int64_t vpid, uint64_t vtid, - const void *origin_addr, - const void *result_addr) { + void *origin_addr, + void *result_addr) { auto *data = static_cast(usr_data); tag_if_gpu(data, hostname, vpid, vtid, origin_addr); tag_if_gpu(data, hostname, vpid, vtid, result_addr); @@ -405,9 +408,9 @@ static void mpi_cas_entry_callback(void *btx_handle, const char *hostname, int64_t vpid, uint64_t vtid, - const void *origin_addr, - const void *compare_addr, - const void *result_addr) { + void *origin_addr, + void *compare_addr, + void *result_addr) { auto *data = static_cast(usr_data); tag_if_gpu(data, hostname, vpid, vtid, origin_addr); tag_if_gpu(data, hostname, vpid, vtid, compare_addr); diff --git a/backends/mpi/gen_btx_mpi_with_ze_model.rb b/backends/mpi/gen_btx_mpi_with_ze_model.rb index 1365a593c..e68530f0b 100644 --- a/backends/mpi/gen_btx_mpi_with_ze_model.rb +++ b/backends/mpi/gen_btx_mpi_with_ze_model.rb @@ -3,16 +3,26 @@ # # btx_filter_mpi needs to receive Level Zero allocation/free events in # addition to MPI events so that btx_mpiinterval_callbacks.cpp can -# maintain an address-range map and tag GPU-aware MPI calls. metababel's -# -u union of two backend models fails because both declare -# :environment: :entries: [{name: hostname}], which the matching -# engine treats as an ambiguity. +# maintain an address-range map and tag GPU-aware MPI calls. Two +# constraints conspire to make this non-trivial: # -# This generator side-steps the issue by emitting a single model file -# that: -# - starts from btx_mpi_model.yaml, -# - copies only the ze allocation/free event classes we actually -# need into the same stream class. +# 1. metababel's bare `-u model_a,model_b` union fails because each +# backend model declares :environment: :entries:[{name: hostname}], +# causing the matching engine to see an ambiguous member. +# +# 2. The MPI bt2 plugin (libMPIInterval) is built with only mpi.h on +# the include path; pulling in the full ze event payloads would +# drag in types (ze_context_handle_t, ze_result_t, +# ze_device_mem_alloc_desc_t *, ...) that the MPI consumer cannot +# resolve. +# +# Side-step both by emitting a single merged model that: +# - starts from btx_mpi_model.yaml (single :environment:), +# - copies only the ze allocation/free event classes we need into +# MPI's stream class, +# - keeps only the payload fields the MPI consumer actually uses, +# so all C types referenced in the generated upstream typedefs +# remain primitive (size_t / void *) and need no ze headers. # # Usage: ruby gen_btx_mpi_with_ze_model.rb MPI_MODEL.yaml ZE_MODEL.yaml > OUT.yaml require 'yaml' @@ -21,22 +31,38 @@ ze_path = ARGV[1] raise 'usage: gen_btx_mpi_with_ze_model.rb MPI_MODEL ZE_MODEL' unless mpi_path && ze_path -ZE_WANTED_EVENT_NAMES = %w[ - lttng_ust_ze:zeMemAllocHost_entry - lttng_ust_ze:zeMemAllocHost_exit - lttng_ust_ze:zeMemAllocDevice_entry - lttng_ust_ze:zeMemAllocDevice_exit - lttng_ust_ze:zeMemAllocShared_entry - lttng_ust_ze:zeMemAllocShared_exit - lttng_ust_ze:zeMemFree_entry - lttng_ust_ze:zeMemFree_exit -].freeze +# (event name) -> (allow-list of payload field names the MPI consumer +# needs to see). Fields not in the allow-list are dropped from the +# copied event, keeping the generated upstream typedefs free of any +# ze-specific types. +ZE_EVENT_FIELDS = { + 'lttng_ust_ze:zeMemAllocHost_entry' => %w[size], + 'lttng_ust_ze:zeMemAllocHost_exit' => %w[pptr_val], + 'lttng_ust_ze:zeMemAllocDevice_entry' => %w[size], + 'lttng_ust_ze:zeMemAllocDevice_exit' => %w[pptr_val], + 'lttng_ust_ze:zeMemAllocShared_entry' => %w[size], + 'lttng_ust_ze:zeMemAllocShared_exit' => %w[pptr_val], + 'lttng_ust_ze:zeMemFree_entry' => %w[ptr], + 'lttng_ust_ze:zeMemFree_exit' => %w[], +}.freeze mpi_model = YAML.load_file(mpi_path) ze_model = YAML.load_file(ze_path) ze_events = ze_model.fetch(:stream_classes, []).flat_map do |sc| - sc.fetch(:event_classes, []).select { |ec| ZE_WANTED_EVENT_NAMES.include?(ec[:name]) } + sc.fetch(:event_classes, []).map do |ec| + wanted = ZE_EVENT_FIELDS[ec[:name]] + next nil unless wanted + + # Deep-copy via Marshal so we can mutate freely without aliasing the + # original ze model. + ec = Marshal.load(Marshal.dump(ec)) + if (pfc = ec[:payload_field_class]) && (members = pfc[:members]) + pfc[:members] = members.select { |m| wanted.include?(m[:name].to_s) } + ec.delete(:payload_field_class) if pfc[:members].empty? + end + ec + end.compact end if ze_events.empty?