Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions backends/mpi/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,23 @@ EXTRA_DIST += \
$(top_srcdir)/xprof/btx_interval_model.yaml \
btx_mpimatching_model.yaml

$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_model.yaml
$(METABABEL) -u btx_mpi_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include
# The MPI interval filter needs to receive Level Zero allocation/free
# events in addition to MPI events so that btx_mpiinterval_callbacks.cpp
# can tag GPU-aware MPI calls via address-range lookup. metababel's bare
# `-u model_a,model_b` union fails when both backend models declare the
# same environment field (hostname), so we generate a merged upstream
# model that copies only the ze events we need into MPI's stream class.
# This adds a build-order dependency mpi -> ze.
BTX_ZE_MODEL = $(top_builddir)/backends/ze/btx_ze_model.yaml

btx_mpi_with_ze_model.yaml: $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL)
$(RUBY) $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL) > $@

CLEANFILES += btx_mpi_with_ze_model.yaml
EXTRA_DIST += gen_btx_mpi_with_ze_model.rb

$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_with_ze_model.yaml
$(METABABEL) -u btx_mpi_with_ze_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include

CLEANFILES += \
$(BTX_MPI_GENERATED) \
Expand Down
235 changes: 233 additions & 2 deletions backends/mpi/btx_mpiinterval_callbacks.cpp
Original file line number Diff line number Diff line change
@@ -1,17 +1,69 @@
#include "xprof_utils.hpp"
#include <iostream>
#include <map>
#include <metababel/metababel.h>
#include <mpi.h.include>
#include <sstream>
#include <string>
#include <tuple>
#include <unordered_set>

// ---------------------------------------------------------------------------
// GPU-aware MPI tagging
//
// We maintain a per-process address-range map populated from
// lttng_ust_ze:zeMemAlloc*_exit / zeMemFree_entry events emitted by the ze
// backend (joined into our upstream model via backends/mpi/Makefile.am).
// Each MPI buffer-bearing entry event looks up its buffer pointer in that
// map; if the pointer falls inside a known GPU allocation, the
// {hostname,vpid,vtid} key is recorded in gpu_aware_calls. The matching
// exit callback consumes the key and prepends '*' to the MPI call name
// before pushing it to the aggregator, producing a separate '*MPI_Foo'
// row in the tally.
//
// When the user enables only --backend mpi, no ze events flow in, the
// range-map stays empty, and no '*' ever appears (CPU-only profile).
// ---------------------------------------------------------------------------

// Per-process address-range map: base -> (base + size).
typedef std::map<uintptr_t, uintptr_t> memory_interval_t;

struct data_s {
EntryState entry_state;
// GPU allocation ranges, keyed on {hostname, vpid}. Mirrors the
// rangeset_memory_{host,device,shared} pattern in
// backends/ze/btx_zeinterval_callbacks.cpp.
std::unordered_map<hp_t, memory_interval_t> rangeset_memory_device;
std::unordered_map<hp_t, memory_interval_t> rangeset_memory_shared;
// {hostname, vpid, vtid} of MPI calls observed to carry at least one
// device/shared GPU buffer. Populated in mpi_*_buf_entry_callback,
// consumed (and cleared) in send_host_message.
std::unordered_set<hpt_t> gpu_aware_calls;
};

typedef struct data_s data_t;

// True iff `ptr` lies inside any allocation tracked in `m`.
static bool memory_includes(const memory_interval_t &m, uintptr_t ptr) {
const auto it = m.upper_bound(ptr);
if (it == m.cbegin())
return false;
return ptr < std::prev(it)->second;
}

// True iff `ptr` lies inside a tracked GPU (device or shared) allocation
// for the given {hostname, vpid}. Host USM is intentionally NOT tagged
// as GPU-aware here (it behaves like host memory from MPI's perspective).
static bool is_gpu_pointer(const data_t *data, const hp_t &hp, uintptr_t ptr) {
auto itd = data->rangeset_memory_device.find(hp);
if (itd != data->rangeset_memory_device.end() && memory_includes(itd->second, ptr))
return true;
auto its = data->rangeset_memory_shared.find(hp);
if (its != data->rangeset_memory_shared.end() && memory_includes(its->second, ptr))
return true;
return false;
}

static void send_host_message(void *btx_handle,
void *usr_data,
int64_t ts,
Expand All @@ -21,9 +73,16 @@ static void send_host_message(void *btx_handle,
uint64_t vtid,
bool err) {

auto *data = static_cast<data_t *>(usr_data);
const hpt_t hpt{hostname, vpid, vtid};

std::string event_class_name_striped = strip_event_class_name_exit(event_class_name);
const int64_t entry_ts =
static_cast<data_t *>(usr_data)->entry_state.get_ts({hostname, vpid, vtid});
// GPU-aware MPI calls get a leading '*' so they sort and aggregate as a
// distinct row from their CPU-only counterparts in the tally output.
if (data->gpu_aware_calls.erase(hpt) > 0)
event_class_name_striped.insert(0, 1, '*');

const int64_t entry_ts = data->entry_state.get_ts(hpt);

btx_push_message_lttng_host(btx_handle, hostname, vpid, vtid, entry_ts, BACKEND_MPI,
event_class_name_striped.c_str(), (ts - entry_ts), err);
Expand Down Expand Up @@ -198,6 +257,166 @@ static void traffic_int_entry_callback(void *btx_handle,
count * size, oss.str().c_str());
}

// --- ze allocation / free tracking ----------------------------------------
// Bind allocation size at entry; the matching exit callback uses it together
// with the freshly-returned pointer to populate the range-map. The
// event_class_name suffix (Device vs Shared vs Host) selects which rangeset
// the allocation lands in.
static void ze_alloc_entry_callback(void *btx_handle,
void *usr_data,
int64_t ts,
const char *event_class_name,
const char *hostname,
int64_t vpid,
uint64_t vtid,
size_t size) {
static_cast<data_t *>(usr_data)->entry_state.set_data({hostname, vpid, vtid}, size);
}

static void ze_alloc_exit_callback(void *btx_handle,
void *usr_data,
int64_t ts,
const char *event_class_name,
const char *hostname,
int64_t vpid,
uint64_t vtid,
void *pptr_val) {
auto *data = static_cast<data_t *>(usr_data);
auto size = data->entry_state.get_data<size_t>({hostname, vpid, vtid});
// We don't see zeResult in the matching model to avoid pulling in
// ze_api.h; a failed allocation has pptr_val == NULL.
if (pptr_val == nullptr)
return;

const std::string ev{event_class_name};
// Host USM is reachable from the host CPU so MPI sees it as a normal host
// pointer; classifying it as GPU-aware would produce false positives.
// Device and Shared (managed) USM live (at least partly) on the device,
// so MPI calls on them are GPU-aware.
std::unordered_map<hp_t, memory_interval_t> *mi = nullptr;
if (ev.find("zeMemAllocDevice") != std::string::npos)
mi = &data->rangeset_memory_device;
else if (ev.find("zeMemAllocShared") != std::string::npos)
mi = &data->rangeset_memory_shared;
else
return; // host USM, ignored

const uintptr_t base = reinterpret_cast<uintptr_t>(pptr_val);
(*mi)[{hostname, vpid}][base] = base + size;
}

static void ze_free_entry_callback(void *btx_handle,
void *usr_data,
int64_t ts,
const char *event_class_name,
const char *hostname,
int64_t vpid,
uint64_t vtid,
void *ptr) {
auto *data = static_cast<data_t *>(usr_data);
if (ptr == nullptr)
return;
const uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
const hp_t hp{hostname, vpid};
// The freed pointer may have been classified as device OR shared at
// allocation; try both.
auto itd = data->rangeset_memory_device.find(hp);
if (itd != data->rangeset_memory_device.end())
itd->second.erase(base);
auto its = data->rangeset_memory_shared.find(hp);
if (its != data->rangeset_memory_shared.end())
its->second.erase(base);
}

// --- MPI buffer classification --------------------------------------------
static void
tag_if_gpu(data_t *data, const char *hostname, int64_t vpid, uint64_t vtid, const void *ptr) {
if (ptr == nullptr)
return;
// MPICH sentinels: MPI_IN_PLACE / MPI_BOTTOM. These would never match a
// real allocation but checking explicitly keeps the intent clear.
if (ptr == MPI_IN_PLACE || ptr == MPI_BOTTOM)
return;
if (is_gpu_pointer(data, {hostname, vpid}, reinterpret_cast<uintptr_t>(ptr)))
data->gpu_aware_calls.insert({hostname, vpid, vtid});
}

// metababel strips qualifiers when generating callback typedefs, so buffer
// parameters declared `const void *` in the matching model become `void *`
// in the generated mpi_*_entry_callback_f typedefs. Match exactly.
static void mpi_1buf_entry_callback(void *btx_handle,
void *usr_data,
int64_t ts,
const char *event_class_name,
const char *hostname,
int64_t vpid,
uint64_t vtid,
void *buf) {
tag_if_gpu(static_cast<data_t *>(usr_data), hostname, vpid, vtid, buf);
}

// MPI_Sendrecv, MPI_Allreduce, MPI_Reduce, MPI_Gather, ... — the bulk.
static void mpi_sendrecv_entry_callback(void *btx_handle,
void *usr_data,
int64_t ts,
const char *event_class_name,
const char *hostname,
int64_t vpid,
uint64_t vtid,
void *sendbuf,
void *recvbuf) {
auto *data = static_cast<data_t *>(usr_data);
tag_if_gpu(data, hostname, vpid, vtid, sendbuf);
tag_if_gpu(data, hostname, vpid, vtid, recvbuf);
}

// MPI_Reduce_local(inbuf, inoutbuf, ...)
static void mpi_reduce_local_entry_callback(void *btx_handle,
void *usr_data,
int64_t ts,
const char *event_class_name,
const char *hostname,
int64_t vpid,
uint64_t vtid,
void *inbuf,
void *inoutbuf) {
auto *data = static_cast<data_t *>(usr_data);
tag_if_gpu(data, hostname, vpid, vtid, inbuf);
tag_if_gpu(data, hostname, vpid, vtid, inoutbuf);
}

// MPI_Get_accumulate, MPI_Fetch_and_op (origin_addr + result_addr).
static void mpi_rma_fetch_entry_callback(void *btx_handle,
void *usr_data,
int64_t ts,
const char *event_class_name,
const char *hostname,
int64_t vpid,
uint64_t vtid,
void *origin_addr,
void *result_addr) {
auto *data = static_cast<data_t *>(usr_data);
tag_if_gpu(data, hostname, vpid, vtid, origin_addr);
tag_if_gpu(data, hostname, vpid, vtid, result_addr);
}

// MPI_Compare_and_swap(origin_addr, compare_addr, result_addr, ...)
static void mpi_cas_entry_callback(void *btx_handle,
void *usr_data,
int64_t ts,
const char *event_class_name,
const char *hostname,
int64_t vpid,
uint64_t vtid,
void *origin_addr,
void *compare_addr,
void *result_addr) {
auto *data = static_cast<data_t *>(usr_data);
tag_if_gpu(data, hostname, vpid, vtid, origin_addr);
tag_if_gpu(data, hostname, vpid, vtid, compare_addr);
tag_if_gpu(data, hostname, vpid, vtid, result_addr);
}

void btx_register_usr_callbacks(void *btx_handle) {
btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component);
btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component);
Expand All @@ -209,4 +428,16 @@ void btx_register_usr_callbacks(void *btx_handle) {
btx_register_callbacks_traffic_int_entry(btx_handle, &traffic_int_entry_callback);

btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback);

// GPU-aware MPI: ze alloc/free updates the range-map, MPI buffer entries
// look up against it. One callback per MPI buffer-shape (see the matching
// model for why we cannot lump them into a single regex).
btx_register_callbacks_ze_alloc_entry(btx_handle, &ze_alloc_entry_callback);
btx_register_callbacks_ze_alloc_exit(btx_handle, &ze_alloc_exit_callback);
btx_register_callbacks_ze_free_entry(btx_handle, &ze_free_entry_callback);
btx_register_callbacks_mpi_1buf_entry(btx_handle, &mpi_1buf_entry_callback);
btx_register_callbacks_mpi_sendrecv_entry(btx_handle, &mpi_sendrecv_entry_callback);
btx_register_callbacks_mpi_reduce_local_entry(btx_handle, &mpi_reduce_local_entry_callback);
btx_register_callbacks_mpi_rma_fetch_entry(btx_handle, &mpi_rma_fetch_entry_callback);
btx_register_callbacks_mpi_cas_entry(btx_handle, &mpi_cas_entry_callback);
}
Loading
Loading