From 5ec3a4e8eae50017aa4d8c90f16ae8279d95431d Mon Sep 17 00:00:00 2001
From: "Paul H. Hargrove"
Date: Sat, 19 Feb 2022 13:06:50 -0800
Subject: [PATCH] ucx: check memory kinds support at MK_Create time

This commit resolves the "detect" aspect of Bug 4382:
"Detect/document ucx-conduit native kinds when UCX build lacks GDR support".

As agreed in the bug report, `gex_MK_Create` will now return a non-fatal
`GASNET_ERR_BAD_ARG`. A well-written client can potentially recover by
falling back to use of some non-kinds comms. In the future, however,
GASNet-EX would ideally fall back to a reference implementation which
would "do the right thing" transparently.

Example output from process 0 in a run of `testcudauva` on a system with
CUDA GPUs, but lacking CUDA support in the UCX library:
```
*** WARNING (proc 0): GASNet gasnetc_mk_create_hook returning an error code: GASNET_ERR_BAD_ARG (Invalid function parameter passed)
  at /[REDACTED]/gasnet/ucx-conduit/gasnet_kinds.c:93
  reason: Requested device memory type is not supported in the UCX library
ERROR calling: gex_MK_Create(&kind, myclient, &args, 0)
 at: /[REDACTED]/gasnet/tests/testcudauva.c:203
 error: GASNET_ERR_BAD_ARG (Invalid function parameter passed)
```
---
 ucx-conduit/Makefile.am       |  2 +
 ucx-conduit/gasnet_core_fwd.h |  2 +-
 ucx-conduit/gasnet_kinds.c    | 97 +++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 ucx-conduit/gasnet_kinds.c

diff --git a/ucx-conduit/Makefile.am b/ucx-conduit/Makefile.am
index 19e9fcc7a..4c5ece2dc 100644
--- a/ucx-conduit/Makefile.am
+++ b/ucx-conduit/Makefile.am
@@ -59,6 +59,7 @@ CONDUIT_FILELIST = \
       gasnet_ucx_req.h \
       gasnet_extended.c \
       gasnet_extended_fwd.h \
+      gasnet_kinds.c \
       gasnet_ratomic.c \
       gasnet_ratomic_fwd.h
 
@@ -68,6 +69,7 @@ CONDUIT_SOURCELIST = \
       $(srcdir)/gasnet_core.c \
       $(srcdir)/gasnet_core_sndrcv.c \
       $(srcdir)/gasnet_extended.c \
+      $(srcdir)/gasnet_kinds.c \
       $(srcdir)/gasnet_ratomic.c \
       $(ssh_sources) $(pmi_sources)
 
diff --git 
a/ucx-conduit/gasnet_core_fwd.h b/ucx-conduit/gasnet_core_fwd.h
index c3cdf2853..2c49738f5 100644
--- a/ucx-conduit/gasnet_core_fwd.h
+++ b/ucx-conduit/gasnet_core_fwd.h
@@ -182,7 +182,7 @@
 
 // Uncomment the following defines if conduit provides the corresponding hook.
 // See other/kinds/gasnet_kinds_internal.h for prototypes and brief descriptions.
-//#define GASNETC_MK_CREATE_HOOK 1
+#define GASNETC_MK_CREATE_HOOK 1
 //#define GASNETC_MK_DESTROY_HOOK 1
 
 // If conduit supports GASNET_MAXEPS!=1, set default and (optional) max values here.
diff --git a/ucx-conduit/gasnet_kinds.c b/ucx-conduit/gasnet_kinds.c
new file mode 100644
index 000000000..d2c83acd7
--- /dev/null
+++ b/ucx-conduit/gasnet_kinds.c
@@ -0,0 +1,97 @@
+/*   $Source: bitbucket.org:berkeleylab/gasnet.git/ucx-conduit/gasnet_kinds.c $
+ * Description: GASNet Memory Kinds implementation
+ * Copyright 2022, The Regents of the University of California
+ * Terms of use are as specified in license.txt
+ */
+
+#define GASNETI_NEED_GASNET_MK_H 1
+#include <gasnet_internal.h>
+#include <gasnet_kinds_internal.h>
+
+#include <uct/api/uct.h>
+
+// Search the names UCT reports (MD resources if API < 1.7, else components) for `tr`.
+// Returns 1 if found, 0 if not found, or -1 if an error prevented the search.
+// With UCT >= 1.7 a matching component must also report a non-zero MD resource count.
+static
+int check_transport(const char *tr) {
+  int found = 0;
+  ucs_status_t st;
+
+#if UCT_API < UCT_VERSION(1,7)
+  uct_md_resource_desc_t *mds = NULL;
+  unsigned int num_md;
+  st = uct_query_md_resources(&mds, &num_md);
+  if (st) return -1;  // nothing was acquired, so there is nothing to release
+  for (unsigned int i = 0; (i < num_md) && !found; ++i) {
+    found = !strcmp(tr, mds[i].md_name);
+  }
+  uct_release_md_resource_list(mds);
+#else
+  uct_component_h *comps;
+  unsigned int num_comp;
+  st = uct_query_components(&comps, &num_comp);
+  if (st) return -1;  // nothing was acquired, so there is nothing to release
+  for (unsigned int i = 0; (i < num_comp) && !found; ++i) {
+    uct_component_attr_t attr;
+    attr.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME |
+                      UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT;
+    st = uct_component_query(comps[i], &attr);
+    if (st) { found = -1; break; }  // break (not return) so the list is still released
+    found = !strcmp(tr, attr.name) && attr.md_resource_count;
+  }
+  uct_release_component_list(comps);
+#endif
+
+  return found;
+}
+
+int gasnetc_mk_create_hook(
+            gasneti_MK_t                     kind,
+            gasneti_Client_t                 client,
+            const gex_MK_Create_args_t       *args,
+            gex_Flags_t                      flags)
+{
+  // Verify that the UCX library has support for the requested device class.
+  // Returns GASNET_OK if so, otherwise a (non-fatal) BAD_ARG error return.
+  // We probe "[foo]_ipc" because these names have remained stable across
+  // versions, while both "cuda_copy" and "cuda_cpy" have been used at times.
+  // However, "[foo]_copy" and "[foo]_ipc" are inseparable in the UCX build.
+
+  int found = 0;
+  switch (args->gex_class) {
+  #if GASNETI_MK_CLASS_CUDA_UVA_ENABLED
+    case GEX_MK_CLASS_CUDA_UVA:
+      found = check_transport("cuda_ipc");
+      break;
+  #endif
+
+  #if GASNETI_MK_CLASS_HIP_ENABLED
+    case GEX_MK_CLASS_HIP:
+    #if GASNETI_HIP_PLATFORM_AMD
+      found = check_transport("rocm_ipc");
+    #elif GASNETI_HIP_PLATFORM_NVIDIA
+      found = check_transport("cuda_ipc");
+    #else
+      #error Unknown HIP platform
+    #endif
+      break;
+  #endif
+
+  #if GASNETI_MK_CLASS_ZE_ENABLED
+    case GEX_MK_CLASS_ZE:
+      found = check_transport("ze_ipc");
+      break;
+  #endif
+
+    default:
+      gasneti_unreachable_error(("Unknown memory kind '%s'", kind->_mk_impl->mk_name));
+      break;
+  }
+
+  if (found != 1) {  // covers both "not found" (0) and "search failed" (-1)
+    GASNETI_RETURN_ERRR(BAD_ARG,"Requested device memory type is not supported in the UCX library");
+  }
+
+  return GASNET_OK;
+}