From 77854f0ed414232077f1c74f0c50522f404a18bb Mon Sep 17 00:00:00 2001 From: "Paul H. Hargrove" Date: Tue, 2 Mar 2021 20:47:57 -0800 Subject: [PATCH] ibv: NPAM Long via RDMA This commit implements an RDMA protocol for GASNet-allocated NPAM Long, where previously only a "packed" protocol was available. The new environment variable `GASNET_PACKEDLONG_ALLOC_LIMIT` is added to control the protocol switch point (default is to match the value of `GASNET_PACKEDLONG_LIMIT`). Relative to the packed protocol, this saves the `memcpy()` at the target at the expense of adding a second ibv-level message injection at the initiator. --- ibv-conduit/README | 7 ++++- ibv-conduit/gasnet_core.c | 32 ++++++++++++++++++++-- ibv-conduit/gasnet_core_fwd.h | 1 + ibv-conduit/gasnet_core_internal.h | 6 ++++ ibv-conduit/gasnet_core_sndrcv.c | 44 ++++++++++++++++++++++++++++++ 5 files changed, 87 insertions(+), 3 deletions(-) diff --git a/ibv-conduit/README b/ibv-conduit/README index bd71346ed..2ce9d0e9e 100644 --- a/ibv-conduit/README +++ b/ibv-conduit/README @@ -803,7 +803,12 @@ Paul H. Hargrove The default value is the maximum that, together with the maximum sized header, fits into a 4KiB transfer (currently 4012). A value of zero ensures the payload and header always travel separately. - + + + GASNET_PACKEDLONG_ALLOC_LIMIT + As for GASNET_PACKEDLONG_LIMIT, above, but for the case of Negotiated- + Payload AMLong with GASNet-allocated buffer. + The default is to take on the value of GASNET_PACKEDLONG_LIMIT. + + GASNET_NONBULKPUT_BOUNCE_LIMIT This parameter sets the limit on the use of bounce buffers to achieve local completion of "non-bulk" PUT and AMLong payload transfers. 
When diff --git a/ibv-conduit/gasnet_core.c b/ibv-conduit/gasnet_core.c index e91c97c85..a03dae808 100644 --- a/ibv-conduit/gasnet_core.c +++ b/ibv-conduit/gasnet_core.c @@ -943,6 +943,7 @@ static int gasnetc_load_settings(void) { GASNETC_ENVINT(gasnetc_inline_limit, GASNET_INLINESEND_LIMIT, GASNETC_DEFAULT_INLINESEND_LIMIT, -1, 0); GASNETC_ENVINT(gasnetc_nonbulk_bounce_limit, GASNET_NONBULKPUT_BOUNCE_LIMIT, GASNETC_DEFAULT_NONBULKPUT_BOUNCE_LIMIT, 0, 1); GASNETC_ENVINT(gasnetc_packedlong_limit, GASNET_PACKEDLONG_LIMIT, GASNETC_DEFAULT_PACKEDLONG_LIMIT, 0, 1); + GASNETC_ENVINT(gasnetc_packedlong_alloc_limit, GASNET_PACKEDLONG_ALLOC_LIMIT, gasnetc_packedlong_limit, 0, 1); GASNETC_ENVINT(gasnetc_am_gather_min, GASNET_AM_GATHER_MIN, GASNETC_DEFAULT_AM_GATHER_MIN, -1, 1); if (gasnetc_am_gather_min == -1) { // -1 is the documented value to disable this optimization @@ -1067,6 +1068,12 @@ static int gasnetc_load_settings(void) { (unsigned int)gasnetc_packedlong_limit, (unsigned int)GASNETC_MAX_PACKEDLONG); gasnetc_packedlong_limit = GASNETC_MAX_PACKEDLONG; } + if_pf (gasnetc_packedlong_alloc_limit > GASNETC_MAX_PACKEDLONG_(0)) { + fprintf(stderr, + "WARNING: GASNET_PACKEDLONG_ALLOC_LIMIT reduced from %u to %u\n", + (unsigned int)gasnetc_packedlong_alloc_limit, (unsigned int)GASNETC_MAX_PACKEDLONG_(0)); + gasnetc_packedlong_alloc_limit = GASNETC_MAX_PACKEDLONG_(0); + } #if GASNETC_DYNAMIC_CONNECT gasnetc_conn_static = gasneti_getenv_yesno_withdefault("GASNET_CONNECT_STATIC", 1); @@ -5175,8 +5182,29 @@ int gasnetc_commit_common( gasneti_assert(!lc_opt); local_cb = NULL; local_cnt = NULL; - // TODO: RDMA of Long payload can be beneficial - copy_len = nbytes; + switch (category) { + #if GASNET_NATIVE_NP_ALLOC_REQ_MEDIUM || GASNET_NATIVE_NP_ALLOC_REP_MEDIUM + case gasneti_Medium: + copy_len = nbytes; + break; + #endif + + #if GASNET_NATIVE_NP_ALLOC_REQ_LONG || GASNET_NATIVE_NP_ALLOC_REP_LONG + case gasneti_Long: + if ((nbytes <= gasnetc_packedlong_alloc_limit) || 
!sd->_buf_alloc || (!GASNETC_PIN_SEGMENT && is_reply)) { + // Small enough to send like a Medium OR not in a bounce buffer (forced for firehose Reply) + copy_len = nbytes; + } else { + // Inject RMA + int rc = gasnetc_rdma_npam_long_put(sd->_ep, sd->_cep, sd->_addr, dest_addr, nbytes, + /*imm*/0 GASNETI_THREAD_PASS); + gasneti_assert(!rc); // Never fails, since never "immediate" + } + break; + #endif + + default: gasneti_unreachable_error(("Invalid AM category: 0x%x",(int)category)); + } } int rc = gasnetc_am_commit( diff --git a/ibv-conduit/gasnet_core_fwd.h b/ibv-conduit/gasnet_core_fwd.h index 6bfa07fbe..5f6b0a2e7 100644 --- a/ibv-conduit/gasnet_core_fwd.h +++ b/ibv-conduit/gasnet_core_fwd.h @@ -256,6 +256,7 @@ VAL(C, RDMA_PUT_BOUNCE, bytes) \ VAL(C, RDMA_PUT_ZEROCP, bytes) \ VAL(C, RDMA_PUT_READONLY, bytes) \ + VAL(C, RDMA_PUT_BUFFERED, bytes) \ VAL(C, RDMA_GET_BOUNCE, bytes) \ VAL(C, RDMA_GET_ZEROCP, bytes) \ CNT(C, ALLOC_AM_SPARE, cnt) \ diff --git a/ibv-conduit/gasnet_core_internal.h b/ibv-conduit/gasnet_core_internal.h index 9d8d90fa8..a1591deef 100644 --- a/ibv-conduit/gasnet_core_internal.h +++ b/ibv-conduit/gasnet_core_internal.h @@ -796,6 +796,7 @@ typedef enum { // Long payload puts do NOT need fencing (see bug 4049) GASNETC_OP_LONG_ZEROCP, GASNETC_OP_LONG_BOUNCE, + GASNETC_OP_LONG_BUFFERED, // Following all have GASNETC_OP_NEEDS_FENCE bit set GASNETC_OP_PUT_INLINE = GASNETC_OP_NEEDS_FENCE, GASNETC_OP_PUT_ZEROCP, @@ -1039,6 +1040,10 @@ extern int gasnetc_rdma_long_put( void *src_ptr, void *dst_ptr, size_t nbytes, gex_Flags_t flags, gasnetc_atomic_val_t *local_cnt, gasnetc_cb_t local_cb GASNETI_THREAD_FARG); +extern int gasnetc_rdma_npam_long_put( + gasnetc_EP_t ep, gasnetc_cep_t *cep, + void *src_ptr, void *dst_ptr, size_t nbytes, gex_Flags_t flags + GASNETI_THREAD_FARG); extern int gasnetc_rdma_get( gex_TM_t tm, gex_Rank_t rank, void *src_ptr, void *dst_ptr, size_t nbytes, gex_Flags_t flags, @@ -1156,6 +1161,7 @@ extern int gasnetc_am_credits_slack; 
extern int gasnetc_alloc_qps; /* Number of QPs per node in gasnetc_ceps[] */ extern int gasnetc_num_qps; /* How many QPs to use per peer */ extern size_t gasnetc_packedlong_limit; +extern size_t gasnetc_packedlong_alloc_limit; extern size_t gasnetc_inline_limit; extern size_t gasnetc_nonbulk_bounce_limit; #if !GASNETC_PIN_SEGMENT diff --git a/ibv-conduit/gasnet_core_sndrcv.c b/ibv-conduit/gasnet_core_sndrcv.c index 51a782199..dec889d12 100644 --- a/ibv-conduit/gasnet_core_sndrcv.c +++ b/ibv-conduit/gasnet_core_sndrcv.c @@ -38,6 +38,7 @@ size_t gasnetc_fh_align_mask; size_t gasnetc_inline_limit; size_t gasnetc_nonbulk_bounce_limit; size_t gasnetc_packedlong_limit; // TODO-EX: adjust w/ nargs? +size_t gasnetc_packedlong_alloc_limit; // TODO-EX: adjust w/ nargs? size_t gasnetc_put_stripe_sz, gasnetc_put_stripe_split; size_t gasnetc_get_stripe_sz, gasnetc_get_stripe_split; #if !GASNETC_PIN_SEGMENT @@ -1008,6 +1009,12 @@ void gasnetc_snd_reap_one(struct ibv_wc *comp_p, gasnetc_hca_t *hca GASNETC_COLL #endif break; + #if GASNET_NATIVE_NP_ALLOC_REQ_LONG || GASNET_NATIVE_NP_ALLOC_REP_LONG + case GASNETC_OP_LONG_BUFFERED: // Zero-copy Long payload with source in the header buffer + gasneti_assert(sreq->comp.cb == NULL); + break; + #endif + case GASNETC_OP_PUT_ZEROCP: // Zero-copy PUT case GASNETC_OP_LONG_ZEROCP: // Zero-copy Long payload if (sreq->comp.cb != NULL) { @@ -3854,6 +3861,43 @@ extern int gasnetc_rdma_long_put( return 0; } +#if GASNET_NATIVE_NP_ALLOC_REQ_LONG || GASNET_NATIVE_NP_ALLOC_REP_LONG +// Put specialized for needs of NPAM Long payload +// * caller needs to control the qpi (via cep) +// * never has local callbacks +// * never has remote callbacks +// * source lies within the buffer containing the AM header +// * assumed never small enough for inline send (would be packed instead) +extern int gasnetc_rdma_npam_long_put( + gasnetc_EP_t ep, gasnetc_cep_t *cep, + void *src_ptr, void *dst_ptr, + size_t nbytes, + gex_Flags_t flags + GASNETI_THREAD_FARG) +{ + 
gasnetc_epid_t epid = cep->epid; + GASNETC_DECL_SR_DESC(sr_desc, GASNETC_SND_SG); + gasnetc_sreq_t * const sreq = gasnetc_get_sreq(GASNETC_OP_LONG_BUFFERED GASNETI_THREAD_PASS); + + gasneti_assert(nbytes != 0); + + // TODO-EX: + // All uses of rem_auxseg are a temporary hack + // This will be replaced by general multi-registration support later + const int rem_auxseg = gasneti_in_auxsegment(gasnetc_epid2node(epid), dst_ptr, nbytes); + + sr_desc->wr.rdma.remote_addr = (uintptr_t)dst_ptr; + sr_desc_sg_lst[0].addr = (uintptr_t)src_ptr; + sreq->bb_buff = src_ptr; + + GASNETI_TRACE_EVENT_VAL(C, RDMA_PUT_BUFFERED, nbytes); + + gasnetc_bounce_common(ep, epid, rem_auxseg, sr_desc, nbytes, sreq, IBV_WR_RDMA_WRITE GASNETI_THREAD_PASS); + + return 0; +} +#endif + /* Perform an RDMA get * * Uses zero-copy (with firehose if the destination is not pre-pinned).