From 246ead0615f4155eca1fd6c203b97b405992ff1e Mon Sep 17 00:00:00 2001 From: alexandertimofeyev <106101886+alexandertimofeyev@users.noreply.github.com> Date: Fri, 29 May 2026 13:42:06 -0700 Subject: [PATCH] DAOS-18972 cart: configurable address family for fabric init (#18254) By default, CaRT initializes Mercury with na_init_info.addr_format set to NA_ADDR_UNSPEC, which Mercury's na_ofi plugin maps to its per-provider preference table. For verbs/RoCE this resolves to FI_SOCKADDR_IN (IPv4), which causes libfabric's fabric scan and QP-attach to prefer IPv4 even when the operator has explicitly configured the fabric NIC for IPv6. Add a per-provider address-family hint, configurable two ways: - crt_init_options_t::cio_addr_format (API field) - D_ADDR_FORMAT environment variable mirroring the existing pattern used by cio_provider / D_PROVIDER, cio_interface / D_INTERFACE, cio_domain / D_DOMAIN, cio_port / D_PORT, and cio_auth_key / D_PROVIDER_AUTH_KEY. Accepted values are "unspec" (default), "ipv4", "ipv6", and "native". Unrecognized values fall back silently to "unspec" rather than failing initialization, matching the behavior of crt_str_to_tc() for traffic classes. The value is stored per-provider on struct crt_prov_gdata::cpg_addr_format and forwarded to Mercury via init_info.na_init_info.addr_format in crt_hg_class_init(). For multi-provider configurations the option accepts a comma-separated list, matched one-to-one with the entries of D_PROVIDER (same parsing pattern as the existing fields). Mercury already supports IPv6 end-to-end in the na_ofi plugin: the addr_format mapping, addr_size, raw_addr serialize/deserialize, and addr_to_key paths all handle FI_SOCKADDR_IN6. No Mercury change is required. Implementation notes: * enum crt_addr_format mirrors enum na_addr_format. Static assertions guard the alignment so crt_hg.c can cast the enum directly when assigning na_init_info.addr_format, matching the existing idiom used for cg_swim_tc -> enum na_traffic_class. 
* Default behavior is preserved: omitting D_ADDR_FORMAT leaves the
  per-provider gdata at CRT_AF_UNSPEC, which casts to NA_ADDR_UNSPEC,
  yielding the previous IPv4-preferring fabric scan. Existing
  deployments see no functional change.

* This is the CaRT half of IPv6 fabric support. Two companion changes
  are required to round out the full v6 fabric story (each independent
  and separate from this patch):
    - src/control/server/server_utils.go: "tcp4" -> "tcp",
      "0.0.0.0:%d" -> "[::]:%d" for the gRPC management listener;
    - HG_Get_na_protocol_info or equivalent Mercury API to accept an
      addr_format hint so the daos_server fabric scan respects v6 too.

Validation performed before submission (2-node DAOS 2.6.5 cluster,
Mellanox ConnectX-7, RoCEv2, ofi+verbs;ofi_rxm):

  Test                                         Result
  ----                                         ------
  Patch builds against runtime libmercury      OK
  Default behaviour (D_ADDR_FORMAT unset)      OK - all ranks Joined, dmg pool create
                                               and storage format succeed unchanged
  Explicit D_ADDR_FORMAT=ipv4                  OK - identical to default
  Unknown value (e.g. D_ADDR_FORMAT=garbagez)  OK - silent fallback to unspec, no
                                               init failure
  D_ADDR_FORMAT=ipv6 hint reaches libfabric    OK - confirmed via 'rdma resource show
                                               cm_id': engines LISTEN on
                                               [2a04:f547:93:3082::20bc]:20000 (IPv6),
                                               not 10.92.32.188:20000 (IPv4)

End-to-end IPv6 RPC between engines was *not* exercisable on this
specific test cluster: libfabric 1.22.0's verbs;ofi_rxm provider
returns ENODATA from fi_getinfo when an IPv6 addr_format hint is
passed, even though the fabric NIC has an IPv6 global configured.
Reproduced independently with 'fi_pingpong -p "verbs;ofi_rxm" -6',
which fails identically without any DAOS code involved. The kernel
RDMA-CM + RoCEv2 v6 path itself is fine (verified via rping). So the
v6 gap is in the libfabric verbs provider, downstream of every layer
this patch touches.
Local unit testing follows the existing convention for crt_str_to_tc() and the other CRT_ENV_OPT_GET-mediated options, which are exercised via the cart ftest suite rather than per-function unit tests. Signed-off-by: Alex Timofeyev --- src/cart/README.env | 19 ++++++++++ src/cart/crt_hg.c | 13 +++++++ src/cart/crt_init.c | 69 ++++++++++++++++++++++++++++++++--- src/cart/crt_internal_types.h | 34 +++++++++++++++++ src/include/cart/types.h | 13 +++++++ 5 files changed, 142 insertions(+), 6 deletions(-) diff --git a/src/cart/README.env b/src/cart/README.env index befc0fd8f4f..f092563637b 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -137,6 +137,25 @@ This file lists the environment variables used in CaRT. D_PROVIDER_AUTH_KEY is assumed to be empty. Supports a comma separated list of keys, similar to D_INTERFACE handling + . D_ADDR_FORMAT + Selects the preferred IP address family for fabric initialization. The + value is forwarded to Mercury via na_init_info.addr_format and used as + a hint by libfabric's fabric scan to enumerate interfaces of the chosen + family. Accepted values (case-sensitive): + - "unspec" (default) - leave it to the plugin; Mercury's na_ofi + plugin falls back to its per-provider + preference (IPv4 for verbs/RoCE). + - "ipv4" - prefer IPv4 (FI_SOCKADDR_IN). + - "ipv6" - prefer IPv6 (FI_SOCKADDR_IN6). Required for + IPv6-only fabric NIC deployments where the + default IPv4 preference would hide the only + usable interfaces. + - "native" - provider native addressing. + Unrecognized values fall back silently to "unspec" rather than failing + initialization. Supports a comma separated list of values, similar to + D_INTERFACE handling; entries are matched one-to-one with the + comma-separated D_PROVIDER list for multi-provider configurations. + . CRT_CREDIT_EP_CTX Set it as the max number of in-flight RPCs to a target endpoint context, the valid range is [0, 256]. 
diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 9ee161dbeb5..f81dc0a3761 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -846,6 +846,19 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ init_info.na_init_info.auth_key = prov_data->cpg_na_config.noc_auth_key; + /* + * Forward the per-provider address-family preference to Mercury. The + * default (CRT_AF_UNSPEC -> NA_ADDR_UNSPEC) preserves the original + * behavior: Mercury's na_ofi plugin falls back to its per-provider + * preference table (IPv4 for verbs/RoCE). Setting D_ADDR_FORMAT=ipv6 + * (or cio_addr_format="ipv6") instead steers libfabric's fabric scan + * to enumerate IPv6 interfaces, which is required for IPv6-only + * fabric NIC deployments. CRT_AF_* values are statically asserted to + * match the corresponding NA_ADDR_* values in crt_init.c, so the + * direct cast is safe. + */ + init_info.na_init_info.addr_format = (enum na_addr_format)prov_data->cpg_addr_format; + if (crt_provider_is_block_mode(provider) && !crt_gdata.cg_progress_busy) init_info.na_init_info.progress_mode = 0; else diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 0ba59243058..5c7b1f06b3c 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -24,6 +24,10 @@ static bool g_prov_settings_applied[CRT_PROV_COUNT]; static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES}; #undef X +#define X(a, b) b, +static const char *const crt_addr_format_name[] = {CRT_ADDR_FORMATS}; +#undef X + #define CRT_ENV_OPT_GET(opt, x, env) \ do { \ if (opt != NULL && opt->cio_##x) \ @@ -35,7 +39,7 @@ static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES}; static int crt_init_prov(crt_provider_t provider, bool primary, struct crt_prov_gdata *prov_gdata, const char *interface, const char *domain, const char *port, const char *auth_key, - bool port_auto_adjust, crt_init_options_t *opt); + const char *addr_format, bool port_auto_adjust, crt_init_options_t *opt); static void 
crt_lib_init(void) __attribute__((__constructor__)); @@ -96,6 +100,7 @@ dump_opt(crt_init_options_t *opt) D_INFO("domain = %s\n", opt->cio_domain); D_INFO("port = %s\n", opt->cio_port); D_INFO("auth_key = %s\n", opt->cio_auth_key); + D_INFO("addr_format = %s\n", opt->cio_addr_format); D_INFO("ep_credits = %d\n", opt->cio_ep_credits); D_INFO("Flags: fault_inject = %d, use_sensors = %d, thread_mode_single = %d, " "progress_busy = %d, mem_device = %d\n", @@ -259,6 +264,43 @@ crt_str_to_tc(const char *str) return i == CRT_TC_UNKNOWN ? CRT_TC_UNSPEC : i; } +/* + * Parse a textual address-format hint into the matching enum value. + * Falls back to CRT_AF_UNSPEC (Mercury default) on NULL, empty, or + * unrecognized input so that a typo never fails initialization; an + * unrecognized value is logged so a dropped "ipv6" request is visible. + */ +static enum crt_addr_format +crt_str_to_addr_format(const char *str) +{ + enum crt_addr_format i = 0; + + if (str == NULL || str[0] == '\0') + return CRT_AF_UNSPEC; + + /* Test the bound first so the name table is never indexed past its end */ + while (i < CRT_AF_UNKNOWN && strcmp(crt_addr_format_name[i], str) != 0) + i++; + + if (i == CRT_AF_UNKNOWN) { + D_WARN("Unrecognized address format '%s', using 'unspec'\n", str); + return CRT_AF_UNSPEC; + } + + return i; +} + +/* + * CRT_AF_* values are kept aligned with Mercury's enum na_addr_format so + * that crt_hg.c can cast directly when assigning na_init_info.addr_format + * (mirroring how cg_swim_tc is cast to enum na_traffic_class). The static + * assertions below catch any future drift between the two enums. 
+ */ +D_CASSERT((int)CRT_AF_UNSPEC == (int)NA_ADDR_UNSPEC); +D_CASSERT((int)CRT_AF_IPV4 == (int)NA_ADDR_IPV4); +D_CASSERT((int)CRT_AF_IPV6 == (int)NA_ADDR_IPV6); +D_CASSERT((int)CRT_AF_NATIVE == (int)NA_ADDR_NATIVE); + /* first step init - for initializing crt_gdata */ static int data_init(int server, crt_init_options_t *opt) @@ -603,11 +639,12 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) int rc = 0; crt_provider_t prov; char *provider = NULL, *interface = NULL, *domain = NULL, *port = NULL, *auth_key = NULL; + char *addr_format = NULL; char *path = NULL; char *provider_str = NULL, *interface_str = NULL, *domain_str = NULL, *port_str = NULL, - *auth_key_str = NULL; + *auth_key_str = NULL, *addr_format_str = NULL; char *save_provider_str = NULL, *save_interface_str = NULL, *save_domain_str = NULL, - *save_port_str = NULL, *save_auth_key_str = NULL; + *save_port_str = NULL, *save_auth_key_str = NULL, *save_addr_format_str = NULL; bool port_auto_adjust = false, thread_mode_single = false, progress_busy = false, mem_device = false; int i; @@ -680,6 +717,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) CRT_ENV_OPT_GET(opt, domain, D_DOMAIN); CRT_ENV_OPT_GET(opt, port, D_PORT); CRT_ENV_OPT_GET(opt, auth_key, D_PROVIDER_AUTH_KEY); + CRT_ENV_OPT_GET(opt, addr_format, D_ADDR_FORMAT); crt_env_get(D_PORT_AUTO_ADJUST, &port_auto_adjust); @@ -743,6 +781,13 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) D_GOTO(unlock, rc = -DER_NOMEM); auth_key = strtok_r(auth_key_str, ",", &save_auth_key_str); } + + if (addr_format != NULL) { + D_STRNDUP(addr_format_str, addr_format, CRT_ENV_STR_MAX_SIZE); + if (addr_format_str == NULL) + D_GOTO(unlock, rc = -DER_NOMEM); + addr_format = strtok_r(addr_format_str, ",", &save_addr_format_str); + } } prov = crt_str_to_provider(provider); @@ -768,7 +813,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) * and processed in 
crt_na_config_init(). */ rc = crt_init_prov(prov, true, &crt_gdata.cg_prov_gdata_primary, interface, domain, port, - auth_key, port_auto_adjust, opt); + auth_key, addr_format, port_auto_adjust, opt); if (rc != 0) D_GOTO(unlock, rc); @@ -812,6 +857,8 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) port = strtok_r(NULL, ",", &save_port_str); if (auth_key != NULL) auth_key = strtok_r(NULL, ",", &save_auth_key_str); + if (addr_format != NULL) + addr_format = strtok_r(NULL, ",", &save_addr_format_str); /* Secondary provider needs its own interface or domain */ if (interface == NULL && domain == NULL) { @@ -825,7 +872,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) rc = crt_init_prov(crt_gdata.cg_secondary_provs[i], false, &crt_gdata.cg_prov_gdata_secondary[i], interface, domain, - port, auth_key, port_auto_adjust, opt); + port, auth_key, addr_format, port_auto_adjust, opt); if (rc != 0) { D_ERROR("crt_init_prov() failed for secondary provider, " DF_RC "\n", @@ -904,6 +951,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) D_FREE(domain_str); D_FREE(port_str); D_FREE(auth_key_str); + D_FREE(addr_format_str); if (rc != 0) { D_ERROR("failed, " DF_RC "\n", DP_RC(rc)); @@ -916,7 +964,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) static int crt_init_prov(crt_provider_t provider, bool primary, struct crt_prov_gdata *prov_gdata, const char *interface, const char *domain, const char *port, const char *auth_key, - bool port_auto_adjust, crt_init_options_t *opt) + const char *addr_format, bool port_auto_adjust, crt_init_options_t *opt) { int rc; @@ -926,6 +974,15 @@ crt_init_prov(crt_provider_t provider, bool primary, struct crt_prov_gdata *prov prov_settings_apply(primary, provider, opt); + /* + * Record the requested address family on the per-provider gdata so it + * can be forwarded to Mercury via na_init_info.addr_format when each + * HG class is 
initialized (see crt_hg.c::crt_hg_class_init). Unknown + * or unset values resolve to CRT_AF_UNSPEC, preserving the historical + * Mercury-default behavior. + */ + prov_gdata->cpg_addr_format = crt_str_to_addr_format(addr_format); + rc = crt_na_config_init(primary, provider, interface, domain, port, auth_key, port_auto_adjust); if (rc != 0) { diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index 02fbe3eea0c..4ede1e9d5fa 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -55,6 +55,32 @@ struct crt_na_config { enum crt_traffic_class { CRT_TRAFFIC_CLASSES }; #undef X +/* + * Preferred address family for fabric init. Forwarded to Mercury via + * na_init_info.addr_format and translated by Mercury's na_ofi plugin + * into the libfabric addr_format hint (FI_SOCKADDR_IN / FI_SOCKADDR_IN6 + * / provider native / FI_FORMAT_UNSPEC). + * + * Default is CRT_AF_UNSPEC, which preserves the historical behavior of + * letting Mercury pick from its per-provider preference table (IPv4 for + * verbs/RoCE). Set to CRT_AF_IPV6 to enable IPv6 fabric on an interface + * that lacks an IPv4 address. + * + * CRT_AF_UNKNOWN is an end-of-table sentinel used inside + * crt_str_to_addr_format() to detect unmatched input; that function maps + * it to CRT_AF_UNSPEC itself, so callers never see CRT_AF_UNKNOWN. 
+ */ +#define CRT_ADDR_FORMATS \ + X(CRT_AF_UNSPEC, "unspec") /* Leave it upon plugin to choose (default) */ \ + X(CRT_AF_IPV4, "ipv4") /* Prefer IPv4 (FI_SOCKADDR_IN) */ \ + X(CRT_AF_IPV6, "ipv6") /* Prefer IPv6 (FI_SOCKADDR_IN6) */ \ + X(CRT_AF_NATIVE, "native") /* Provider native addressing */ \ + X(CRT_AF_UNKNOWN, "unknown") /* Unknown / parse error sentinel */ + +#define X(a, b) a, +enum crt_addr_format { CRT_ADDR_FORMATS }; +#undef X + struct crt_prov_gdata { /** NA plugin type */ int cpg_provider; @@ -77,6 +103,13 @@ struct crt_prov_gdata { uint32_t cpg_max_exp_size; uint32_t cpg_max_unexp_size; + /** + * Preferred address family for Mercury fabric init for this provider. + * Defaults to CRT_AF_UNSPEC (Mercury picks). Set via D_ADDR_FORMAT env + * or crt_init_options_t::cio_addr_format API field. + */ + enum crt_addr_format cpg_addr_format; + /** Number of remote tags */ uint32_t cpg_num_remote_tags; uint32_t cpg_last_remote_tag; @@ -219,6 +252,7 @@ struct crt_event_cb_priv { ENV_STR(DD_MASK) \ ENV_STR(DD_STDERR) \ ENV_STR(DD_SUBSYS) \ + ENV_STR(D_ADDR_FORMAT) \ ENV_STR(D_CLIENT_METRICS_DUMP_DIR) \ ENV(D_CLIENT_METRICS_ENABLE) \ ENV(D_CLIENT_METRICS_RETAIN) \ diff --git a/src/include/cart/types.h b/src/include/cart/types.h index 65786f57ede..fbbc154dd2a 100644 --- a/src/include/cart/types.h +++ b/src/include/cart/types.h @@ -88,6 +88,19 @@ typedef struct crt_init_options { /** If set, used as the authentication key instead of D_PROVIDER_AUTH_KEY env */ char *cio_auth_key; + /** + * If set, used as the preferred address family for fabric init instead of + * the D_ADDR_FORMAT env. Accepted values: "unspec" (default), "ipv4", + * "ipv6", "native". The value is forwarded to Mercury via + * na_init_info.addr_format, which lets libfabric's fabric scan find + * interfaces of the chosen family. Useful for IPv6-only fabric + * deployments where the default IPv4 preference would otherwise hide + * the only usable interfaces. 
For multi-provider configurations, the + * value may be a comma-separated list (one entry per provider, same + * ordering as cio_provider). + */ + char *cio_addr_format; + /** use single thread to access context */ bool cio_thread_mode_single;