diff --git a/src/cart/README.env b/src/cart/README.env index befc0fd8f4f..f092563637b 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -137,6 +137,25 @@ This file lists the environment variables used in CaRT. D_PROVIDER_AUTH_KEY is assumed to be empty. Supports a comma separated list of keys, similar to D_INTERFACE handling + . D_ADDR_FORMAT + Selects the preferred IP address family for fabric initialization. The + value is forwarded to Mercury via na_init_info.addr_format and used as + a hint by libfabric's fabric scan to enumerate interfaces of the chosen + family. Accepted values (case-sensitive): + - "unspec" (default) - leave it to the plugin; Mercury's na_ofi + plugin falls back to its per-provider + preference (IPv4 for verbs/RoCE). + - "ipv4" - prefer IPv4 (FI_SOCKADDR_IN). + - "ipv6" - prefer IPv6 (FI_SOCKADDR_IN6). Required for + IPv6-only fabric NIC deployments where the + default IPv4 preference would hide the only + usable interfaces. + - "native" - provider native addressing. + Unrecognized values fall back silently to "unspec" rather than failing + initialization. Supports a comma separated list of values, similar to + D_INTERFACE handling; entries are matched one-to-one with the + comma-separated D_PROVIDER list for multi-provider configurations. + . CRT_CREDIT_EP_CTX Set it as the max number of in-flight RPCs to a target endpoint context, the valid range is [0, 256]. diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 9ee161dbeb5..f81dc0a3761 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -846,6 +846,19 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ init_info.na_init_info.auth_key = prov_data->cpg_na_config.noc_auth_key; + /* + * Forward the per-provider address-family preference to Mercury. The + * default (CRT_AF_UNSPEC -> NA_ADDR_UNSPEC) preserves the original + * behavior: Mercury's na_ofi plugin falls back to its per-provider + * preference table (IPv4 for verbs/RoCE). 
Setting D_ADDR_FORMAT=ipv6 + * (or cio_addr_format="ipv6") instead steers libfabric's fabric scan + * to enumerate IPv6 interfaces, which is required for IPv6-only + * fabric NIC deployments. CRT_AF_* values are statically asserted to + * match the corresponding NA_ADDR_* values in crt_init.c, so the + * direct cast is safe. + */ + init_info.na_init_info.addr_format = (enum na_addr_format)prov_data->cpg_addr_format; + if (crt_provider_is_block_mode(provider) && !crt_gdata.cg_progress_busy) init_info.na_init_info.progress_mode = 0; else diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 0ba59243058..5c7b1f06b3c 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -24,6 +24,10 @@ static bool g_prov_settings_applied[CRT_PROV_COUNT]; static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES}; #undef X +#define X(a, b) b, +static const char *const crt_addr_format_name[] = {CRT_ADDR_FORMATS}; +#undef X + #define CRT_ENV_OPT_GET(opt, x, env) \ do { \ if (opt != NULL && opt->cio_##x) \ @@ -35,7 +39,7 @@ static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES}; static int crt_init_prov(crt_provider_t provider, bool primary, struct crt_prov_gdata *prov_gdata, const char *interface, const char *domain, const char *port, const char *auth_key, - bool port_auto_adjust, crt_init_options_t *opt); + const char *addr_format, bool port_auto_adjust, crt_init_options_t *opt); static void crt_lib_init(void) __attribute__((__constructor__)); @@ -96,6 +100,7 @@ dump_opt(crt_init_options_t *opt) D_INFO("domain = %s\n", opt->cio_domain); D_INFO("port = %s\n", opt->cio_port); D_INFO("auth_key = %s\n", opt->cio_auth_key); + D_INFO("addr_format = %s\n", opt->cio_addr_format); D_INFO("ep_credits = %d\n", opt->cio_ep_credits); D_INFO("Flags: fault_inject = %d, use_sensors = %d, thread_mode_single = %d, " "progress_busy = %d, mem_device = %d\n", @@ -259,6 +264,37 @@ crt_str_to_tc(const char *str) return i == CRT_TC_UNKNOWN ? 
CRT_TC_UNSPEC : i; } +/* + * Translate an address-format name into the matching enum crt_addr_format + * value. NULL, empty, and unrecognized strings all resolve to CRT_AF_UNSPEC + * (Mercury default), so a typo never fails init. The index is bounds-checked + * before the name table is read, so the lookup cannot run past the sentinel. + */ +static enum crt_addr_format +crt_str_to_addr_format(const char *str) +{ + enum crt_addr_format i = 0; + + if (str == NULL || str[0] == '\0') + return CRT_AF_UNSPEC; + + while (i < CRT_AF_UNKNOWN && strcmp(crt_addr_format_name[i], str) != 0) + i++; + + return i == CRT_AF_UNKNOWN ? CRT_AF_UNSPEC : i; +} + +/* + * CRT_AF_* values are kept aligned with Mercury's enum na_addr_format so + * that crt_hg.c can cast directly when assigning na_init_info.addr_format + * (mirroring how cg_swim_tc is cast to enum na_traffic_class). The static + * assertions below catch any future drift between the two enums. + */ +D_CASSERT((int)CRT_AF_UNSPEC == (int)NA_ADDR_UNSPEC); +D_CASSERT((int)CRT_AF_IPV4 == (int)NA_ADDR_IPV4); +D_CASSERT((int)CRT_AF_IPV6 == (int)NA_ADDR_IPV6); +D_CASSERT((int)CRT_AF_NATIVE == (int)NA_ADDR_NATIVE); + /* first step init - for initializing crt_gdata */ static int data_init(int server, crt_init_options_t *opt) @@ -603,11 +639,12 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) int rc = 0; crt_provider_t prov; char *provider = NULL, *interface = NULL, *domain = NULL, *port = NULL, *auth_key = NULL; + char *addr_format = NULL; char *path = NULL; char *provider_str = NULL, *interface_str = NULL, *domain_str = NULL, *port_str = NULL, - *auth_key_str = NULL; + *auth_key_str = NULL, *addr_format_str = NULL; char *save_provider_str = NULL, *save_interface_str = NULL, *save_domain_str = NULL, - *save_port_str = NULL, *save_auth_key_str = NULL; + *save_port_str = NULL, *save_auth_key_str = NULL, *save_addr_format_str = NULL; bool port_auto_adjust = false, thread_mode_single = false, progress_busy = false, mem_device = false; int i; 
@@ -680,6 +717,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) CRT_ENV_OPT_GET(opt, domain, D_DOMAIN); CRT_ENV_OPT_GET(opt, port, D_PORT); CRT_ENV_OPT_GET(opt, auth_key, D_PROVIDER_AUTH_KEY); + CRT_ENV_OPT_GET(opt, addr_format, D_ADDR_FORMAT); crt_env_get(D_PORT_AUTO_ADJUST, &port_auto_adjust); @@ -743,6 +781,13 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) D_GOTO(unlock, rc = -DER_NOMEM); auth_key = strtok_r(auth_key_str, ",", &save_auth_key_str); } + + if (addr_format != NULL) { + D_STRNDUP(addr_format_str, addr_format, CRT_ENV_STR_MAX_SIZE); + if (addr_format_str == NULL) + D_GOTO(unlock, rc = -DER_NOMEM); + addr_format = strtok_r(addr_format_str, ",", &save_addr_format_str); + } } prov = crt_str_to_provider(provider); @@ -768,7 +813,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) * and processed in crt_na_config_init(). */ rc = crt_init_prov(prov, true, &crt_gdata.cg_prov_gdata_primary, interface, domain, port, - auth_key, port_auto_adjust, opt); + auth_key, addr_format, port_auto_adjust, opt); if (rc != 0) D_GOTO(unlock, rc); @@ -812,6 +857,8 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) port = strtok_r(NULL, ",", &save_port_str); if (auth_key != NULL) auth_key = strtok_r(NULL, ",", &save_auth_key_str); + if (addr_format != NULL) + addr_format = strtok_r(NULL, ",", &save_addr_format_str); /* Secondary provider needs its own interface or domain */ if (interface == NULL && domain == NULL) { @@ -825,7 +872,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) rc = crt_init_prov(crt_gdata.cg_secondary_provs[i], false, &crt_gdata.cg_prov_gdata_secondary[i], interface, domain, - port, auth_key, port_auto_adjust, opt); + port, auth_key, addr_format, port_auto_adjust, opt); if (rc != 0) { D_ERROR("crt_init_prov() failed for secondary provider, " DF_RC "\n", @@ -904,6 +951,7 @@ crt_init_opt(crt_group_id_t grpid, 
uint32_t flags, crt_init_options_t *opt) D_FREE(domain_str); D_FREE(port_str); D_FREE(auth_key_str); + D_FREE(addr_format_str); if (rc != 0) { D_ERROR("failed, " DF_RC "\n", DP_RC(rc)); @@ -916,7 +964,7 @@ crt_init_opt(crt_group_id_t grpid, uint32_t flags, crt_init_options_t *opt) static int crt_init_prov(crt_provider_t provider, bool primary, struct crt_prov_gdata *prov_gdata, const char *interface, const char *domain, const char *port, const char *auth_key, - bool port_auto_adjust, crt_init_options_t *opt) + const char *addr_format, bool port_auto_adjust, crt_init_options_t *opt) { int rc; @@ -926,6 +974,15 @@ crt_init_prov(crt_provider_t provider, bool primary, struct crt_prov_gdata *prov prov_settings_apply(primary, provider, opt); + /* + * Record the requested address family on the per-provider gdata so it + * can be forwarded to Mercury via na_init_info.addr_format when each + * HG class is initialized (see crt_hg.c::crt_hg_class_init). Unknown + * or unset values resolve to CRT_AF_UNSPEC, preserving the historical + * Mercury-default behavior. + */ + prov_gdata->cpg_addr_format = crt_str_to_addr_format(addr_format); + rc = crt_na_config_init(primary, provider, interface, domain, port, auth_key, port_auto_adjust); if (rc != 0) { diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index 02fbe3eea0c..4ede1e9d5fa 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -55,6 +55,32 @@ struct crt_na_config { enum crt_traffic_class { CRT_TRAFFIC_CLASSES }; #undef X +/* + * Preferred address family for fabric init. Forwarded to Mercury via + * na_init_info.addr_format and translated by Mercury's na_ofi plugin + * into the libfabric addr_format hint (FI_SOCKADDR_IN / FI_SOCKADDR_IN6 + * / provider native / FI_FORMAT_UNSPEC). + * + * Default is CRT_AF_UNSPEC, which preserves the historical behavior of + * letting Mercury pick from its per-provider preference table (IPv4 for + * verbs/RoCE). 
Set to CRT_AF_IPV6 to enable IPv6 fabric on an interface + * that lacks an IPv4 address. + * + * CRT_AF_UNKNOWN is an internal no-match sentinel that bounds the name + * lookup in crt_str_to_addr_format(); that function never returns it, + * unmatched input resolves to CRT_AF_UNSPEC instead. + */ +#define CRT_ADDR_FORMATS \ + X(CRT_AF_UNSPEC, "unspec") /* Leave the choice to the plugin (default) */ \ + X(CRT_AF_IPV4, "ipv4") /* Prefer IPv4 (FI_SOCKADDR_IN) */ \ + X(CRT_AF_IPV6, "ipv6") /* Prefer IPv6 (FI_SOCKADDR_IN6) */ \ + X(CRT_AF_NATIVE, "native") /* Provider native addressing */ \ + X(CRT_AF_UNKNOWN, "unknown") /* No-match sentinel, not a selectable value */ + +#define X(a, b) a, +enum crt_addr_format { CRT_ADDR_FORMATS }; +#undef X + struct crt_prov_gdata { /** NA plugin type */ int cpg_provider; @@ -77,6 +103,13 @@ struct crt_prov_gdata { uint32_t cpg_max_exp_size; uint32_t cpg_max_unexp_size; + /** + * Preferred address family for Mercury fabric init for this provider. + * Defaults to CRT_AF_UNSPEC (Mercury picks). Set via D_ADDR_FORMAT env + * or crt_init_options_t::cio_addr_format API field. + */ + enum crt_addr_format cpg_addr_format; + /** Number of remote tags */ uint32_t cpg_num_remote_tags; uint32_t cpg_last_remote_tag; @@ -219,6 +252,7 @@ struct crt_event_cb_priv { ENV_STR(DD_MASK) \ ENV_STR(DD_STDERR) \ ENV_STR(DD_SUBSYS) \ + ENV_STR(D_ADDR_FORMAT) \ ENV_STR(D_CLIENT_METRICS_DUMP_DIR) \ ENV(D_CLIENT_METRICS_ENABLE) \ ENV(D_CLIENT_METRICS_RETAIN) \ diff --git a/src/include/cart/types.h b/src/include/cart/types.h index 65786f57ede..fbbc154dd2a 100644 --- a/src/include/cart/types.h +++ b/src/include/cart/types.h @@ -88,6 +88,19 @@ typedef struct crt_init_options { /** If set, used as the authentication key instead of D_PROVIDER_AUTH_KEY env */ char *cio_auth_key; + /** + * If set, used as the preferred address family for fabric init instead of + * the D_ADDR_FORMAT env. Accepted values: "unspec" (default), "ipv4", + * "ipv6", "native". 
The value is forwarded to Mercury via + * na_init_info.addr_format, which lets libfabric's fabric scan find + * interfaces of the chosen family. Useful for IPv6-only fabric + * deployments where the default IPv4 preference would otherwise hide + * the only usable interfaces. For multi-provider configurations, the + * value may be a comma-separated list (one entry per provider, same + * ordering as cio_provider). + */ + char *cio_addr_format; + /** use single thread to access context */ bool cio_thread_mode_single;