Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/common/tests_dmg_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -825,6 +825,14 @@ dmg_pool_create(const char *dmg_config_file,
if (args == NULL)
D_GOTO(out, rc = -DER_NOMEM);
}

entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SVC_REDUN_FAC);
if (entry != NULL) {
args = cmd_push_arg(args, &argcount, "--properties=svc_rf:%zu ",
entry->dpe_val);
if (args == NULL)
D_GOTO(out, rc = -DER_NOMEM);
}
}

/* Temporarily use old pool property defaults due to DAOS-17946 */
Expand Down
1 change: 1 addition & 0 deletions src/include/daos/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,7 @@ enum {

#define DAOS_RDB_SKIP_APPENDENTRIES_FAIL (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1a)
#define DAOS_FORCE_REFRESH_POOL_MAP (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1b)
#define DAOS_RDB_FAIL_MODIFY_REPLICAS (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1c)

#define DAOS_FORCE_CAPA_FETCH (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1e)
#define DAOS_FORCE_PROP_VERIFY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x1f)
Expand Down
14 changes: 11 additions & 3 deletions src/rdb/rdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -799,7 +799,12 @@ rdb_alloc_replica_gen(struct rdb *db, uint64_t term, uint32_t *gen_out)
}

/**
* Modify \a replicas.
* Perform \a op on \a replicas in the membership.
*
* Note that when \a op is RDB_REPLICA_ADD, even if this function returns an
* error, one of \a replicas [out] may have been added to the local membership.
* The caller must check the local membership before destroying \a replicas
* [out].
*
* \param[in] db database
* \param[in] op operation to perform
Expand Down Expand Up @@ -838,9 +843,12 @@ rdb_modify_replicas(struct rdb *db, enum rdb_replica_op op, rdb_replica_id_t *re
}
for (i = 0; i < *replicas_len; ++i) {
rc = rdb_raft_append_apply_cfg(db, type, replicas[i]);
if (rc == 0 && DAOS_FAIL_CHECK(DAOS_RDB_FAIL_MODIFY_REPLICAS))
rc = -DER_NOTLEADER;
if (rc != 0) {
DL_ERROR(rc, DF_DB ": failed to do op %d on replica " RDB_F_RID, DP_DB(db),
op, RDB_P_RID(replicas[i]));
DL_CDEBUG(rc == -DER_NOTLEADER, DLOG_INFO, DLOG_ERR, rc,
DF_DB ": failed to do op %d on replica " RDB_F_RID, DP_DB(db), op,
RDB_P_RID(replicas[i]));
break;
}
}
Expand Down
21 changes: 19 additions & 2 deletions src/rsvc/srv.c
Original file line number Diff line number Diff line change
Expand Up @@ -1237,8 +1237,25 @@ ds_rsvc_add_replicas_s(struct ds_rsvc *svc, d_rank_list_t *ranks, size_t size,

rc = rdb_modify_replicas(svc->s_db, RDB_REPLICA_ADD, &id, &ids_len);
if (rc != 0) {
ds_rsvc_dist_stop(svc->s_class, &svc->s_id, &rl, NULL, svc->s_term,
true /* destroy */);
rdb_replica_id_t *replicas;
int replicas_len;
int rc_tmp;

/*
* If this new replica has been added to the local membership, we can't
* destroy it.
*/
rc_tmp = rdb_get_replicas(svc->s_db, &replicas, &replicas_len);
if (rc_tmp == 0) {
int j;

for (j = 0; j < replicas_len; j++)
if (rdb_replica_id_compare(replicas[j], id) == 0)
break;
if (j == replicas_len)
ds_rsvc_dist_stop(svc->s_class, &svc->s_id, &rl, NULL,
svc->s_term, true /* destroy */);
}
break;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/tests/ftest/daos_test/suite.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ timeout: 600
timeouts:
test_daos_degraded_mode: 450
test_daos_management: 110
test_daos_pool: 240
test_daos_pool: 300
test_daos_container: 700
test_daos_epoch: 125
test_daos_verify_consistency: 105
Expand Down
138 changes: 100 additions & 38 deletions src/tests/suite/daos_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -2313,45 +2313,107 @@ pool_map_refreshes_fallback(void **state)
pool_map_refreshes_common(state, true /* fall_back */);
}

/*
* A reconfiguration interrupted by a -DER_NOTLEADER should not destroy the new
* replica who may have been added to the leader's local membership.
*/
static void
reconf_notleader_handling(void **state)
{
test_arg_t *arg0 = *state;
test_arg_t *arg = NULL;
daos_prop_t *prop;
daos_pool_info_t pinfo = {0};
d_rank_t extend_rank = 1;
uuid_t cont_uuid;
int rc;

FAULT_INJECTION_REQUIRED();

par_barrier(PAR_COMM_WORLD);

if (arg0->myrank == 0) {
rc = test_pool_get_info(arg0, &pinfo, NULL);
assert_rc_equal(rc, 0);
if (pinfo.pi_nnodes < 2) {
print_message("need at least 2 ranks\n");
skip();
}
}

if (arg0->myrank != 0)
return;

/* Create a pool on rank 0 only, with svc_rf = 1. */
rc = test_setup((void **)&arg, SETUP_EQ, false /* multi_rank */, SMALL_POOL_SIZE,
1 /* node_size */, NULL);
assert_rc_equal(rc, 0);
prop = daos_prop_alloc(1);
assert_ptr_not_equal(prop, NULL);
prop->dpp_entries[0].dpe_type = DAOS_PROP_PO_SVC_REDUN_FAC;
prop->dpp_entries[0].dpe_val = 1;
while (!rc && arg->setup_state != SETUP_POOL_CONNECT)
rc = test_setup_next_step((void **)&arg, NULL, prop, NULL);
assert_rc_equal(rc, 0);
daos_prop_free(prop);

/* Inject fault on rank 0 (the only PS leader). */
test_set_engine_fail_loc(arg, 0 /* rank */, DAOS_RDB_FAIL_MODIFY_REPLICAS | DAOS_FAIL_ONCE);

/*
* Extend the pool to rank 1, and wait for the rank to become UPIN,
* triggering ds_rsvc_add_replicas_s, which used to destroy the new PS
* replica on rank 1 incorrectly, even though it has been added to the
* leader's local membership.
*/
print_message("extending pool to rank %u\n", extend_rank);
rc = dmg_pool_extend(arg->dmg_config, arg->pool.pool_uuid, arg->group, &extend_rank, 1);
assert_rc_equal(rc, 0);
print_message("waiting for rebuild to complete\n");
test_rebuild_wait(&arg, 1);

print_message("wait 5 s for the PS to reconfigure\n");
sleep(5);

print_message("creating a container\n");
rc = daos_cont_create(arg->pool.poh, &cont_uuid, NULL /* prop */, NULL /* ev */);
assert_rc_equal(rc, 0);

print_message("cleaning up\n");
test_teardown((void **)&arg);
}

static const struct CMUnitTest pool_tests[] = {
{ "POOL1: connect to non-existing pool",
pool_connect_nonexist, NULL, test_case_teardown},
{ "POOL2: connect/disconnect to pool",
pool_connect, async_disable, test_case_teardown},
{ "POOL3: connect/disconnect to pool (async)",
pool_connect, async_enable, test_case_teardown},
{ "POOL4: pool handle local2global and global2local",
pool_connect, hdl_share_enable, test_case_teardown},
{ "POOL5: exclusive connection",
pool_connect_exclusively, NULL, test_case_teardown},
{ "POOL6: exclude targets and query pool info",
pool_exclude, async_disable, NULL},
{ "POOL7: set/get/list user-defined pool attributes (sync)",
pool_attribute, NULL, test_case_teardown},
{ "POOL8: set/get/list user-defined pool attributes (async)",
pool_attribute, NULL, test_case_teardown},
{ "POOL9: pool reconnect after daos re-init",
init_fini_conn, NULL, test_case_teardown},
{ "POOL10: pool create with properties and query",
pool_properties, NULL, test_case_teardown},
{ "POOL11: pool list containers (zero)",
list_containers_test, setup_zerocontainers, teardown_containers},
{ "POOL12: pool list containers (many)",
list_containers_test, setup_manycontainers, teardown_containers},
{ "POOL13: retry POOL_{CONNECT,DISCONNECT,QUERY}",
pool_op_retry, NULL, test_case_teardown},
{ "POOL14: pool connect access based on ACL",
pool_connect_access, NULL, test_case_teardown},
{ "POOL15: label property string validation",
label_strings_test, NULL, test_case_teardown},
{ "POOL16: pool map refreshes",
pool_map_refreshes, pool_map_refreshes_setup, test_case_teardown},
{ "POOL17: pool map refreshes (fallback)",
pool_map_refreshes_fallback, pool_map_refreshes_setup, test_case_teardown},
{ "POOL18: pool filter containers (zero)",
filter_containers_test, setup_zerocontainers, teardown_containers},
{ "POOL19: pool filter containers (many)",
filter_containers_test, setup_manycontainers, teardown_containers},
{"POOL1: connect to non-existing pool", pool_connect_nonexist, NULL, test_case_teardown},
{"POOL2: connect/disconnect to pool", pool_connect, async_disable, test_case_teardown},
{"POOL3: connect/disconnect to pool (async)", pool_connect, async_enable, test_case_teardown},
{"POOL4: pool handle local2global and global2local", pool_connect, hdl_share_enable,
test_case_teardown},
{"POOL5: exclusive connection", pool_connect_exclusively, NULL, test_case_teardown},
{"POOL6: exclude targets and query pool info", pool_exclude, async_disable, NULL},
{"POOL7: set/get/list user-defined pool attributes (sync)", pool_attribute, NULL,
test_case_teardown},
{"POOL8: set/get/list user-defined pool attributes (async)", pool_attribute, NULL,
test_case_teardown},
{"POOL9: pool reconnect after daos re-init", init_fini_conn, NULL, test_case_teardown},
{"POOL10: pool create with properties and query", pool_properties, NULL, test_case_teardown},
{"POOL11: pool list containers (zero)", list_containers_test, setup_zerocontainers,
teardown_containers},
{"POOL12: pool list containers (many)", list_containers_test, setup_manycontainers,
teardown_containers},
{"POOL13: retry POOL_{CONNECT,DISCONNECT,QUERY}", pool_op_retry, NULL, test_case_teardown},
{"POOL14: pool connect access based on ACL", pool_connect_access, NULL, test_case_teardown},
{"POOL15: label property string validation", label_strings_test, NULL, test_case_teardown},
{"POOL16: pool map refreshes", pool_map_refreshes, pool_map_refreshes_setup,
test_case_teardown},
{"POOL17: pool map refreshes (fallback)", pool_map_refreshes_fallback, pool_map_refreshes_setup,
test_case_teardown},
{"POOL18: pool filter containers (zero)", filter_containers_test, setup_zerocontainers,
teardown_containers},
{"POOL19: pool filter containers (many)", filter_containers_test, setup_manycontainers,
teardown_containers},
{"POOL20: reconf interrupted by -DER_NOTLEADER", reconf_notleader_handling, NULL,
test_case_teardown},
};

int
Expand Down
Loading