Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion src/container/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,24 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb,
if (!param->ap_vos_agg)
vos_cont_set_mod_bound(cont->sc_hdl, epoch_range.epr_hi);

if (dss_xstream_is_busy())
/*
* Force-merge coalesces small contiguous records into large physical
* extents to defragment the VOS tree. Under load it is normally dropped
* for this tail (non-snapshot) range to avoid relocation I/O.
*
* For EC objects this coalescing is the ONLY way the data shards get
* defragmented: the client can't recompute parity for partial-stripe
* writes, so small writes land as many tiny records that only VOS
* aggregation can merge. Unlike for replicated objects, this range is
* already capped at the EC aggregation epoch boundary (see
* adjust_upper_bound()), so keeping force-merge enabled here cannot
* merge or relocate epochs whose parity isn't yet consistent; degraded
* reads and rebuild still reconstruct identical bytes, i.e. there is no
* data corruption risk. Force-merge is therefore retained for the VOS
* aggregation pass while EC aggregation is enabled, so partial-stripe EC
* data doesn't stay fragmented (slow reads/rebuild) under sustained load.
* Set DAOS_EC_AGG_FORCE_MERGE=0 to restore the previous behavior.
*
* NOTE(review): the check below tests only ap_vos_agg and the two global
* switches, not the object class - confirm that retaining force-merge for
* non-EC data in this pass is intended.
*/
if (dss_xstream_is_busy() && !(param->ap_vos_agg && ec_agg_force_merge && !ec_agg_disabled))
flags &= ~VOS_AGG_FL_FORCE_MERGE;
rc = agg_cb(cont, &epoch_range, flags, param);
out:
Expand Down
1 change: 1 addition & 0 deletions src/include/daos_srv/pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ int
ds_pool_rebuild_stop(uuid_t pool_uuid, uint32_t force, struct rsvc_hint *hint);

/* Set when EC aggregation is disabled; defined in src/pool/srv.c. */
extern bool ec_agg_disabled;
/*
* When true (the default), VOS aggregation keeps force-merge enabled even
* while the xstream is busy; overridden by the DAOS_EC_AGG_FORCE_MERGE
* environment variable. Defined in src/pool/srv.c.
*/
extern bool ec_agg_force_merge;

int dsc_pool_open(uuid_t pool_uuid, uuid_t pool_hdl_uuid,
unsigned int flags, const char *grp,
Expand Down
14 changes: 13 additions & 1 deletion src/pool/srv.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -22,6 +22,13 @@
#include "srv_layout.h"

/* When set, EC aggregation is disabled; init() logs a warning in that case. */
bool ec_agg_disabled;
/*
* Keep VOS aggregation's force-merge (record coalescing) enabled for EC objects
* even when the target xstream is busy. For EC, coalescing the data shards is the
* only way partial-stripe writes get defragmented, and it is always bounded to the
* EC aggregation epoch boundary, so it cannot merge parity-inconsistent epochs.
*
* Defaults to true. init() re-applies the default and then reads the
* DAOS_EC_AGG_FORCE_MERGE environment variable, so setting it to 0 turns this
* off. The flag has no effect when ec_agg_disabled is set.
*/
bool ec_agg_force_merge = true;
uint32_t pw_rf = -1; /* pool-wise redundancy factor */
uint32_t ps_cache_intvl = 2; /* pool space cache expiration time, in seconds */
/* Value init() assigns to pw_rf when the DAOS_POOL_RF check fails. */
#define PW_RF_DEFAULT (2)
Expand Down Expand Up @@ -73,6 +80,11 @@ init(void)
if (unlikely(ec_agg_disabled))
D_WARN("EC aggregation is disabled.\n");

ec_agg_force_merge = true;
d_getenv_bool("DAOS_EC_AGG_FORCE_MERGE", &ec_agg_force_merge);
if (!ec_agg_force_merge)
D_WARN("EC aggregation force-merge (coalescing under load) is disabled.\n");

pw_rf = -1;
if (!check_pool_redundancy_factor("DAOS_POOL_RF"))
pw_rf = PW_RF_DEFAULT;
Expand Down
Loading