From 51afafc5a6013937c8cad9cd58b54975fed6c035 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Thu, 2 Jul 2026 08:29:14 +0800 Subject: [PATCH] DAOS-19212 object: client retry modification if TX_RESTART repeatedly - b28 On the server side, when an IO handler repeatedly hits -DER_TX_RESTART, it is quite possible that the -DER_TX_RESTART failure is related to server overload or to RPC delay caused by congestion. In such a case, a server-side retry with a newer epoch may increase the server workload/congestion. So let's ask the client to retry with some backoff delay instead. Signed-off-by: Fan Yong --- src/object/srv_obj.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 62044b08221..a0cebabc9f8 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -3035,6 +3035,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) uint32_t max_ver = 0; struct dtx_epoch epoch = {0}; int rc; + int retry = 0; bool need_abort = false; D_ASSERT(orw != NULL); @@ -3252,6 +3253,17 @@ ds_obj_rw_handler(crt_rpc_t *rpc) break; } + /* If we have already retried once, but still failed for -DER_TX_RESTART, then + * it is quite possible that the -DER_TX_RESTART failure is related with server + * overload or some congestion caused RPC delay. Let's ask client to retry with + * some backoff delay. That will avoid increasing server workload/congestion and + * avoid client RPC timeout during server retry repeatedly. + */ + if (++retry > 1) { + rc = -DER_INPROGRESS; + break; + } + /* Only standalone updates use this RPC. Retry with newer epoch. 
*/ orw->orw_epoch = d_hlc_get(); exec_arg.flags |= ORF_RESEND; @@ -4023,6 +4035,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) uint32_t max_ver = 0; struct dtx_epoch epoch; int rc; + int retry = 0; bool need_abort = false; opi = crt_req_get(rpc); @@ -4163,6 +4176,17 @@ ds_obj_punch_handler(crt_rpc_t *rpc) break; } + /* If we have already retried once, but still failed for -DER_TX_RESTART, then + * it is quite possible that the -DER_TX_RESTART failure is related with server + * overload or some congestion caused RPC delay. Let's ask client to retry with + * some backoff delay. That will avoid increasing server workload/congestion and + * avoid client RPC timeout during server retry repeatedly. + */ + if (++retry > 1) { + rc = -DER_INPROGRESS; + break; + } + /* Only standalone punches use this RPC. Retry with newer epoch. */ opi->opi_epoch = d_hlc_get(); exec_arg.flags |= ORF_RESEND;