From 72b6f6bb9737f5bf065e1ef70902342c0f11e013 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Mon, 22 Jun 2026 10:39:03 +0200 Subject: [PATCH 1/6] fix(rs-dapi-client): rotate instead of ban on ResourceExhausted rate-limits ResourceExhausted is a congestion/backpressure signal, not endpoint ill-health. Banning a rate-limited (but healthy) node relocates its load onto survivors and cascades to NoAvailableAddressesToRetry. Now RE is classified as rate-limited: the node is not banned, the retry rotates to a different node via explicit exclusion, and the existing bounded retry count bounds an over-limit client. Genuine-failure ban path (60s x e^n) is unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/rs-dapi-client/src/address_list.rs | 145 +++++++++++- packages/rs-dapi-client/src/dapi_client.rs | 206 ++++++++++++++++- packages/rs-dapi-client/src/executor.rs | 4 + packages/rs-dapi-client/src/lib.rs | 13 ++ packages/rs-dapi-client/src/transport.rs | 56 +++++ packages/rs-dapi-client/src/transport/grpc.rs | 8 + .../tests/rate_limit_rotation.rs | 207 ++++++++++++++++++ 7 files changed, 635 insertions(+), 4 deletions(-) create mode 100644 packages/rs-dapi-client/tests/rate_limit_rotation.rs diff --git a/packages/rs-dapi-client/src/address_list.rs b/packages/rs-dapi-client/src/address_list.rs index fc1d734ccfe..d299a20f968 100644 --- a/packages/rs-dapi-client/src/address_list.rs +++ b/packages/rs-dapi-client/src/address_list.rs @@ -239,6 +239,20 @@ impl AddressList { /// Randomly select a not banned address. pub fn get_live_address(&self) -> Option
{ + self.get_live_address_excluding(&[]) + } + + /// Randomly select a not-banned address that is not in `exclude`. + /// + /// Uses the same ban filtering as [`Self::get_live_address`] (skips + /// addresses whose ban period has not yet expired) but additionally skips + /// any address present in `exclude`. The retry loop uses this to rotate to + /// a *different* node after a failure — in particular after a rate-limit, + /// where the just-tried node is healthy and stays in the pool, so plain + /// ban-filtering would happily pick it again. + /// + /// Passing an empty slice is equivalent to [`Self::get_live_address`]. + pub fn get_live_address_excluding(&self, exclude: &[Address]) -> Option
{ let guard = self.addresses.read().unwrap(); let mut rng = SmallRng::from_entropy(); @@ -247,11 +261,12 @@ impl AddressList { guard .iter() - .filter(|(_, status)| { + .filter(|(addr, status)| { status .banned_until .map(|banned_until| banned_until < now) .unwrap_or(true) + && !exclude.contains(addr) }) .choose(&mut rng) .map(|(addr, _)| addr.clone()) @@ -755,4 +770,132 @@ mod tests { let list = AddressList::new(); assert!(list.ban_info().is_empty()); } + + #[test] + fn test_get_live_address_excluding_skips_excluded() { + let mut list = AddressList::new(); + let addr1: Address = "http://127.0.0.1:3000".parse().unwrap(); + let addr2: Address = "http://127.0.0.1:3001".parse().unwrap(); + let addr3: Address = "http://127.0.0.1:3002".parse().unwrap(); + list.add(addr1.clone()); + list.add(addr2.clone()); + list.add(addr3.clone()); + + // Excluding two leaves exactly the third, deterministically. + let exclude = [addr1, addr2]; + for _ in 0..20 { + let selected = list + .get_live_address_excluding(&exclude) + .expect("one address left"); + assert_eq!(selected, addr3); + } + } + + #[test] + fn test_get_live_address_excluding_empty_slice_equivalent_to_get_live_address() { + let mut list = AddressList::new(); + list.add("http://127.0.0.1:3000".parse().unwrap()); + assert!(list.get_live_address_excluding(&[]).is_some()); + assert!(list.get_live_address().is_some()); + } + + #[test] + fn test_get_live_address_excluding_all_excluded_returns_none() { + let mut list = AddressList::new(); + let addr1: Address = "http://127.0.0.1:3000".parse().unwrap(); + let addr2: Address = "http://127.0.0.1:3001".parse().unwrap(); + list.add(addr1.clone()); + list.add(addr2.clone()); + + let exclude = [addr1, addr2]; + assert!(list.get_live_address_excluding(&exclude).is_none()); + } + + #[test] + fn test_get_live_address_excluding_still_skips_banned() { + let mut list = AddressList::new(); + let addr1: Address = "http://127.0.0.1:3000".parse().unwrap(); + let addr2: Address = "http://127.0.0.1:3001".parse().unwrap(); + list.add(addr1.clone()); + list.add(addr2.clone()); + list.ban(&addr1); // addr1 banned → filtered out + + // addr1 is banned and addr2 is excluded → nothing live remains. + assert!(list + .get_live_address_excluding(std::slice::from_ref(&addr2)) + .is_none()); + // Without exclusion, only the unbanned addr2 is live. + assert_eq!(list.get_live_address_excluding(&[]), Some(addr2)); + } + + /// Mirrors the retry loop's selection algorithm: + /// `get_live_address_excluding(&tried).or_else(|| get_live_address())`. + /// With N healthy (unbanned) nodes, N consecutive selections must visit N + /// distinct nodes — i.e. each retry rotates off the just-tried node — and + /// once exclusion empties the pool, the graceful fallback still yields a + /// node rather than erroring. + #[test] + fn test_retry_rotation_visits_distinct_nodes_then_falls_back() { + let mut list = AddressList::new(); + let addrs: Vec
= (3000..3003) + .map(|p| format!("http://127.0.0.1:{p}").parse().unwrap()) + .collect(); + for a in &addrs { + list.add(a.clone()); + } + + let mut tried: Vec
= Vec::new(); + for _ in 0..addrs.len() { + let selected = list + .get_live_address_excluding(&tried) + .or_else(|| list.get_live_address()) + .expect("address available"); + assert!( + !tried.contains(&selected), + "retry must rotate off an already-tried node" + ); + tried.push(selected); + } + assert_eq!(tried.len(), addrs.len(), "every node visited exactly once"); + + // Pool exhausted by exclusion → graceful fallback keeps it non-empty so + // a healthy-but-throttled node can still be retried. + let fallback = list + .get_live_address_excluding(&tried) + .or_else(|| list.get_live_address()); + assert!(fallback.is_some(), "fallback must keep the pool non-empty"); + } + + /// Invariant 1 at the ladder source: the exponential ban window is + /// `base × e^ban_count`, `ban_count` incrementing on each ban. This pins the + /// exact formula independently of the `update_address_ban_status` entrypoint. + #[test] + fn test_ban_ladder_windows_match_exponential_formula() { + let mut status = AddressStatus::default(); + let base_secs = 60.0_f64; + let base = Duration::from_secs(60); + + for n in 0..3usize { + // coefficient uses ban_count BEFORE this ban (== n here). + let before = chrono::Utc::now(); + status.ban(&base); + let after = chrono::Utc::now(); + + assert_eq!(status.ban_count, n + 1, "ban_count must increment"); + let period = base_secs * (n as f64).exp(); + let banned_until = status.banned_until.expect("banned_until is set"); + let lower = (banned_until - before).num_milliseconds() as f64 / 1000.0; + let upper = (banned_until - after).num_milliseconds() as f64 / 1000.0; + assert!( + lower >= period - 0.05, + "ban #{} window lower bound {lower}s < expected {period}s", + n + 1 + ); + assert!( + upper <= period + 0.05, + "ban #{} window upper bound {upper}s > expected {period}s", + n + 1 + ); + } + } } diff --git a/packages/rs-dapi-client/src/dapi_client.rs b/packages/rs-dapi-client/src/dapi_client.rs index 72ccf31c378..34952e9a49a 100644 --- a/packages/rs-dapi-client/src/dapi_client.rs +++ b/packages/rs-dapi-client/src/dapi_client.rs @@ -67,6 +67,18 @@ impl CanRetry for DapiClientError { DapiClientError::NoAvailableAddresses | DapiClientError::NoAvailableAddressesToRetry(_) ) } + + fn is_rate_limited(&self) -> bool { + use DapiClientError::*; + match self { + NoAvailableAddresses => false, + NoAvailableAddressesToRetry(_) => false, + Transport(transport_error) => transport_error.is_rate_limited(), + AddressList(_) => false, + #[cfg(feature = "mocks")] + Mock(_) => false, + } + } } /// Serialization of [DapiClientError]. @@ -185,7 +197,19 @@ pub fn update_address_ban_status( } Err(error) => { if error.can_retry() { - if let Some(address) = error.address.as_ref() { + if error.is_rate_limited() { + // ResourceExhausted is congestion/backpressure, not endpoint + // ill-health. Banning a healthy-but-throttled node shifts its + // load onto the survivors and cascades into + // NoAvailableAddressesToRetry. Keep it in the pool; the retry + // loop rotates to a different node via explicit exclusion. The + // bounded retry count bounds an over-limit client. + tracing::debug!( + ?error, + address = ?error.address, + "not banning rate-limited (ResourceExhausted) address; will rotate to another node" + ); + } else if let Some(address) = error.address.as_ref() { if applied_settings.ban_failed_address { if address_list.ban_with_reason(address, Some(error.to_string())) { tracing::warn!( @@ -276,6 +300,54 @@ mod tests { assert!(!err.can_retry()); } + #[test] + fn test_dapi_client_error_is_rate_limited() { + // Transport ResourceExhausted → rate-limited (but still retryable). + let rate_limited = DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::resource_exhausted("429"), + )); + assert!(rate_limited.is_rate_limited()); + assert!(rate_limited.can_retry()); + + // Other transport codes are not rate-limited. + let unavailable = DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::unavailable("down"), + )); + assert!(!unavailable.is_rate_limited()); + + // Non-transport variants are never rate-limited. + assert!(!DapiClientError::NoAvailableAddresses.is_rate_limited()); + let to_retry = DapiClientError::NoAvailableAddressesToRetry(Box::new( + TransportError::Grpc(dapi_grpc::tonic::Status::resource_exhausted("429")), + )); + assert!(!to_retry.is_rate_limited()); + assert!( + !DapiClientError::AddressList(AddressListError::InvalidAddressUri("bad".to_string())) + .is_rate_limited() + ); + } + + #[test] + fn test_execution_error_is_rate_limited_delegates() { + let err: ExecutionError = ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::resource_exhausted("429"), + )), + retries: 0, + address: Some(mock_address()), + }; + assert!(err.is_rate_limited()); + + let err: ExecutionError = ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::internal("boom"), + )), + retries: 0, + address: Some(mock_address()), + }; + assert!(!err.is_rate_limited()); + } + #[cfg(feature = "mocks")] #[test] fn test_can_retry_mock_error() { @@ -364,6 +436,114 @@ mod tests { ); } + /// Invariant 1: the genuine-failure health-ban ladder is unchanged. + /// + /// A genuine failure (Unavailable / Internal) must still ban the node with + /// the `60s × e^ban_count` exponential window, `ban_count` incrementing on + /// each ban — byte-for-byte the pre-existing behavior. + #[test] + fn test_invariant_genuine_failure_still_bans_with_unchanged_ladder() { + let mut address_list = AddressList::new(); // base ban period = 60s + let addr = mock_address(); + address_list.add(addr.clone()); + + let base_secs = 60.0_f64; // DEFAULT_BASE_BAN_PERIOD + + for expected_count in 1..=3usize { + // Alternate genuine-failure codes to prove the ladder is not + // code-specific (both are non-rate-limited, retryable failures). + let status = if expected_count % 2 == 1 { + dapi_grpc::tonic::Status::unavailable("node down") + } else { + dapi_grpc::tonic::Status::internal("boom") + }; + + let before = chrono::Utc::now(); + let result: ExecutionResult = Err(ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc(status)), + retries: 0, + address: Some(addr.clone()), + }); + update_address_ban_status(&address_list, &result, &make_applied_settings(true)); + let after = chrono::Utc::now(); + + let info = address_list.ban_info(); + let entry = info + .iter() + .find(|i| i.uri.contains("3000")) + .expect("address present in ban info"); + + assert!(entry.banned, "genuine failure must ban the node"); + assert_eq!( + entry.ban_count, expected_count, + "ban_count must increment on each genuine failure" + ); + + // Window must equal base × e^(ban_count - 1). Since + // banned_until = t_call + period with before <= t_call <= after: + // (banned_until - before) >= period and + // (banned_until - after) <= period. + let period = base_secs * ((expected_count - 1) as f64).exp(); + let banned_until = entry.banned_until.expect("banned_until is set"); + let lower = (banned_until - before).num_milliseconds() as f64 / 1000.0; + let upper = (banned_until - after).num_milliseconds() as f64 / 1000.0; + assert!( + lower >= period - 0.05, + "ban #{expected_count} window lower bound {lower}s < expected {period}s", + ); + assert!( + upper <= period + 0.05, + "ban #{expected_count} window upper bound {upper}s > expected {period}s", + ); + } + } + + /// Invariant 2: congestion can never empty the live pool. + /// + /// Repeated `ResourceExhausted` across *every* address must never ban any of + /// them — `banned_until` stays `None`, `ban_count` stays 0 — and + /// `get_live_address` keeps returning `Some`. + #[test] + fn test_invariant_rate_limit_never_bans_and_pool_stays_live() { + let mut address_list = AddressList::new(); + let addresses: Vec = (3000..3003) + .map(|p| { + format!("http://127.0.0.1:{p}") + .parse() + .expect("valid address") + }) + .collect(); + for addr in &addresses { + address_list.add(addr.clone()); + } + + // Hammer every address with ResourceExhausted across many rounds. + for _ in 0..10 { + for addr in &addresses { + let result: ExecutionResult = Err(ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::resource_exhausted("429"), + )), + retries: 0, + address: Some(addr.clone()), + }); + update_address_ban_status(&address_list, &result, &make_applied_settings(true)); + } + } + + for info in address_list.ban_info() { + assert!(!info.banned, "address {} must not be banned", info.uri); + assert_eq!(info.ban_count, 0, "ban_count must stay 0 for {}", info.uri); + assert!( + info.banned_until.is_none(), + "banned_until must stay None for {}", + info.uri + ); + } + // The pool is never emptied by congestion. + assert!(address_list.get_live_address().is_some()); + } + #[test] fn test_update_address_ban_status_retryable_error_ban_disabled() { let mut address_list = AddressList::new(); @@ -561,11 +741,25 @@ impl DapiRequestExecutor for DapiClient { let mut retries: usize = 0; // Track the last transport error for when all addresses get exhausted let mut last_transport_error: Option = None; + // Addresses already tried in this execution. We exclude them when + // selecting the next node so a retry rotates to a *different* node — + // crucial for rate-limited (ResourceExhausted) nodes, which stay in + // the pool (not banned) and would otherwise be picked again. + let mut tried: Vec = Vec::new(); let result: ExecutionResult = async { loop { - // Try to get an address to initialize transport on: - let Some(address) = self.address_list.get_live_address() else { + // Try to get an address to initialize transport on, rotating off + // the nodes already tried in this execution. If excluding the + // tried set empties the live pool (e.g. a single-node pool, or + // every live node already tried), fall back to the non-excluding + // selection so a healthy-but-throttled node can still be retried + // rather than surfacing a spurious NoAvailableAddresses. + let Some(address) = self + .address_list + .get_live_address_excluding(&tried) + .or_else(|| self.address_list.get_live_address()) + else { // No available addresses - wrap with last meaningful error if we have one let error = if let Some(transport_error) = last_transport_error.take() { tracing::debug!( @@ -635,6 +829,9 @@ impl DapiRequestExecutor for DapiClient { // Store last transport error last_transport_error = Some(cloned_error); + // Rotate off this node on the next attempt. + tried.push(address.clone()); + retries += 1; tracing::warn!( error = ?execution_error, @@ -694,6 +891,9 @@ impl DapiRequestExecutor for DapiClient { last_transport_error = Some(te.clone()); } + // Rotate off this node on the next attempt. + tried.push(address.clone()); + retries += 1; tracing::warn!( ?error, diff --git a/packages/rs-dapi-client/src/executor.rs b/packages/rs-dapi-client/src/executor.rs index 963383742ae..0bd816bcae5 100644 --- a/packages/rs-dapi-client/src/executor.rs +++ b/packages/rs-dapi-client/src/executor.rs @@ -83,6 +83,10 @@ impl CanRetry for ExecutionError { fn is_no_available_addresses(&self) -> bool { self.inner.is_no_available_addresses() } + + fn is_rate_limited(&self) -> bool { + self.inner.is_rate_limited() + } } /// Request execution response. diff --git a/packages/rs-dapi-client/src/lib.rs b/packages/rs-dapi-client/src/lib.rs index b31d2b35a2c..0d99b6b2b06 100644 --- a/packages/rs-dapi-client/src/lib.rs +++ b/packages/rs-dapi-client/src/lib.rs @@ -95,6 +95,19 @@ pub trait CanRetry { false } + /// Returns true if this error is a rate-limit / congestion signal + /// (gRPC `ResourceExhausted`). + /// + /// Rate-limit errors are retryable (see [`CanRetry::can_retry`]) but, + /// unlike genuine node ill-health, the node must NOT be banned: it is + /// healthy, just throttled. Banning it would shift its load onto the + /// remaining nodes and cascade into `NoAvailableAddressesToRetry`. + /// Instead the retry should rotate to a *different* node and leave the + /// throttled one in the pool. See `update_address_ban_status`. + fn is_rate_limited(&self) -> bool { + false + } + /// Get boolean flag that indicates if the error is retryable. /// /// Deprecated in favor of [CanRetry::can_retry]. diff --git a/packages/rs-dapi-client/src/transport.rs b/packages/rs-dapi-client/src/transport.rs index f488aeaffa2..c96ceeabd7c 100644 --- a/packages/rs-dapi-client/src/transport.rs +++ b/packages/rs-dapi-client/src/transport.rs @@ -106,6 +106,12 @@ impl CanRetry for TransportError { TransportError::Grpc(status) => status.can_retry(), } } + + fn is_rate_limited(&self) -> bool { + match self { + TransportError::Grpc(status) => status.is_rate_limited(), + } + } } /// Serialization of [TransportError]. @@ -212,6 +218,56 @@ mod tests { assert!(!non_retryable.can_retry()); } + #[test] + fn test_tonic_status_is_rate_limited_only_resource_exhausted() { + // Exactly one code is a rate-limit signal. + assert!(dapi_grpc::tonic::Status::new(Code::ResourceExhausted, "429").is_rate_limited()); + + // Every other code — including the *other* retryable ones and + // DeadlineExceeded — must stay false. Widening this set would + // reintroduce the reverted multi-code classifier. + let not_rate_limited = [ + Code::Ok, + Code::Cancelled, + Code::Unknown, + Code::InvalidArgument, + Code::DeadlineExceeded, + Code::NotFound, + Code::AlreadyExists, + Code::PermissionDenied, + Code::FailedPrecondition, + Code::Aborted, + Code::OutOfRange, + Code::Unimplemented, + Code::Internal, + Code::Unavailable, + Code::DataLoss, + Code::Unauthenticated, + ]; + for code in not_rate_limited { + assert!( + !dapi_grpc::tonic::Status::new(code, "x").is_rate_limited(), + "code {:?} must NOT be classified as rate-limited", + code + ); + } + } + + #[test] + fn test_transport_error_is_rate_limited_delegates() { + let rate_limited = TransportError::Grpc(dapi_grpc::tonic::Status::new( + Code::ResourceExhausted, + "429", + )); + assert!(rate_limited.is_rate_limited()); + // Rate-limited is still retryable; only the ban decision differs. + assert!(rate_limited.can_retry()); + + let unavailable = TransportError::Grpc(dapi_grpc::tonic::Status::unavailable("down")); + assert!(!unavailable.is_rate_limited()); + assert!(unavailable.can_retry()); + } + #[test] fn test_transport_error_clone() { let original = TransportError::Grpc(dapi_grpc::tonic::Status::unavailable("test message")); diff --git a/packages/rs-dapi-client/src/transport/grpc.rs b/packages/rs-dapi-client/src/transport/grpc.rs index 68e0c5ae684..22d13b5c6b7 100644 --- a/packages/rs-dapi-client/src/transport/grpc.rs +++ b/packages/rs-dapi-client/src/transport/grpc.rs @@ -146,6 +146,14 @@ impl CanRetry for dapi_grpc::tonic::Status { | Unimplemented ) } + + fn is_rate_limited(&self) -> bool { + // ResourceExhausted is the gRPC mapping of an upstream rate-limit / + // backpressure (e.g. Envoy per-IP 429). It is retryable but the node + // is healthy, so it must rotate rather than ban (see + // `update_address_ban_status`). Exactly one code — do NOT widen this. + self.code() == dapi_grpc::tonic::Code::ResourceExhausted + } } /// Macro to implement the `TransportRequest` trait for a given request type, response type, client type, and settings. diff --git a/packages/rs-dapi-client/tests/rate_limit_rotation.rs b/packages/rs-dapi-client/tests/rate_limit_rotation.rs new file mode 100644 index 00000000000..8d04690b451 --- /dev/null +++ b/packages/rs-dapi-client/tests/rate_limit_rotation.rs @@ -0,0 +1,207 @@ +//! Failover behavior for gRPC `ResourceExhausted` (rate-limit / backpressure). +//! +//! Unlike a genuine node failure, a rate-limited node is *healthy* — it is just +//! throttled. The executor must therefore NOT ban it (banning would relocate +//! its load onto the survivors and cascade into `NoAvailableAddressesToRetry`). +//! Instead the retry must rotate to a *different* node, leaving the throttled +//! one in the live pool. The bounded retry count bounds an over-limit client. +//! +//! These tests drive the real `DapiClient::execute` retry loop through a fake +//! transport, complementing the unit tests in `src/`. + +use std::sync::{Arc, Mutex}; + +use dapi_grpc::mock::Mockable; +use dapi_grpc::tonic::Status; +use rs_dapi_client::transport::{ + AppliedRequestSettings, BoxFuture, TransportClient, TransportError, TransportRequest, +}; +use rs_dapi_client::{ + Address, AddressList, CanRetry, ConnectionPool, DapiClient, DapiClientError, + DapiRequestExecutor, RequestSettings, Uri, +}; + +/// Transport client that only remembers which node it was created for. +struct FakeClient { + uri: Uri, +} + +impl TransportClient for FakeClient { + fn with_uri(uri: Uri, _pool: &ConnectionPool) -> Result { + Ok(Self { uri }) + } + + fn with_uri_and_settings( + uri: Uri, + _settings: &AppliedRequestSettings, + _pool: &ConnectionPool, + ) -> Result { + Ok(Self { uri }) + } +} + +#[derive(Debug)] +struct FakeResponse; + +impl Mockable for FakeResponse {} + +#[derive(Debug, Default)] +struct State { + /// How many more attempts should be answered with RESOURCE_EXHAUSTED, + /// simulating a throttled (but healthy) node. + rate_limited_responses_left: usize, + /// Nodes that answered RESOURCE_EXHAUSTED, in order. + rate_limited_uris: Vec, +} + +/// Request that answers RESOURCE_EXHAUSTED for the first N attempts and +/// succeeds afterwards, recording which node served each throttled attempt. +#[derive(Clone, Debug)] +struct RateLimitedRequest { + state: Arc>, +} + +impl RateLimitedRequest { + fn with_rate_limited_responses(count: usize) -> Self { + Self { + state: Arc::new(Mutex::new(State { + rate_limited_responses_left: count, + rate_limited_uris: Vec::new(), + })), + } + } +} + +impl Mockable for RateLimitedRequest {} + +impl TransportRequest for RateLimitedRequest { + type Client = FakeClient; + type Response = FakeResponse; + + const SETTINGS_OVERRIDES: RequestSettings = RequestSettings::default(); + + fn method_name(&self) -> &'static str { + "fake_rate_limited_method" + } + + fn execute_transport<'c>( + self, + client: &'c mut Self::Client, + _settings: &AppliedRequestSettings, + ) -> BoxFuture<'c, Result> { + let result = { + let mut state = self.state.lock().unwrap(); + if state.rate_limited_responses_left > 0 { + state.rate_limited_responses_left -= 1; + state.rate_limited_uris.push(client.uri.clone()); + Err(TransportError::Grpc(Status::resource_exhausted( + "rate limit exceeded", + ))) + } else { + Ok(FakeResponse) + } + }; + + Box::pin(async move { result }) + } +} + +/// A rate-limited node must NOT be banned, and the retry must rotate to a +/// different node, where the request succeeds. +#[tokio::test] +async fn rate_limited_node_is_not_banned_and_request_rotates_to_another() { + // One throttled response, then success on the rotated-to node. + let request = RateLimitedRequest::with_rate_limited_responses(1); + let address_list: AddressList = "http://127.0.0.1:20001,http://127.0.0.1:20002" + .parse() + .expect("valid address list"); + let client = DapiClient::new(address_list, RequestSettings::default()); + + let response = client + .execute(request.clone(), RequestSettings::default()) + .await + .expect("request should succeed on the rotated-to node"); + + let throttled_uri = { + let state = request.state.lock().unwrap(); + assert_eq!( + state.rate_limited_uris.len(), + 1, + "exactly one node should have answered RESOURCE_EXHAUSTED" + ); + state.rate_limited_uris[0].clone() + }; + + assert_eq!(response.retries, 1); + assert_ne!( + response.address.uri(), + &throttled_uri, + "retry must rotate to a different node than the throttled one" + ); + + // The throttled node stays healthy in the pool — it must NOT be banned. + let throttled = Address::try_from(throttled_uri).expect("valid address"); + assert!( + !client.address_list().is_banned(&throttled), + "a rate-limited node must NOT be banned" + ); +} + +/// Sustained congestion across *every* node must never ban any of them and must +/// never empty the live pool. The loop is bounded by the retry budget and +/// surfaces the raw, still-retryable `ResourceExhausted`. +#[tokio::test] +async fn congestion_never_bans_nodes_nor_empties_pool() { + let settings = RequestSettings { + retries: Some(3), + ..Default::default() + }; + + // Every attempt is throttled. + let request = RateLimitedRequest::with_rate_limited_responses(usize::MAX); + let address_list: AddressList = "http://127.0.0.1:20003,http://127.0.0.1:20004" + .parse() + .expect("valid address list"); + let client = DapiClient::new(address_list, settings); + + let error = client + .execute(request.clone(), settings) + .await + .expect_err("request must fail when every node is throttled"); + + // Bounded purely by the retry budget (no banning, no address exhaustion): + // initial attempt + 3 retries = 4 attempts. + assert_eq!( + request.state.lock().unwrap().rate_limited_uris.len(), + 4, + "attempts must be bounded by retry budget (retries + 1), not by banning" + ); + + // No node was banned, so the pool is never emptied by congestion. + for info in client.address_list().ban_info() { + assert!(!info.banned, "address {} must not be banned", info.uri); + assert_eq!(info.ban_count, 0, "ban_count must stay 0 for {}", info.uri); + assert!( + info.banned_until.is_none(), + "banned_until must stay None for {}", + info.uri + ); + } + assert!( + client.address_list().get_live_address().is_some(), + "congestion must never empty the live pool" + ); + + // The raw ResourceExhausted surfaces and stays retryable — it never + // collapses into the non-retryable NoAvailableAddressesToRetry. + assert!( + error.can_retry(), + "surfaced ResourceExhausted must stay retryable" + ); + match error.inner { + DapiClientError::Transport(TransportError::Grpc(status)) => { + assert_eq!(status.code(), dapi_grpc::tonic::Code::ResourceExhausted) + } + other => panic!("expected raw Transport(Grpc(ResourceExhausted)), got: {other:?}"), + } +} From 396865fc4753821630c8d9dfc89f20e01573290b Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Mon, 22 Jun 2026 16:55:00 +0200 Subject: [PATCH 2/6] fix(rs-dapi-client): exclude throttled node + backoff/jitter on rate-limit retry, lock can_retry invariant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three review fixes for the rotate-instead-of-ban rate-limit handling. M1 (invariant lock): the rotate-don't-ban behavior depends on `is_rate_limited() => can_retry()`. Add a debug_assert! at the rotation decision in `execute` and an exhaustive property test (`test_rate_limit_implies_retryable_invariant`) covering every gRPC code across tonic::Status, TransportError, DapiClientError and ExecutionError, so a future widening of `is_rate_limited` (or narrowing of `can_retry`) fails loudly instead of silently killing rotation. M2 (rotate off the throttled node + backoff): on a rate-limited retry the selection no longer re-picks the just-throttled node in small pools — the fallback now excludes the just-tried address and only reuses it when it is the single live node. The rate-limit retry path also replaces the flat 10ms delay with capped exponential backoff (base 10ms, x2 per attempt, capped at 500ms) plus full jitter, so a congested fleet de-correlates instead of retrying in lockstep. Genuine-failure delays are unchanged. Server-side RetryInfo is left as a documented TODO (needs metadata plumbing). M3 (rs-sdk propagation): override `is_rate_limited` on the SDK `Error` to delegate to the wrapped DapiClientError, so the rotate-don't-ban semantics survive at the SDK layer independently of how `can_retry` is defined. Pure future-proofing (today the can_retry guard short-circuits first); locked with a regression test. Co-Authored-By: Claude Opus 4.6 --- packages/rs-dapi-client/src/address_list.rs | 37 ++- packages/rs-dapi-client/src/dapi_client.rs | 253 +++++++++++++++++++- packages/rs-sdk/src/error.rs | 44 ++++ 3 files changed, 311 insertions(+), 23 deletions(-) diff --git a/packages/rs-dapi-client/src/address_list.rs b/packages/rs-dapi-client/src/address_list.rs index d299a20f968..74f880f08b1 100644 --- a/packages/rs-dapi-client/src/address_list.rs +++ b/packages/rs-dapi-client/src/address_list.rs @@ -253,6 +253,8 @@ impl AddressList { /// /// Passing an empty slice is equivalent to [`Self::get_live_address`]. pub fn get_live_address_excluding(&self, exclude: &[Address]) -> Option
{ + // TODO(low): module-wide `.read()/.write().unwrap()` panics on a poisoned + // lock; adopt poison-tolerant locking consistently (known LOW, SEC-003). let guard = self.addresses.read().unwrap(); let mut rng = SmallRng::from_entropy(); @@ -828,8 +830,12 @@ mod tests { assert_eq!(list.get_live_address_excluding(&[]), Some(addr2)); } - /// Mirrors the retry loop's selection algorithm: - /// `get_live_address_excluding(&tried).or_else(|| get_live_address())`. + /// Mirrors the retry loop's three-tier selection algorithm: + /// 1. `get_live_address_excluding(&tried)` — rotate off every tried node; + /// 2. else `get_live_address_excluding(&[just_tried])` — at least rotate + /// off the node we *just* failed on (small-pool alternation); + /// 3. else `get_live_address()` — reuse (single-node pool). + /// /// With N healthy (unbanned) nodes, N consecutive selections must visit N /// distinct nodes — i.e. each retry rotates off the just-tried node — and /// once exclusion empties the pool, the graceful fallback still yields a @@ -844,12 +850,19 @@ mod tests { list.add(a.clone()); } + let select = |list: &AddressList, tried: &[Address]| { + list.get_live_address_excluding(tried) + .or_else(|| { + tried.last().and_then(|last| { + list.get_live_address_excluding(std::slice::from_ref(last)) + }) + }) + .or_else(|| list.get_live_address()) + }; + let mut tried: Vec
= Vec::new(); for _ in 0..addrs.len() { - let selected = list - .get_live_address_excluding(&tried) - .or_else(|| list.get_live_address()) - .expect("address available"); + let selected = select(&list, &tried).expect("address available"); assert!( !tried.contains(&selected), "retry must rotate off an already-tried node" @@ -859,11 +872,15 @@ mod tests { assert_eq!(tried.len(), addrs.len(), "every node visited exactly once"); // Pool exhausted by exclusion → graceful fallback keeps it non-empty so - // a healthy-but-throttled node can still be retried. - let fallback = list - .get_live_address_excluding(&tried) - .or_else(|| list.get_live_address()); + // a healthy-but-throttled node can still be retried; the just-tried node + // is never the one re-selected while another node is available. + let fallback = select(&list, &tried); assert!(fallback.is_some(), "fallback must keep the pool non-empty"); + assert_ne!( + fallback.as_ref(), + tried.last(), + "fallback must rotate off the just-tried node when another exists" + ); } /// Invariant 1 at the ladder source: the exponential ban window is diff --git a/packages/rs-dapi-client/src/dapi_client.rs b/packages/rs-dapi-client/src/dapi_client.rs index 34952e9a49a..818893f2e1a 100644 --- a/packages/rs-dapi-client/src/dapi_client.rs +++ b/packages/rs-dapi-client/src/dapi_client.rs @@ -4,6 +4,7 @@ use dapi_grpc::mock::Mockable; use dapi_grpc::tonic::async_trait; #[cfg(not(target_arch = "wasm32"))] use dapi_grpc::tonic::transport::Certificate; +use rand::{rngs::SmallRng, Rng, SeedableRng}; use std::fmt::{Debug, Display}; use std::time::Duration; use tracing::Instrument; @@ -18,6 +19,59 @@ use crate::{ RequestSettings, }; +/// Base delay between retries. Genuine (non-rate-limited) failures keep this +/// flat delay — unchanged from the pre-existing behavior. +const RETRY_BASE_DELAY_MS: u64 = 10; + +/// Upper bound for the rate-limit backoff window. Caps the capped-exponential +/// growth so a throttled fleet gets breathing room without stalling the call +/// for too long. +const RATE_LIMIT_MAX_DELAY_MS: u64 = 500; + +/// Capped-exponential backoff window for a rate-limited retry: `base × 2^attempt`, +/// saturating at [`RATE_LIMIT_MAX_DELAY_MS`]. +/// +/// Pure (no jitter) so the growth/cap can be reasoned about and tested directly; +/// jitter is applied on top by [`retry_delay`]. `attempt` is the 0-based retry +/// index (0 for the first retry). +fn rate_limit_backoff_window(attempt: u32) -> Duration { + // `base << attempt` == `base * 2^attempt`; `checked_shl` guards against a + // huge `attempt` (e.g. a caller configuring an enormous retry budget). + let window_ms = RETRY_BASE_DELAY_MS + .checked_shl(attempt) + .unwrap_or(u64::MAX) + .min(RATE_LIMIT_MAX_DELAY_MS); + Duration::from_millis(window_ms) +} + +/// Delay to wait before the next retry attempt. +/// +/// * Genuine failures keep the pre-existing flat [`RETRY_BASE_DELAY_MS`] delay. +/// * Rate-limit (`ResourceExhausted`) retries use **capped exponential backoff +/// with full jitter**: the window grows as `base × 2^attempt`, saturating at +/// [`RATE_LIMIT_MAX_DELAY_MS`], and the actual sleep is a uniform-random +/// fraction of that window. Jitter de-correlates clients so a congested fleet +/// is not retried in lockstep (a synchronized retry storm); the growth + cap +/// bound the per-call amplification. +/// +/// `attempt` is the 0-based retry index (0 for the first retry). +// +// TODO(rate-limit): when the server attaches `google.rpc.RetryInfo` to the +// `ResourceExhausted` status, honor its `retry_delay` as the floor for this +// sleep. Extracting it needs decoding `grpc-status-details-bin`, so it is left +// as a follow-up rather than plumbed here. +fn retry_delay(rate_limited: bool, attempt: u32) -> Duration { + if !rate_limited { + return Duration::from_millis(RETRY_BASE_DELAY_MS); + } + + let window = rate_limit_backoff_window(attempt); + // Full jitter: sleep a uniform-random fraction of the window, i.e. a value + // in `[0, window)`. + let jitter: f64 = SmallRng::from_entropy().gen(); + window.mul_f64(jitter) +} + /// General DAPI request error type. #[derive(Debug, thiserror::Error, Clone)] #[cfg_attr(feature = "mocks", derive(serde::Serialize, serde::Deserialize))] @@ -348,6 +402,152 @@ mod tests { assert!(!err.is_rate_limited()); } + /// Property: the load-bearing `is_rate_limited() ⇒ can_retry()` invariant + /// holds for *every* gRPC status code across *every* in-crate `CanRetry` + /// implementor (`tonic::Status`, `TransportError`, `DapiClientError`, + /// `ExecutionError`). + /// + /// The rotate-don't-ban behavior depends on this: a rate-limited error that + /// is NOT retryable would skip the retry loop, silently killing rotation + /// (see the `CanRetry` docs and the `debug_assert!` in `execute`). This test + /// pins the invariant so any future widening of `is_rate_limited` (or + /// narrowing of `can_retry`) that breaks it fails loudly here. + #[test] + fn test_rate_limit_implies_retryable_invariant() { + use dapi_grpc::tonic::Code::*; + + // The full gRPC code set — keep exhaustive so a newly classified code + // can never silently dodge the invariant check. + let all_codes = [ + Ok, + Cancelled, + Unknown, + InvalidArgument, + DeadlineExceeded, + NotFound, + AlreadyExists, + PermissionDenied, + ResourceExhausted, + FailedPrecondition, + Aborted, + OutOfRange, + Unimplemented, + Internal, + Unavailable, + DataLoss, + Unauthenticated, + ]; + + let addr = mock_address(); + + for code in all_codes { + // `tonic::Status` is not `Clone`, so build a fresh one per layer. + let status = dapi_grpc::tonic::Status::new(code, "x"); + assert!( + !status.is_rate_limited() || status.can_retry(), + "tonic::Status {code:?}: is_rate_limited() must imply can_retry()" + ); + + let te = TransportError::Grpc(dapi_grpc::tonic::Status::new(code, "x")); + assert!( + !te.is_rate_limited() || te.can_retry(), + "TransportError {code:?}: is_rate_limited() must imply can_retry()" + ); + + let dce = DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::new(code, "x"), + )); + assert!( + !dce.is_rate_limited() || dce.can_retry(), + "DapiClientError {code:?}: is_rate_limited() must imply can_retry()" + ); + + let ee: ExecutionError = ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::new(code, "x"), + )), + retries: 0, + address: Some(addr.clone()), + }; + assert!( + !ee.is_rate_limited() || ee.can_retry(), + "ExecutionError {code:?}: is_rate_limited() must imply can_retry()" + ); + } + + // Guard the non-vacuous case: ResourceExhausted must actually be BOTH + // rate-limited AND retryable, otherwise the invariant could hold only + // because nothing is ever rate-limited (the feature would be dead). + let re = dapi_grpc::tonic::Status::resource_exhausted("429"); + assert!( + re.is_rate_limited() && re.can_retry(), + "ResourceExhausted must be rate-limited AND retryable" + ); + } + + #[test] + fn test_retry_delay_non_rate_limited_is_flat() { + // Genuine failures keep the pre-existing flat base delay, regardless of + // the attempt index — unchanged behavior. + let base = Duration::from_millis(RETRY_BASE_DELAY_MS); + for attempt in 0..8u32 { + assert_eq!(retry_delay(false, attempt), base); + } + } + + #[test] + fn test_rate_limit_backoff_window_grows_and_caps() { + // base × 2^attempt up to the cap. + assert_eq!(rate_limit_backoff_window(0), Duration::from_millis(10)); + assert_eq!(rate_limit_backoff_window(1), Duration::from_millis(20)); + assert_eq!(rate_limit_backoff_window(2), Duration::from_millis(40)); + assert_eq!(rate_limit_backoff_window(3), Duration::from_millis(80)); + assert_eq!(rate_limit_backoff_window(4), Duration::from_millis(160)); + assert_eq!(rate_limit_backoff_window(5), Duration::from_millis(320)); + // 10 × 2^6 = 640 → capped at 500. + assert_eq!( + rate_limit_backoff_window(6), + Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS) + ); + // Monotonic non-decreasing and never above the cap. + let mut prev = Duration::ZERO; + for attempt in 0..8u32 { + let w = rate_limit_backoff_window(attempt); + assert!(w >= prev, "window must be non-decreasing"); + assert!(w <= Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS)); + prev = w; + } + // A pathologically large attempt must not panic and stays capped. + assert_eq!( + rate_limit_backoff_window(u32::MAX), + Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS) + ); + } + + #[test] + fn test_retry_delay_rate_limited_is_jittered_within_window() { + // Use a wide window (attempt 5 → 320ms) so jitter has room to vary. + let attempt = 5u32; + let window = rate_limit_backoff_window(attempt); + + let samples: Vec = (0..128).map(|_| retry_delay(true, attempt)).collect(); + + // Full jitter: every sample is within [0, window). + for d in &samples { + assert!( + *d < window, + "jittered delay {d:?} must be strictly below the window {window:?}" + ); + } + // Jitter is actually applied: across many samples we see more than one + // distinct value (probability of all-equal is astronomically small). + let first = samples[0]; + assert!( + samples.iter().any(|d| *d != first), + "rate-limit delay must be randomized (jitter), got all equal" + ); + } + #[cfg(feature = "mocks")] #[test] fn test_can_retry_mock_error() { @@ -736,7 +936,6 @@ impl DapiRequestExecutor for DapiClient { let dump_request = request.clone(); let max_retries = applied_settings.retries; - let retry_delay = Duration::from_millis(10); let mut retries: usize = 0; // Track the last transport error for when all addresses get exhausted @@ -749,15 +948,30 @@ impl DapiRequestExecutor for DapiClient { let result: ExecutionResult = async { loop { - // Try to get an address to initialize transport on, rotating off - // the nodes already tried in this execution. If excluding the - // tried set empties the live pool (e.g. a single-node pool, or - // every live node already tried), fall back to the non-excluding - // selection so a healthy-but-throttled node can still be retried - // rather than surfacing a spurious NoAvailableAddresses. + // Select the next node, rotating off every node already tried in + // this execution so a retry goes to a *different* node. + // Rate-limited (ResourceExhausted) nodes stay in the pool (never + // banned), so without this exclusion a retry could re-pick the + // very node that just throttled us. + // + // Fallback order once every live node has been tried: + // 1. exclude only the node we *just* tried, so a small pool + // (e.g. two nodes) keeps alternating instead of re-hitting + // the just-throttled node at random; + // 2. if that is still empty (single-node pool, or the + // just-tried node is the only live one), reuse it — there is + // no alternative. The backoff below keeps this from + // hammering the throttled node. + let just_tried = tried.last().cloned(); let Some(address) = self .address_list .get_live_address_excluding(&tried) + .or_else(|| { + just_tried.as_ref().and_then(|last| { + self.address_list + .get_live_address_excluding(std::slice::from_ref(last)) + }) + }) .or_else(|| self.address_list.get_live_address()) else { // No available addresses - wrap with last meaningful error if we have one @@ -826,19 +1040,21 @@ impl DapiRequestExecutor for DapiClient { ); if can_retry_error && retries < max_retries { + let rate_limited = cloned_error.is_rate_limited(); // Store last transport error last_transport_error = Some(cloned_error); // Rotate off this node on the next attempt. tried.push(address.clone()); + let delay = retry_delay(rate_limited, retries as u32); retries += 1; tracing::warn!( error = ?execution_error, - "retrying error with sleeping {} secs", - retry_delay.as_secs_f32() + delay_ms = delay.as_millis() as u64, + "retrying error after backoff" ); - transport::sleep(retry_delay).await; + transport::sleep(delay).await; continue; } @@ -885,7 +1101,17 @@ impl DapiRequestExecutor for DapiClient { match execution_result { Ok(response) => return Ok(response), Err(error) => { + // Invariant lock (load-bearing): a rate-limited error MUST + // be retryable, otherwise the rotation below never fires + // and the rotate-don't-ban behavior silently dies. See the + // `CanRetry` docs and the `test_rate_limit_implies_retryable_invariant` + // property test. + debug_assert!( + !error.is_rate_limited() || error.can_retry(), + "is_rate_limited() must imply can_retry(); rate-limit rotation depends on it" + ); if error.can_retry() && retries < max_retries { + let rate_limited = error.is_rate_limited(); // Store last transport error if let DapiClientError::Transport(ref te) = error.inner { last_transport_error = Some(te.clone()); @@ -894,13 +1120,14 @@ impl DapiRequestExecutor for DapiClient { // Rotate off this node on the next attempt. tried.push(address.clone()); + let delay = retry_delay(rate_limited, retries as u32); retries += 1; tracing::warn!( ?error, - "retrying error with sleeping {} secs", - retry_delay.as_secs_f32() + delay_ms = delay.as_millis() as u64, + "retrying error after backoff" ); - transport::sleep(retry_delay).await; + transport::sleep(delay).await; continue; } diff --git a/packages/rs-sdk/src/error.rs b/packages/rs-sdk/src/error.rs index 64be94bc53e..17a2506fbfc 100644 --- a/packages/rs-sdk/src/error.rs +++ b/packages/rs-sdk/src/error.rs @@ -353,6 +353,23 @@ impl CanRetry for Error { | Error::DapiClientError(DapiClientError::NoAvailableAddressesToRetry(_)) ) } + + /// Delegate rate-limit classification to the wrapped [`DapiClientError`]. + /// + /// This keeps the rotate-don't-ban semantics introduced in `rs-dapi-client` + /// alive at the SDK layer independently of how [`Self::can_retry`] is + /// defined: should a future change make a wrapped transport error retryable, + /// `update_address_ban_status` will correctly rotate a throttled node rather + /// than ban it (today the `can_retry()` guard short-circuits first, so this + /// is pure future-proofing with no behavior change). + /// + /// Note this is the one `CanRetry` method where `is_rate_limited() ⇒ + /// can_retry()` need NOT hold at the SDK layer: the SDK's outer retry loop + /// is intentionally conservative (see `sync::retry`) and the actual + /// rate-limit rotation happens in the inner `rs-dapi-client` executor. + fn is_rate_limited(&self) -> bool { + matches!(self, Error::DapiClientError(inner) if inner.is_rate_limited()) + } } /// Server returned stale metadata @@ -566,4 +583,31 @@ mod tests { assert!(super::extract_drive_error_message(&payload).is_none()); } } + + /// Regression: the SDK `Error` must propagate the wrapped + /// `DapiClientError`'s rate-limit classification so a throttled node is not + /// banned at the SDK layer (rotate-don't-ban). Locks the delegation so a + /// future refactor cannot silently drop it. + #[test] + fn test_is_rate_limited_delegates_to_inner_dapi_client_error() { + // A ResourceExhausted transport error wrapped by the SDK is rate-limited. + let rate_limited: Error = DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::resource_exhausted("429"), + )) + .into(); + assert!( + rate_limited.is_rate_limited(), + "SDK Error must delegate rate-limit classification to the wrapped DapiClientError" + ); + + // A non-rate-limited transport error must not be classified as such. + let unavailable: Error = DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::unavailable("down"), + )) + .into(); + assert!(!unavailable.is_rate_limited()); + + // Non-DAPI errors fall back to the trait default (false). + assert!(!Error::Config("misconfigured".to_string()).is_rate_limited()); + } } From b8a8c329f222c7dd7eb3ee74b0fffcc0ca38ecbe Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Mon, 22 Jun 2026 17:01:22 +0200 Subject: [PATCH 3/6] fix(rs-dapi-client): clamp rate-limit backoff shift + symmetric invariant assert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two self-introduced nits from the verify pass. 1. `rate_limit_backoff_window`'s `checked_shl` guards only the shift AMOUNT, not value bit-loss: a large `attempt` (e.g. 63) shifts the set bits out of `10 << attempt`, collapsing the window to 0ms — no backoff at all. Pathological (needs retries >= 64; default is 5) but it's a footgun the backoff math created. Clamp the shift amount at 16 (already saturates the 500ms cap, so nothing changes in the valid range) and extend the unit test with the attempt=63/64 boundary asserting the window stays at the cap. 2. Add the symmetric `debug_assert!(!is_rate_limited() || can_retry())` to the transport-client-creation error block for parity with the transport-error block. Defense-in-depth; the property test already covers the invariant. Co-Authored-By: Claude Opus 4.6 --- packages/rs-dapi-client/src/dapi_client.rs | 30 +++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/packages/rs-dapi-client/src/dapi_client.rs b/packages/rs-dapi-client/src/dapi_client.rs index 818893f2e1a..c06879cb07e 100644 --- a/packages/rs-dapi-client/src/dapi_client.rs +++ b/packages/rs-dapi-client/src/dapi_client.rs @@ -35,10 +35,14 @@ const RATE_LIMIT_MAX_DELAY_MS: u64 = 500; /// jitter is applied on top by [`retry_delay`]. `attempt` is the 0-based retry /// index (0 for the first retry). fn rate_limit_backoff_window(attempt: u32) -> Duration { - // `base << attempt` == `base * 2^attempt`; `checked_shl` guards against a - // huge `attempt` (e.g. a caller configuring an enormous retry budget). + // `base << attempt` == `base * 2^attempt`. Clamp the shift AMOUNT first: + // `checked_shl` only guards against shifting by >= the bit width, not against + // the set bits being shifted out — e.g. `10 << 63` wraps to 0, silently + // collapsing the window to no backoff at all. The window already saturates + // at the cap by attempt 6, so clamping at 16 changes nothing in the valid + // range while removing that footgun. let window_ms = RETRY_BASE_DELAY_MS - .checked_shl(attempt) + .checked_shl(attempt.min(16)) .unwrap_or(u64::MAX) .min(RATE_LIMIT_MAX_DELAY_MS); Duration::from_millis(window_ms) @@ -517,6 +521,17 @@ mod tests { assert!(w <= Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS)); prev = w; } + // Regression: a large `attempt` must NOT collapse the window to 0 via + // bit-loss in the shift (`10 << 63` wraps to 0). The shift-amount clamp + // keeps the window pinned at the cap across the whole boundary. + assert_eq!( + rate_limit_backoff_window(63), + Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS) + ); + assert_eq!( + rate_limit_backoff_window(64), + Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS) + ); // A pathologically large attempt must not panic and stays capped. assert_eq!( rate_limit_backoff_window(u32::MAX), @@ -1027,6 +1042,15 @@ impl DapiRequestExecutor for DapiClient { // Clone error before moving it let cloned_error = transport_error.clone(); + // Invariant lock (mirror of the transport-error arm + // below): a rate-limited error MUST be retryable, else + // the rotation never fires. See `CanRetry` docs and + // `test_rate_limit_implies_retryable_invariant`. + debug_assert!( + !cloned_error.is_rate_limited() || cloned_error.can_retry(), + "is_rate_limited() must imply can_retry(); rate-limit rotation depends on it" + ); + let execution_error = ExecutionError { inner: DapiClientError::Transport(transport_error), retries, From 8a41c92d61d5dc3f85de1bf3f34d9ff3bd5a18d4 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 14:30:48 +0200 Subject: [PATCH 4/6] fix(rs-dapi-client): replace rotate-on-rate-limit with Envoy-driven ban-for-duration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the "rotate-don't-ban" approach with "ban for server-dictated window": - Add `rate_limit.rs`: decode `google.rpc.RetryInfo` from gRPC status details via minimal prost structs; `DAPI_RATE_LIMIT_BAN_MS` env-var fallback (default 60 s) when no RetryInfo is present; WASM-compatible guard on env-var read. - Add `AddressStatus::ban_for_duration()` + `AddressList::ban_for_duration()`: set `banned_until` without touching `ban_count` so the health-ban exponential ladder is never disturbed by rate-limit events. - Add `CanRetry::rate_limit_ban_duration()` (default `None`) and wire it through `tonic::Status`, `TransportError`, `ExecutionError`, `DapiClientError` and `dash_sdk::Error`. - Update `update_address_ban_status`: `ResourceExhausted` now calls `ban_for_duration(server_hint.unwrap_or(fallback))` instead of skipping. - Remove rotation machinery: `tried: Vec
`, 3-tier `get_live_address_excluding` selection, jitter backoff (`retry_delay`, `rate_limit_backoff_window`, `RATE_LIMIT_MAX_DELAY_MS`), and rand import. The retry loop now always calls `get_live_address()` — rate-limited nodes are naturally absent from the live pool until their ban window expires. - Fix `AddressBanInfo.banned`: now consistent with `get_live_address` filtering (checks only `banned_until`, not `ban_count > 0`), so rate-limit bans are visible in diagnostics. - Delete `tests/rate_limit_rotation.rs`; update/add invariant tests. All 419 unit/integration/doc tests pass. Zero clippy warnings. Co-Authored-By: Claude Sonnet 4.6 --- Cargo.lock | 1 + packages/rs-dapi-client/Cargo.toml | 1 + packages/rs-dapi-client/src/address_list.rs | 233 ++++++----- packages/rs-dapi-client/src/dapi_client.rs | 376 ++++++++---------- packages/rs-dapi-client/src/executor.rs | 4 + packages/rs-dapi-client/src/lib.rs | 22 +- packages/rs-dapi-client/src/rate_limit.rs | 234 +++++++++++ packages/rs-dapi-client/src/transport.rs | 6 + packages/rs-dapi-client/src/transport/grpc.rs | 12 +- .../tests/rate_limit_rotation.rs | 207 ---------- packages/rs-sdk/src/error.rs | 69 +++- 11 files changed, 600 insertions(+), 565 deletions(-) create mode 100644 packages/rs-dapi-client/src/rate_limit.rs delete mode 100644 packages/rs-dapi-client/tests/rate_limit_rotation.rs diff --git a/Cargo.lock b/Cargo.lock index e296c3aebdb..59c0ae19cda 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6276,6 +6276,7 @@ dependencies = [ "http", "http-serde", "lru", + "prost 0.14.4", "rand 0.8.6", "serde", "serde_json", diff --git a/packages/rs-dapi-client/Cargo.toml b/packages/rs-dapi-client/Cargo.toml index 705f5ccec2f..9694c6496e2 100644 --- a/packages/rs-dapi-client/Cargo.toml +++ b/packages/rs-dapi-client/Cargo.toml @@ -60,6 +60,7 @@ lru = { version = "0.16.3" } serde = { version = "1.0.219", optional = true, features = ["derive"] } serde_json = { version = "1.0.140", optional = true } chrono = { version = "0.4.38", features = ["serde"] } +prost = { version = "0.14" } [dev-dependencies] tokio = { version = "1.40", features = ["macros"] } diff --git a/packages/rs-dapi-client/src/address_list.rs b/packages/rs-dapi-client/src/address_list.rs index 74f880f08b1..e46a050cd90 100644 --- a/packages/rs-dapi-client/src/address_list.rs +++ b/packages/rs-dapi-client/src/address_list.rs @@ -103,6 +103,29 @@ impl AddressStatus { self.ban_reason = reason; } + /// Ban the address for a fixed `duration` **without** incrementing the + /// health-ban counter ([`AddressStatus::ban_count`]). + /// + /// Used for rate-limit (`ResourceExhausted`) bans where the server + /// dictates when the node may be retried again. Because the node is + /// healthy (not failing), the exponential health-ban ladder must not be + /// touched — a future genuine failure should still get the first-offense + /// 60 s window. + /// + /// If the address already has a longer ban in effect (e.g. from a + /// previous health ban), the later expiry is kept so a health ban cannot + /// be shortened by a subsequent rate-limit. + pub fn ban_for_duration(&mut self, duration: Duration, reason: Option) { + let new_until = chrono::Utc::now() + duration; + self.banned_until = Some(match self.banned_until { + Some(existing) if existing > new_until => existing, + _ => new_until, + }); + self.ban_reason = reason; + // ban_count intentionally NOT incremented: rate-limit is not a + // health signal and must not affect the exponential ban ladder. + } + /// Check if [Address] is banned. pub fn is_banned(&self) -> bool { self.ban_count > 0 @@ -182,6 +205,27 @@ impl AddressList { true } + /// Ban the address for a fixed `duration` without touching the health-ban + /// counter. See [`AddressStatus::ban_for_duration`] for the full contract. + /// + /// Returns `false` if the address is not in the list. + pub fn ban_for_duration( + &self, + address: &Address, + duration: Duration, + reason: Option, + ) -> bool { + let mut guard = self.addresses.write().unwrap(); + + let Some(status) = guard.get_mut(address) else { + return false; + }; + + status.ban_for_duration(duration, reason); + + true + } + /// Clears address' ban record /// Returns false if the address is not in the list. pub fn unban(&self, address: &Address) -> bool { @@ -237,38 +281,25 @@ impl AddressList { self.add(Address::try_from(uri).expect("valid uri")) } - /// Randomly select a not banned address. - pub fn get_live_address(&self) -> Option
{ - self.get_live_address_excluding(&[]) - } - - /// Randomly select a not-banned address that is not in `exclude`. - /// - /// Uses the same ban filtering as [`Self::get_live_address`] (skips - /// addresses whose ban period has not yet expired) but additionally skips - /// any address present in `exclude`. The retry loop uses this to rotate to - /// a *different* node after a failure — in particular after a rate-limit, - /// where the just-tried node is healthy and stays in the pool, so plain - /// ban-filtering would happily pick it again. + /// Randomly select a not-banned address. /// - /// Passing an empty slice is equivalent to [`Self::get_live_address`]. - pub fn get_live_address_excluding(&self, exclude: &[Address]) -> Option
{ - // TODO(low): module-wide `.read()/.write().unwrap()` panics on a poisoned - // lock; adopt poison-tolerant locking consistently (known LOW, SEC-003). + /// An address is considered live when it has never been banned or when its + /// ban period has already expired. + pub fn get_live_address(&self) -> Option
{ + // TODO(low): module-wide `.read()/.write().unwrap()` panics on a + // poisoned lock; adopt poison-tolerant locking consistently (SEC-003). let guard = self.addresses.read().unwrap(); let mut rng = SmallRng::from_entropy(); - let now = chrono::Utc::now(); guard .iter() - .filter(|(addr, status)| { + .filter(|(_, status)| { status .banned_until .map(|banned_until| banned_until < now) .unwrap_or(true) - && !exclude.contains(addr) }) .choose(&mut rng) .map(|(addr, _)| addr.clone()) @@ -326,11 +357,16 @@ impl AddressList { guard .iter() .map(|(addr, status)| { - let banned = status.ban_count > 0 - && status - .banned_until - .map(|banned_until| banned_until >= now) - .unwrap_or(false); + // An address is "effectively banned" when its ban window is + // still in the future. This is intentionally consistent with + // the `banned_until`-only filter in `get_live_address`: a + // rate-limit ban (`ban_count == 0`, `banned_until` in the + // future) is still a ban from the caller's perspective even + // though the health-ban counter is not incremented. + let banned = status + .banned_until + .map(|banned_until| banned_until >= now) + .unwrap_or(false); AddressBanInfo { uri: addr.to_string(), banned, @@ -774,113 +810,72 @@ mod tests { } #[test] - fn test_get_live_address_excluding_skips_excluded() { - let mut list = AddressList::new(); - let addr1: Address = "http://127.0.0.1:3000".parse().unwrap(); - let addr2: Address = "http://127.0.0.1:3001".parse().unwrap(); - let addr3: Address = "http://127.0.0.1:3002".parse().unwrap(); - list.add(addr1.clone()); - list.add(addr2.clone()); - list.add(addr3.clone()); + fn test_address_status_ban_for_duration_sets_banned_until_no_ban_count() { + let mut status = AddressStatus::default(); + assert_eq!(status.ban_count, 0); + assert!(status.banned_until.is_none()); - // Excluding two leaves exactly the third, deterministically. - let exclude = [addr1, addr2]; - for _ in 0..20 { - let selected = list - .get_live_address_excluding(&exclude) - .expect("one address left"); - assert_eq!(selected, addr3); - } - } + let before = chrono::Utc::now(); + status.ban_for_duration(Duration::from_secs(30), Some("rate limited".into())); + let after = chrono::Utc::now(); - #[test] - fn test_get_live_address_excluding_empty_slice_equivalent_to_get_live_address() { - let mut list = AddressList::new(); - list.add("http://127.0.0.1:3000".parse().unwrap()); - assert!(list.get_live_address_excluding(&[]).is_some()); - assert!(list.get_live_address().is_some()); + // ban_count must stay at 0 — rate-limit is not a health signal. + assert_eq!( + status.ban_count, 0, + "ban_for_duration must not touch ban_count" + ); + + // banned_until should be roughly now + 30 s. + let until = status.banned_until.expect("banned_until must be set"); + let lower = (until - before).num_milliseconds() as f64 / 1000.0; + let upper = (until - after).num_milliseconds() as f64 / 1000.0; + assert!( + lower >= 29.9, + "banned_until lower bound too short: {lower}s" + ); + assert!(upper <= 30.1, "banned_until upper bound too long: {upper}s"); + assert_eq!(status.ban_reason.as_deref(), Some("rate limited")); } #[test] - fn test_get_live_address_excluding_all_excluded_returns_none() { - let mut list = AddressList::new(); - let addr1: Address = "http://127.0.0.1:3000".parse().unwrap(); - let addr2: Address = "http://127.0.0.1:3001".parse().unwrap(); - list.add(addr1.clone()); - list.add(addr2.clone()); - - let exclude = [addr1, addr2]; - assert!(list.get_live_address_excluding(&exclude).is_none()); + fn test_address_status_ban_for_duration_does_not_shorten_longer_ban() { + let mut status = AddressStatus::default(); + // First apply a long health ban via ban_for_duration (100 s). + status.ban_for_duration(Duration::from_secs(100), Some("node down".into())); + let long_until = status.banned_until.unwrap(); + + // Now apply a short rate-limit ban (10 s) — must NOT shorten the ban. + status.ban_for_duration(Duration::from_secs(10), Some("rate limited".into())); + assert_eq!( + status.banned_until.unwrap(), + long_until, + "ban_for_duration must not shorten a longer existing ban" + ); + // But the reason is updated. + assert_eq!(status.ban_reason.as_deref(), Some("rate limited")); } #[test] - fn test_get_live_address_excluding_still_skips_banned() { - let mut list = AddressList::new(); - let addr1: Address = "http://127.0.0.1:3000".parse().unwrap(); - let addr2: Address = "http://127.0.0.1:3001".parse().unwrap(); - list.add(addr1.clone()); - list.add(addr2.clone()); - list.ban(&addr1); // addr1 banned → filtered out - - // addr1 is banned and addr2 is excluded → nothing live remains. - assert!(list - .get_live_address_excluding(std::slice::from_ref(&addr2)) - .is_none()); - // Without exclusion, only the unbanned addr2 is live. - assert_eq!(list.get_live_address_excluding(&[]), Some(addr2)); + fn test_address_list_ban_for_duration_returns_false_for_unknown() { + let list = AddressList::new(); + let addr: Address = "http://127.0.0.1:3000".parse().unwrap(); + assert!(!list.ban_for_duration(&addr, Duration::from_secs(5), None)); } - /// Mirrors the retry loop's three-tier selection algorithm: - /// 1. `get_live_address_excluding(&tried)` — rotate off every tried node; - /// 2. else `get_live_address_excluding(&[just_tried])` — at least rotate - /// off the node we *just* failed on (small-pool alternation); - /// 3. else `get_live_address()` — reuse (single-node pool). - /// - /// With N healthy (unbanned) nodes, N consecutive selections must visit N - /// distinct nodes — i.e. each retry rotates off the just-tried node — and - /// once exclusion empties the pool, the graceful fallback still yields a - /// node rather than erroring. #[test] - fn test_retry_rotation_visits_distinct_nodes_then_falls_back() { + fn test_address_list_ban_for_duration_bans_known_address() { let mut list = AddressList::new(); - let addrs: Vec
= (3000..3003) - .map(|p| format!("http://127.0.0.1:{p}").parse().unwrap()) - .collect(); - for a in &addrs { - list.add(a.clone()); - } - - let select = |list: &AddressList, tried: &[Address]| { - list.get_live_address_excluding(tried) - .or_else(|| { - tried.last().and_then(|last| { - list.get_live_address_excluding(std::slice::from_ref(last)) - }) - }) - .or_else(|| list.get_live_address()) - }; + let addr: Address = "http://127.0.0.1:3000".parse().unwrap(); + list.add(addr.clone()); - let mut tried: Vec
= Vec::new(); - for _ in 0..addrs.len() { - let selected = select(&list, &tried).expect("address available"); - assert!( - !tried.contains(&selected), - "retry must rotate off an already-tried node" - ); - tried.push(selected); - } - assert_eq!(tried.len(), addrs.len(), "every node visited exactly once"); - - // Pool exhausted by exclusion → graceful fallback keeps it non-empty so - // a healthy-but-throttled node can still be retried; the just-tried node - // is never the one re-selected while another node is available. - let fallback = select(&list, &tried); - assert!(fallback.is_some(), "fallback must keep the pool non-empty"); - assert_ne!( - fallback.as_ref(), - tried.last(), - "fallback must rotate off the just-tried node when another exists" - ); + assert!(list.ban_for_duration(&addr, Duration::from_secs(60), Some("rl".into()))); + // The address must now be hidden from get_live_address. + assert!(list.get_live_address().is_none()); + // ban_count is still 0 (rate-limit ban, not health ban). + let info = list.ban_info(); + assert_eq!(info.len(), 1); + assert!(info[0].banned); + assert_eq!(info[0].ban_count, 0); } /// Invariant 1 at the ladder source: the exponential ban window is diff --git a/packages/rs-dapi-client/src/dapi_client.rs b/packages/rs-dapi-client/src/dapi_client.rs index c06879cb07e..3ac30b0cb75 100644 --- a/packages/rs-dapi-client/src/dapi_client.rs +++ b/packages/rs-dapi-client/src/dapi_client.rs @@ -4,13 +4,13 @@ use dapi_grpc::mock::Mockable; use dapi_grpc::tonic::async_trait; #[cfg(not(target_arch = "wasm32"))] use dapi_grpc::tonic::transport::Certificate; -use rand::{rngs::SmallRng, Rng, SeedableRng}; use std::fmt::{Debug, Display}; use std::time::Duration; use tracing::Instrument; use crate::address_list::AddressListError; use crate::connection_pool::ConnectionPool; +use crate::rate_limit::{fallback_rate_limit_ban_duration, RATE_LIMIT_BAN_ENV_VAR}; use crate::request_settings::AppliedRequestSettings; use crate::transport::{self, TransportError}; use crate::{ @@ -19,62 +19,13 @@ use crate::{ RequestSettings, }; -/// Base delay between retries. Genuine (non-rate-limited) failures keep this -/// flat delay — unchanged from the pre-existing behavior. -const RETRY_BASE_DELAY_MS: u64 = 10; - -/// Upper bound for the rate-limit backoff window. Caps the capped-exponential -/// growth so a throttled fleet gets breathing room without stalling the call -/// for too long. -const RATE_LIMIT_MAX_DELAY_MS: u64 = 500; - -/// Capped-exponential backoff window for a rate-limited retry: `base × 2^attempt`, -/// saturating at [`RATE_LIMIT_MAX_DELAY_MS`]. -/// -/// Pure (no jitter) so the growth/cap can be reasoned about and tested directly; -/// jitter is applied on top by [`retry_delay`]. `attempt` is the 0-based retry -/// index (0 for the first retry). -fn rate_limit_backoff_window(attempt: u32) -> Duration { - // `base << attempt` == `base * 2^attempt`. Clamp the shift AMOUNT first: - // `checked_shl` only guards against shifting by >= the bit width, not against - // the set bits being shifted out — e.g. `10 << 63` wraps to 0, silently - // collapsing the window to no backoff at all. The window already saturates - // at the cap by attempt 6, so clamping at 16 changes nothing in the valid - // range while removing that footgun. - let window_ms = RETRY_BASE_DELAY_MS - .checked_shl(attempt.min(16)) - .unwrap_or(u64::MAX) - .min(RATE_LIMIT_MAX_DELAY_MS); - Duration::from_millis(window_ms) -} - -/// Delay to wait before the next retry attempt. -/// -/// * Genuine failures keep the pre-existing flat [`RETRY_BASE_DELAY_MS`] delay. -/// * Rate-limit (`ResourceExhausted`) retries use **capped exponential backoff -/// with full jitter**: the window grows as `base × 2^attempt`, saturating at -/// [`RATE_LIMIT_MAX_DELAY_MS`], and the actual sleep is a uniform-random -/// fraction of that window. Jitter de-correlates clients so a congested fleet -/// is not retried in lockstep (a synchronized retry storm); the growth + cap -/// bound the per-call amplification. +/// Flat delay between retries for all non-rate-limited failures. /// -/// `attempt` is the 0-based retry index (0 for the first retry). -// -// TODO(rate-limit): when the server attaches `google.rpc.RetryInfo` to the -// `ResourceExhausted` status, honor its `retry_delay` as the floor for this -// sleep. Extracting it needs decoding `grpc-status-details-bin`, so it is left -// as a follow-up rather than plumbed here. -fn retry_delay(rate_limited: bool, attempt: u32) -> Duration { - if !rate_limited { - return Duration::from_millis(RETRY_BASE_DELAY_MS); - } - - let window = rate_limit_backoff_window(attempt); - // Full jitter: sleep a uniform-random fraction of the window, i.e. a value - // in `[0, window)`. - let jitter: f64 = SmallRng::from_entropy().gen(); - window.mul_f64(jitter) -} +/// Rate-limited (`ResourceExhausted`) failures ban the node for the +/// server-dictated window (see [`crate::rate_limit`] and +/// `DAPI_RATE_LIMIT_BAN_MS`); the retry then picks the next live node +/// immediately without an additional sleep. +const RETRY_DELAY_MS: u64 = 10; /// General DAPI request error type. #[derive(Debug, thiserror::Error, Clone)] @@ -137,6 +88,13 @@ impl CanRetry for DapiClientError { Mock(_) => false, } } + + fn rate_limit_ban_duration(&self) -> Option { + match self { + DapiClientError::Transport(te) => te.rate_limit_ban_duration(), + _ => None, + } + } } /// Serialization of [DapiClientError]. @@ -256,17 +214,47 @@ pub fn update_address_ban_status( Err(error) => { if error.can_retry() { if error.is_rate_limited() { - // ResourceExhausted is congestion/backpressure, not endpoint - // ill-health. Banning a healthy-but-throttled node shifts its - // load onto the survivors and cascades into - // NoAvailableAddressesToRetry. Keep it in the pool; the retry - // loop rotates to a different node via explicit exclusion. The - // bounded retry count bounds an over-limit client. - tracing::debug!( - ?error, - address = ?error.address, - "not banning rate-limited (ResourceExhausted) address; will rotate to another node" - ); + // ResourceExhausted is a server-side rate-limit / congestion + // signal, not a node health failure. Ban the node for the + // duration Envoy embeds in `google.rpc.RetryInfo`; fall back + // to the `DAPI_RATE_LIMIT_BAN_MS` env var (default 60 s) when + // no such hint is present. Because the node is healthy, the + // health-ban counter (ban_count) is NOT incremented — the + // exponential health-ban ladder stays intact. + if let Some(address) = error.address.as_ref() { + if applied_settings.ban_failed_address { + let ban_dur = error + .rate_limit_ban_duration() + .unwrap_or_else(fallback_rate_limit_ban_duration); + if address_list.ban_for_duration( + address, + ban_dur, + Some(error.to_string()), + ) { + tracing::debug!( + ?address, + ban_ms = ban_dur.as_millis() as u64, + "rate-limited (ResourceExhausted): banning \ + {address} for {}ms (env {RATE_LIMIT_BAN_ENV_VAR})", + ban_dur.as_millis() + ); + } else { + tracing::debug!( + ?address, + "unable to apply rate-limit ban to {address}: \ + not in address list (already removed?)" + ); + } + } else { + tracing::debug!( + ?error, + ?address, + "rate-limited address {address}: ban disabled by settings" + ); + } + } else { + tracing::debug!(?error, "rate-limited error has no address; skipping ban"); + } } else if let Some(address) = error.address.as_ref() { if applied_settings.ban_failed_address { if address_list.ban_with_reason(address, Some(error.to_string())) { @@ -489,80 +477,6 @@ mod tests { ); } - #[test] - fn test_retry_delay_non_rate_limited_is_flat() { - // Genuine failures keep the pre-existing flat base delay, regardless of - // the attempt index — unchanged behavior. - let base = Duration::from_millis(RETRY_BASE_DELAY_MS); - for attempt in 0..8u32 { - assert_eq!(retry_delay(false, attempt), base); - } - } - - #[test] - fn test_rate_limit_backoff_window_grows_and_caps() { - // base × 2^attempt up to the cap. - assert_eq!(rate_limit_backoff_window(0), Duration::from_millis(10)); - assert_eq!(rate_limit_backoff_window(1), Duration::from_millis(20)); - assert_eq!(rate_limit_backoff_window(2), Duration::from_millis(40)); - assert_eq!(rate_limit_backoff_window(3), Duration::from_millis(80)); - assert_eq!(rate_limit_backoff_window(4), Duration::from_millis(160)); - assert_eq!(rate_limit_backoff_window(5), Duration::from_millis(320)); - // 10 × 2^6 = 640 → capped at 500. - assert_eq!( - rate_limit_backoff_window(6), - Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS) - ); - // Monotonic non-decreasing and never above the cap. - let mut prev = Duration::ZERO; - for attempt in 0..8u32 { - let w = rate_limit_backoff_window(attempt); - assert!(w >= prev, "window must be non-decreasing"); - assert!(w <= Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS)); - prev = w; - } - // Regression: a large `attempt` must NOT collapse the window to 0 via - // bit-loss in the shift (`10 << 63` wraps to 0). The shift-amount clamp - // keeps the window pinned at the cap across the whole boundary. - assert_eq!( - rate_limit_backoff_window(63), - Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS) - ); - assert_eq!( - rate_limit_backoff_window(64), - Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS) - ); - // A pathologically large attempt must not panic and stays capped. - assert_eq!( - rate_limit_backoff_window(u32::MAX), - Duration::from_millis(RATE_LIMIT_MAX_DELAY_MS) - ); - } - - #[test] - fn test_retry_delay_rate_limited_is_jittered_within_window() { - // Use a wide window (attempt 5 → 320ms) so jitter has room to vary. - let attempt = 5u32; - let window = rate_limit_backoff_window(attempt); - - let samples: Vec = (0..128).map(|_| retry_delay(true, attempt)).collect(); - - // Full jitter: every sample is within [0, window). - for d in &samples { - assert!( - *d < window, - "jittered delay {d:?} must be strictly below the window {window:?}" - ); - } - // Jitter is actually applied: across many samples we see more than one - // distinct value (probability of all-equal is astronomically small). - let first = samples[0]; - assert!( - samples.iter().any(|d| *d != first), - "rate-limit delay must be randomized (jitter), got all equal" - ); - } - #[cfg(feature = "mocks")] #[test] fn test_can_retry_mock_error() { @@ -713,13 +627,16 @@ mod tests { } } - /// Invariant 2: congestion can never empty the live pool. + /// Invariant 2: rate-limited bans use a fixed-duration window, not the + /// exponential health-ban ladder. /// - /// Repeated `ResourceExhausted` across *every* address must never ban any of - /// them — `banned_until` stays `None`, `ban_count` stays 0 — and - /// `get_live_address` keeps returning `Some`. + /// After a `ResourceExhausted` error: + /// * the address IS banned (`banned_until` is `Some` and in the future), so + /// subsequent retries naturally pick a *different* node; + /// * `ban_count` stays at 0 — the health-ban ladder is untouched; + /// * the pool is NOT permanently empty: the ban expires after the window. #[test] - fn test_invariant_rate_limit_never_bans_and_pool_stays_live() { + fn test_invariant_rate_limited_bans_with_duration_not_health_ladder() { let mut address_list = AddressList::new(); let addresses: Vec = (3000..3003) .map(|p| { @@ -732,31 +649,92 @@ mod tests { address_list.add(addr.clone()); } - // Hammer every address with ResourceExhausted across many rounds. - for _ in 0..10 { - for addr in &addresses { - let result: ExecutionResult = Err(ExecutionError { - inner: DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::resource_exhausted("429"), - )), - retries: 0, - address: Some(addr.clone()), - }); - update_address_ban_status(&address_list, &result, &make_applied_settings(true)); - } + // Apply a ResourceExhausted to each address once (no RetryInfo → fallback duration). + for addr in &addresses { + let result: ExecutionResult = Err(ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::resource_exhausted("429"), + )), + retries: 0, + address: Some(addr.clone()), + }); + update_address_ban_status(&address_list, &result, &make_applied_settings(true)); } for info in address_list.ban_info() { - assert!(!info.banned, "address {} must not be banned", info.uri); - assert_eq!(info.ban_count, 0, "ban_count must stay 0 for {}", info.uri); + // Rate-limited: DOES ban (window from server or fallback). assert!( - info.banned_until.is_none(), - "banned_until must stay None for {}", + info.banned, + "rate-limited address {} must be temporarily banned", + info.uri + ); + // Health-ban ladder: untouched. + assert_eq!( + info.ban_count, 0, + "ban_count must stay 0 after rate-limit ban for {}", + info.uri + ); + assert!( + info.banned_until.is_some(), + "banned_until must be set after rate-limit ban for {}", info.uri ); } - // The pool is never emptied by congestion. - assert!(address_list.get_live_address().is_some()); + } + + /// Invariant 2b: a rate-limit ban must NOT increment ban_count even when + /// a genuine health ban was already applied to the same address. + #[test] + fn test_invariant_rate_limit_ban_does_not_increment_ban_count() { + let mut address_list = AddressList::new(); + let addr = mock_address(); + address_list.add(addr.clone()); + + // First: genuine health failure → ban_count = 1. + let genuine: ExecutionResult = Err(ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::unavailable("node down"), + )), + retries: 0, + address: Some(addr.clone()), + }); + update_address_ban_status(&address_list, &genuine, &make_applied_settings(true)); + let count_after_health_ban = address_list + .ban_info() + .into_iter() + .find(|i| i.uri.contains("3000")) + .map(|i| i.ban_count) + .unwrap_or(0); + assert_eq!( + count_after_health_ban, 1, + "health ban must set ban_count = 1" + ); + + // Unban to simulate ban expiry. + address_list.unban(&addr); + + // Then: rate-limit → ban_count must remain at 0 (fresh after unban). + // But even starting from ban_count=0 post-unban, a rate-limit must not + // push it to 1. + let rate_limited: ExecutionResult = Err(ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::resource_exhausted("429"), + )), + retries: 0, + address: Some(addr.clone()), + }); + update_address_ban_status(&address_list, &rate_limited, &make_applied_settings(true)); + + let info = address_list.ban_info(); + let entry = info + .iter() + .find(|i| i.uri == addr.to_string()) + .expect("address in ban_info"); + assert_eq!( + entry.ban_count, 0, + "rate-limit ban must NOT increment ban_count" + ); + assert!(entry.banned, "address must still be temporarily banned"); } #[test] @@ -953,42 +931,15 @@ impl DapiRequestExecutor for DapiClient { let max_retries = applied_settings.retries; let mut retries: usize = 0; - // Track the last transport error for when all addresses get exhausted + // Track the last transport error for when all addresses get exhausted. let mut last_transport_error: Option = None; - // Addresses already tried in this execution. We exclude them when - // selecting the next node so a retry rotates to a *different* node — - // crucial for rate-limited (ResourceExhausted) nodes, which stay in - // the pool (not banned) and would otherwise be picked again. - let mut tried: Vec = Vec::new(); let result: ExecutionResult = async { loop { - // Select the next node, rotating off every node already tried in - // this execution so a retry goes to a *different* node. - // Rate-limited (ResourceExhausted) nodes stay in the pool (never - // banned), so without this exclusion a retry could re-pick the - // very node that just throttled us. - // - // Fallback order once every live node has been tried: - // 1. exclude only the node we *just* tried, so a small pool - // (e.g. two nodes) keeps alternating instead of re-hitting - // the just-throttled node at random; - // 2. if that is still empty (single-node pool, or the - // just-tried node is the only live one), reuse it — there is - // no alternative. The backoff below keeps this from - // hammering the throttled node. - let just_tried = tried.last().cloned(); - let Some(address) = self - .address_list - .get_live_address_excluding(&tried) - .or_else(|| { - just_tried.as_ref().and_then(|last| { - self.address_list - .get_live_address_excluding(std::slice::from_ref(last)) - }) - }) - .or_else(|| self.address_list.get_live_address()) - else { + // Pick a live (not-banned) address. Rate-limited nodes are + // banned for a server-dictated window by update_address_ban_status, + // so they are naturally excluded until the window expires. + let Some(address) = self.address_list.get_live_address() else { // No available addresses - wrap with last meaningful error if we have one let error = if let Some(transport_error) = last_transport_error.take() { tracing::debug!( @@ -1064,21 +1015,16 @@ impl DapiRequestExecutor for DapiClient { ); if can_retry_error && retries < max_retries { - let rate_limited = cloned_error.is_rate_limited(); // Store last transport error last_transport_error = Some(cloned_error); - // Rotate off this node on the next attempt. - tried.push(address.clone()); - - let delay = retry_delay(rate_limited, retries as u32); retries += 1; tracing::warn!( error = ?execution_error, - delay_ms = delay.as_millis() as u64, - "retrying error after backoff" + delay_ms = RETRY_DELAY_MS, + "retrying error" ); - transport::sleep(delay).await; + transport::sleep(Duration::from_millis(RETRY_DELAY_MS)).await; continue; } @@ -1125,33 +1071,27 @@ impl DapiRequestExecutor for DapiClient { match execution_result { Ok(response) => return Ok(response), Err(error) => { - // Invariant lock (load-bearing): a rate-limited error MUST - // be retryable, otherwise the rotation below never fires - // and the rotate-don't-ban behavior silently dies. See the - // `CanRetry` docs and the `test_rate_limit_implies_retryable_invariant` - // property test. + // Invariant: a rate-limited error MUST be retryable so + // update_address_ban_status applies the ban-for-duration + // path. See CanRetry docs and + // test_rate_limit_implies_retryable_invariant. debug_assert!( !error.is_rate_limited() || error.can_retry(), - "is_rate_limited() must imply can_retry(); rate-limit rotation depends on it" + "is_rate_limited() must imply can_retry(); ban-for-duration depends on it" ); if error.can_retry() && retries < max_retries { - let rate_limited = error.is_rate_limited(); // Store last transport error if let DapiClientError::Transport(ref te) = error.inner { last_transport_error = Some(te.clone()); } - // Rotate off this node on the next attempt. - tried.push(address.clone()); - - let delay = retry_delay(rate_limited, retries as u32); retries += 1; tracing::warn!( ?error, - delay_ms = delay.as_millis() as u64, - "retrying error after backoff" + delay_ms = RETRY_DELAY_MS, + "retrying error" ); - transport::sleep(delay).await; + transport::sleep(Duration::from_millis(RETRY_DELAY_MS)).await; continue; } diff --git a/packages/rs-dapi-client/src/executor.rs b/packages/rs-dapi-client/src/executor.rs index 0bd816bcae5..b68c99961e7 100644 --- a/packages/rs-dapi-client/src/executor.rs +++ b/packages/rs-dapi-client/src/executor.rs @@ -87,6 +87,10 @@ impl CanRetry for ExecutionError { fn is_rate_limited(&self) -> bool { self.inner.is_rate_limited() } + + fn rate_limit_ban_duration(&self) -> Option { + self.inner.rate_limit_ban_duration() + } } /// Request execution response. diff --git a/packages/rs-dapi-client/src/lib.rs b/packages/rs-dapi-client/src/lib.rs index 0d99b6b2b06..3eb02ab47fa 100644 --- a/packages/rs-dapi-client/src/lib.rs +++ b/packages/rs-dapi-client/src/lib.rs @@ -11,6 +11,7 @@ pub mod dump; mod executor; #[cfg(feature = "mocks")] pub mod mock; +mod rate_limit; mod request_settings; pub mod transport; @@ -99,15 +100,26 @@ pub trait CanRetry { /// (gRPC `ResourceExhausted`). /// /// Rate-limit errors are retryable (see [`CanRetry::can_retry`]) but, - /// unlike genuine node ill-health, the node must NOT be banned: it is - /// healthy, just throttled. Banning it would shift its load onto the - /// remaining nodes and cascade into `NoAvailableAddressesToRetry`. - /// Instead the retry should rotate to a *different* node and leave the - /// throttled one in the pool. See `update_address_ban_status`. + /// unlike genuine node ill-health, the node is healthy — just throttled. + /// `update_address_ban_status` applies a short, server-dictated ban via + /// [`CanRetry::rate_limit_ban_duration`] instead of the exponential + /// health-ban ladder, so the node re-joins the live pool as soon as the + /// rate-limit window expires. fn is_rate_limited(&self) -> bool { false } + /// Returns the server-suggested ban duration for a rate-limited error. + /// + /// When a `ResourceExhausted` status carries a `google.rpc.RetryInfo` + /// detail, this returns the embedded `retry_delay`. Returns `None` for + /// all non-rate-limited errors and for rate-limited errors that carry no + /// server hint (the caller should use a configurable fallback in that + /// case — see `DAPI_RATE_LIMIT_BAN_MS`). + fn rate_limit_ban_duration(&self) -> Option { + None + } + /// Get boolean flag that indicates if the error is retryable. /// /// Deprecated in favor of [CanRetry::can_retry]. diff --git a/packages/rs-dapi-client/src/rate_limit.rs b/packages/rs-dapi-client/src/rate_limit.rs new file mode 100644 index 00000000000..d17413a596a --- /dev/null +++ b/packages/rs-dapi-client/src/rate_limit.rs @@ -0,0 +1,234 @@ +//! Rate-limit ban-duration extraction from Envoy gRPC status details. +//! +//! When Envoy returns `ResourceExhausted` it optionally attaches a +//! `google.rpc.RetryInfo` message inside the `grpc-status-details-bin` binary +//! trailer, telling the client exactly when it may retry. This module extracts +//! that hint so the address ban can expire at the right moment. +//! +//! If the trailer is absent or carries no `RetryInfo`, a configurable fallback +//! duration is used instead. Set `DAPI_RATE_LIMIT_BAN_MS` in the process +//! environment (e.g. via dashmate node configuration) to tune the fallback +//! without code changes. + +use prost::Message; +use std::time::Duration; + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +/// Environment variable name for the fallback rate-limit ban duration +/// (milliseconds). +/// +/// When a `ResourceExhausted` status does not carry `google.rpc.RetryInfo`, +/// the address is banned for this many milliseconds. Absent or non-numeric +/// values fall back to [`DEFAULT_RATE_LIMIT_BAN_MS`] (60 000 ms = 60 s). +pub(crate) const RATE_LIMIT_BAN_ENV_VAR: &str = "DAPI_RATE_LIMIT_BAN_MS"; + +const DEFAULT_RATE_LIMIT_BAN_MS: u64 = 60_000; // 60 s + +/// Try to extract `google.rpc.RetryInfo.retry_delay` from the +/// `grpc-status-details-bin` binary trailer of a gRPC status. +/// +/// Returns `None` when +/// * the trailer is absent or empty, +/// * the bytes are not a valid `google.rpc.Status`, +/// * no `RetryInfo` detail is present, or +/// * the embedded delay is zero. +/// +/// The caller should fall back to [`fallback_rate_limit_ban_duration`] in +/// those cases. +pub(crate) fn rate_limit_ban_duration(status: &dapi_grpc::tonic::Status) -> Option { + let bytes = status.details(); + if bytes.is_empty() { + return None; + } + + let grpc_status = GrpcStatus::decode(bytes).ok()?; + + for any in &grpc_status.details { + // Match by type URL substring; the full canonical URL is + // "type.googleapis.com/google.rpc.RetryInfo". + if !any.type_url.contains("RetryInfo") { + continue; + } + let retry_info = RetryInfo::decode(any.value.as_slice()).ok()?; + if let Some(d) = retry_info.retry_delay { + let secs = d.seconds.max(0) as u64; + let nanos = d.nanos.max(0) as u32; + let duration = Duration::new(secs, nanos); + if !duration.is_zero() { + return Some(duration); + } + } + } + None +} + +/// Return the fallback rate-limit ban duration. +/// +/// Reads [`RATE_LIMIT_BAN_ENV_VAR`] (milliseconds). Falls back to +/// `DEFAULT_RATE_LIMIT_BAN_MS` (60 s) when the variable is absent or +/// contains a non-numeric value. +/// +/// On WASM targets environment variables are unavailable; the constant default +/// is always returned. +pub(crate) fn fallback_rate_limit_ban_duration() -> Duration { + #[cfg(not(target_arch = "wasm32"))] + if let Ok(v) = std::env::var(RATE_LIMIT_BAN_ENV_VAR) { + if let Ok(ms) = v.parse::() { + return Duration::from_millis(ms); + } + } + Duration::from_millis(DEFAULT_RATE_LIMIT_BAN_MS) +} + +// --------------------------------------------------------------------------- +// Minimal proto types +// +// These mirror only the fields we need from google.rpc.Status, +// google.protobuf.Any, google.rpc.RetryInfo, and google.protobuf.Duration. +// Proto3 field tags match the canonical google.rpc proto definitions. +// --------------------------------------------------------------------------- + +#[derive(Clone, prost::Message)] +struct GrpcStatus { + #[prost(int32, tag = "1")] + code: i32, + #[prost(string, tag = "2")] + message: String, + /// repeated google.protobuf.Any details = 3; + #[prost(message, repeated, tag = "3")] + details: Vec, +} + +#[derive(Clone, prost::Message)] +struct ProtoAny { + /// string type_url = 1; + #[prost(string, tag = "1")] + type_url: String, + /// bytes value = 2; + #[prost(bytes = "vec", tag = "2")] + value: Vec, +} + +#[derive(Clone, prost::Message)] +struct RetryInfo { + /// google.protobuf.Duration retry_delay = 1; + #[prost(message, optional, tag = "1")] + retry_delay: Option, +} + +#[derive(Clone, prost::Message)] +struct ProtoDuration { + /// int64 seconds = 1; + #[prost(int64, tag = "1")] + seconds: i64, + /// int32 nanos = 2; + #[prost(int32, tag = "2")] + nanos: i32, +} + +// --------------------------------------------------------------------------- +// Test helpers shared with other test modules +// --------------------------------------------------------------------------- + +/// Build a `ResourceExhausted` tonic status carrying a `google.rpc.RetryInfo` +/// with the given delay. Available in test builds only. +#[cfg(test)] +pub(crate) fn make_retry_info_status(seconds: i64, nanos: i32) -> dapi_grpc::tonic::Status { + let retry_info = RetryInfo { + retry_delay: Some(ProtoDuration { seconds, nanos }), + }; + let mut ri_bytes = Vec::new(); + retry_info.encode(&mut ri_bytes).expect("encode RetryInfo"); + + let grpc_status = GrpcStatus { + code: 8, // RESOURCE_EXHAUSTED + message: String::new(), + details: vec![ProtoAny { + type_url: "type.googleapis.com/google.rpc.RetryInfo".to_string(), + value: ri_bytes, + }], + }; + let mut gs_bytes = Vec::new(); + grpc_status + .encode(&mut gs_bytes) + .expect("encode GrpcStatus"); + + dapi_grpc::tonic::Status::with_details_and_metadata( + dapi_grpc::tonic::Code::ResourceExhausted, + "rate limited", + gs_bytes.into(), + Default::default(), + ) +} + +// --------------------------------------------------------------------------- +// Unit tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_rate_limit_ban_duration_extracts_retry_info() { + let status = make_retry_info_status(5, 0); + assert_eq!( + rate_limit_ban_duration(&status), + Some(Duration::from_secs(5)) + ); + } + + #[test] + fn test_rate_limit_ban_duration_fractional_seconds() { + // 2 seconds + 500 000 000 ns = 2.5 s + let status = make_retry_info_status(2, 500_000_000); + assert_eq!( + rate_limit_ban_duration(&status), + Some(Duration::new(2, 500_000_000)) + ); + } + + #[test] + fn test_rate_limit_ban_duration_no_details_returns_none() { + let status = dapi_grpc::tonic::Status::resource_exhausted("too many requests"); + assert!(rate_limit_ban_duration(&status).is_none()); + } + + #[test] + fn test_rate_limit_ban_duration_zero_delay_returns_none() { + // A RetryInfo with delay = 0 should be treated as absent. + let status = make_retry_info_status(0, 0); + assert!(rate_limit_ban_duration(&status).is_none()); + } + + #[test] + fn test_rate_limit_ban_duration_unknown_type_url_returns_none() { + // A detail whose type_url does not contain "RetryInfo" must be ignored. + let other = GrpcStatus { + code: 8, + message: String::new(), + details: vec![ProtoAny { + type_url: "type.googleapis.com/google.rpc.QuotaFailure".to_string(), + value: vec![], + }], + }; + let mut bytes = Vec::new(); + other.encode(&mut bytes).unwrap(); + let status = dapi_grpc::tonic::Status::with_details_and_metadata( + dapi_grpc::tonic::Code::ResourceExhausted, + "quota", + bytes.into(), + Default::default(), + ); + assert!(rate_limit_ban_duration(&status).is_none()); + } + + #[test] + fn test_fallback_rate_limit_ban_duration_positive() { + // Whatever the env var is (or isn't set), the fallback is always > 0. + assert!(fallback_rate_limit_ban_duration().as_millis() > 0); + } +} diff --git a/packages/rs-dapi-client/src/transport.rs b/packages/rs-dapi-client/src/transport.rs index c96ceeabd7c..04ddf3e15e9 100644 --- a/packages/rs-dapi-client/src/transport.rs +++ b/packages/rs-dapi-client/src/transport.rs @@ -112,6 +112,12 @@ impl CanRetry for TransportError { TransportError::Grpc(status) => status.is_rate_limited(), } } + + fn rate_limit_ban_duration(&self) -> Option { + match self { + TransportError::Grpc(status) => status.rate_limit_ban_duration(), + } + } } /// Serialization of [TransportError]. diff --git a/packages/rs-dapi-client/src/transport/grpc.rs b/packages/rs-dapi-client/src/transport/grpc.rs index 22d13b5c6b7..36f113c5527 100644 --- a/packages/rs-dapi-client/src/transport/grpc.rs +++ b/packages/rs-dapi-client/src/transport/grpc.rs @@ -150,10 +150,18 @@ impl CanRetry for dapi_grpc::tonic::Status { fn is_rate_limited(&self) -> bool { // ResourceExhausted is the gRPC mapping of an upstream rate-limit / // backpressure (e.g. Envoy per-IP 429). It is retryable but the node - // is healthy, so it must rotate rather than ban (see - // `update_address_ban_status`). Exactly one code — do NOT widen this. + // is healthy, so it receives a fixed-duration ban rather than the + // exponential health-ban ladder (see `update_address_ban_status`). + // Exactly one code — do NOT widen this. self.code() == dapi_grpc::tonic::Code::ResourceExhausted } + + fn rate_limit_ban_duration(&self) -> Option { + if self.code() != dapi_grpc::tonic::Code::ResourceExhausted { + return None; + } + crate::rate_limit::rate_limit_ban_duration(self) + } } /// Macro to implement the `TransportRequest` trait for a given request type, response type, client type, and settings. diff --git a/packages/rs-dapi-client/tests/rate_limit_rotation.rs b/packages/rs-dapi-client/tests/rate_limit_rotation.rs deleted file mode 100644 index 8d04690b451..00000000000 --- a/packages/rs-dapi-client/tests/rate_limit_rotation.rs +++ /dev/null @@ -1,207 +0,0 @@ -//! Failover behavior for gRPC `ResourceExhausted` (rate-limit / backpressure). -//! -//! Unlike a genuine node failure, a rate-limited node is *healthy* — it is just -//! throttled. The executor must therefore NOT ban it (banning would relocate -//! its load onto the survivors and cascade into `NoAvailableAddressesToRetry`). -//! Instead the retry must rotate to a *different* node, leaving the throttled -//! one in the live pool. The bounded retry count bounds an over-limit client. -//! -//! These tests drive the real `DapiClient::execute` retry loop through a fake -//! transport, complementing the unit tests in `src/`. - -use std::sync::{Arc, Mutex}; - -use dapi_grpc::mock::Mockable; -use dapi_grpc::tonic::Status; -use rs_dapi_client::transport::{ - AppliedRequestSettings, BoxFuture, TransportClient, TransportError, TransportRequest, -}; -use rs_dapi_client::{ - Address, AddressList, CanRetry, ConnectionPool, DapiClient, DapiClientError, - DapiRequestExecutor, RequestSettings, Uri, -}; - -/// Transport client that only remembers which node it was created for. -struct FakeClient { - uri: Uri, -} - -impl TransportClient for FakeClient { - fn with_uri(uri: Uri, _pool: &ConnectionPool) -> Result { - Ok(Self { uri }) - } - - fn with_uri_and_settings( - uri: Uri, - _settings: &AppliedRequestSettings, - _pool: &ConnectionPool, - ) -> Result { - Ok(Self { uri }) - } -} - -#[derive(Debug)] -struct FakeResponse; - -impl Mockable for FakeResponse {} - -#[derive(Debug, Default)] -struct State { - /// How many more attempts should be answered with RESOURCE_EXHAUSTED, - /// simulating a throttled (but healthy) node. - rate_limited_responses_left: usize, - /// Nodes that answered RESOURCE_EXHAUSTED, in order. - rate_limited_uris: Vec, -} - -/// Request that answers RESOURCE_EXHAUSTED for the first N attempts and -/// succeeds afterwards, recording which node served each throttled attempt. -#[derive(Clone, Debug)] -struct RateLimitedRequest { - state: Arc>, -} - -impl RateLimitedRequest { - fn with_rate_limited_responses(count: usize) -> Self { - Self { - state: Arc::new(Mutex::new(State { - rate_limited_responses_left: count, - rate_limited_uris: Vec::new(), - })), - } - } -} - -impl Mockable for RateLimitedRequest {} - -impl TransportRequest for RateLimitedRequest { - type Client = FakeClient; - type Response = FakeResponse; - - const SETTINGS_OVERRIDES: RequestSettings = RequestSettings::default(); - - fn method_name(&self) -> &'static str { - "fake_rate_limited_method" - } - - fn execute_transport<'c>( - self, - client: &'c mut Self::Client, - _settings: &AppliedRequestSettings, - ) -> BoxFuture<'c, Result> { - let result = { - let mut state = self.state.lock().unwrap(); - if state.rate_limited_responses_left > 0 { - state.rate_limited_responses_left -= 1; - state.rate_limited_uris.push(client.uri.clone()); - Err(TransportError::Grpc(Status::resource_exhausted( - "rate limit exceeded", - ))) - } else { - Ok(FakeResponse) - } - }; - - Box::pin(async move { result }) - } -} - -/// A rate-limited node must NOT be banned, and the retry must rotate to a -/// different node, where the request succeeds. -#[tokio::test] -async fn rate_limited_node_is_not_banned_and_request_rotates_to_another() { - // One throttled response, then success on the rotated-to node. - let request = RateLimitedRequest::with_rate_limited_responses(1); - let address_list: AddressList = "http://127.0.0.1:20001,http://127.0.0.1:20002" - .parse() - .expect("valid address list"); - let client = DapiClient::new(address_list, RequestSettings::default()); - - let response = client - .execute(request.clone(), RequestSettings::default()) - .await - .expect("request should succeed on the rotated-to node"); - - let throttled_uri = { - let state = request.state.lock().unwrap(); - assert_eq!( - state.rate_limited_uris.len(), - 1, - "exactly one node should have answered RESOURCE_EXHAUSTED" - ); - state.rate_limited_uris[0].clone() - }; - - assert_eq!(response.retries, 1); - assert_ne!( - response.address.uri(), - &throttled_uri, - "retry must rotate to a different node than the throttled one" - ); - - // The throttled node stays healthy in the pool — it must NOT be banned. - let throttled = Address::try_from(throttled_uri).expect("valid address"); - assert!( - !client.address_list().is_banned(&throttled), - "a rate-limited node must NOT be banned" - ); -} - -/// Sustained congestion across *every* node must never ban any of them and must -/// never empty the live pool. The loop is bounded by the retry budget and -/// surfaces the raw, still-retryable `ResourceExhausted`. -#[tokio::test] -async fn congestion_never_bans_nodes_nor_empties_pool() { - let settings = RequestSettings { - retries: Some(3), - ..Default::default() - }; - - // Every attempt is throttled. - let request = RateLimitedRequest::with_rate_limited_responses(usize::MAX); - let address_list: AddressList = "http://127.0.0.1:20003,http://127.0.0.1:20004" - .parse() - .expect("valid address list"); - let client = DapiClient::new(address_list, settings); - - let error = client - .execute(request.clone(), settings) - .await - .expect_err("request must fail when every node is throttled"); - - // Bounded purely by the retry budget (no banning, no address exhaustion): - // initial attempt + 3 retries = 4 attempts. - assert_eq!( - request.state.lock().unwrap().rate_limited_uris.len(), - 4, - "attempts must be bounded by retry budget (retries + 1), not by banning" - ); - - // No node was banned, so the pool is never emptied by congestion. - for info in client.address_list().ban_info() { - assert!(!info.banned, "address {} must not be banned", info.uri); - assert_eq!(info.ban_count, 0, "ban_count must stay 0 for {}", info.uri); - assert!( - info.banned_until.is_none(), - "banned_until must stay None for {}", - info.uri - ); - } - assert!( - client.address_list().get_live_address().is_some(), - "congestion must never empty the live pool" - ); - - // The raw ResourceExhausted surfaces and stays retryable — it never - // collapses into the non-retryable NoAvailableAddressesToRetry. - assert!( - error.can_retry(), - "surfaced ResourceExhausted must stay retryable" - ); - match error.inner { - DapiClientError::Transport(TransportError::Grpc(status)) => { - assert_eq!(status.code(), dapi_grpc::tonic::Code::ResourceExhausted) - } - other => panic!("expected raw Transport(Grpc(ResourceExhausted)), got: {other:?}"), - } -} diff --git a/packages/rs-sdk/src/error.rs b/packages/rs-sdk/src/error.rs index 17a2506fbfc..a523271cc49 100644 --- a/packages/rs-sdk/src/error.rs +++ b/packages/rs-sdk/src/error.rs @@ -356,20 +356,26 @@ impl CanRetry for Error { /// Delegate rate-limit classification to the wrapped [`DapiClientError`]. /// - /// This keeps the rotate-don't-ban semantics introduced in `rs-dapi-client` - /// alive at the SDK layer independently of how [`Self::can_retry`] is - /// defined: should a future change make a wrapped transport error retryable, - /// `update_address_ban_status` will correctly rotate a throttled node rather - /// than ban it (today the `can_retry()` guard short-circuits first, so this - /// is pure future-proofing with no behavior change). - /// - /// Note this is the one `CanRetry` method where `is_rate_limited() ⇒ - /// can_retry()` need NOT hold at the SDK layer: the SDK's outer retry loop - /// is intentionally conservative (see `sync::retry`) and the actual - /// rate-limit rotation happens in the inner `rs-dapi-client` executor. + /// When a `ResourceExhausted` status carries a `google.rpc.RetryInfo` hint, + /// `update_address_ban_status` applies a server-dictated ban duration rather + /// than the exponential health-ban ladder. Delegating here keeps the SDK + /// layer transparent to that mechanism. fn is_rate_limited(&self) -> bool { matches!(self, Error::DapiClientError(inner) if inner.is_rate_limited()) } + + /// Delegate the server-dictated ban duration to the wrapped error. + /// + /// Returns the `google.rpc.RetryInfo` retry delay when the inner transport + /// error is `ResourceExhausted` and the status carries that detail; `None` + /// otherwise (caller should fall back to `DAPI_RATE_LIMIT_BAN_MS`). + fn rate_limit_ban_duration(&self) -> Option { + if let Error::DapiClientError(inner) = self { + inner.rate_limit_ban_duration() + } else { + None + } + } } /// Server returned stale metadata @@ -585,9 +591,9 @@ mod tests { } /// Regression: the SDK `Error` must propagate the wrapped - /// `DapiClientError`'s rate-limit classification so a throttled node is not - /// banned at the SDK layer (rotate-don't-ban). Locks the delegation so a - /// future refactor cannot silently drop it. + /// `DapiClientError`'s rate-limit classification so a throttled node is + /// banned for the server-dictated window rather than the health-ban ladder. + /// Locks the delegation so a future refactor cannot silently drop it. #[test] fn test_is_rate_limited_delegates_to_inner_dapi_client_error() { // A ResourceExhausted transport error wrapped by the SDK is rate-limited. @@ -610,4 +616,39 @@ mod tests { // Non-DAPI errors fall back to the trait default (false). assert!(!Error::Config("misconfigured".to_string()).is_rate_limited()); } + + #[test] + fn test_rate_limit_ban_duration_delegates_to_inner() { + // Non-rate-limited transport error → always None. + let unavailable: Error = DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::unavailable("down"), + )) + .into(); + assert!( + unavailable.rate_limit_ban_duration().is_none(), + "non-rate-limited error must not have a ban duration" + ); + + // Non-DAPI error → None (default impl). + assert!( + Error::Config("bad".into()) + .rate_limit_ban_duration() + .is_none(), + "non-DAPI SDK errors must return None" + ); + + // ResourceExhausted with no RetryInfo → None (caller uses fallback). + // This validates the delegation path without requiring the internal + // test helper from rs-dapi-client; the RetryInfo extraction is tested + // exhaustively in rs-dapi-client's rate_limit module unit tests. + let rate_limited: Error = DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::resource_exhausted("429"), + )) + .into(); + // No RetryInfo attached → delegation returns None (fallback will be used). + assert!( + rate_limited.rate_limit_ban_duration().is_none(), + "ResourceExhausted without RetryInfo must return None (caller uses fallback)" + ); + } } From c905b1661c9c9d25c18b36eea6120f8b0788a4ac Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 14:53:55 +0200 Subject: [PATCH 5/6] refactor(rs-dapi-client): drive rate-limit ban from Envoy reset header; drop hand-rolled rotation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace prost/RetryInfo-based approach with direct `ratelimit-reset` HTTP header parsing. No hand-rolled rotation machinery — banning a node already achieves rotation because `get_live_address()` skips banned nodes. Changes: - Delete `rate_limit.rs` (prost/RetryInfo decoder) and remove prost dep - Replace `is_rate_limited() -> bool` on `CanRetry` trait with `rate_limit_ban_duration() -> Option` (default `None`) - `tonic::Status` impl: if `ResourceExhausted` + `ratelimit-reset` header is a parseable u64 > 0, return `Some(clamped [1s, 600s])`; else `None` - Add `AddressStatus::ban_for(period, reason)` + `AddressList::ban_for` that sets `banned_until = now + period`, `ban_count = max(ban_count, 1)`; does NOT inflate the exponential health-ban ladder - `update_address_ban_status`: dispatch to `ban_for` on `Some(period)`, fall back to existing `ban_with_reason` on `None` - Revert execute loop to base: flat 10ms retry delay, plain `get_live_address()` - Revert `rs-sdk/src/error.rs` to base (banning stays inside rs-dapi-client) - Add `LIMIT_RESPONSE_HEADERS_ENABLED=true` to dashmate rate-limiter compose - Replace deleted `tests/rate_limit_rotation.rs` with `tests/rate_limit_ban.rs` (5 focused tests for the header-parse path) Co-Authored-By: Claude Sonnet 4.6 --- Cargo.lock | 1 - .../dashmate/docker-compose.rate_limiter.yml | 4 + packages/rs-dapi-client/Cargo.toml | 1 - packages/rs-dapi-client/src/address_list.rs | 108 ++-- packages/rs-dapi-client/src/dapi_client.rs | 513 +++++------------- packages/rs-dapi-client/src/executor.rs | 4 - packages/rs-dapi-client/src/lib.rs | 26 +- packages/rs-dapi-client/src/rate_limit.rs | 234 -------- packages/rs-dapi-client/src/transport.rs | 85 +-- packages/rs-dapi-client/src/transport/grpc.rs | 39 +- .../rs-dapi-client/tests/rate_limit_ban.rs | 208 +++++++ packages/rs-sdk/src/error.rs | 85 --- 12 files changed, 470 insertions(+), 838 deletions(-) delete mode 100644 packages/rs-dapi-client/src/rate_limit.rs create mode 100644 packages/rs-dapi-client/tests/rate_limit_ban.rs diff --git a/Cargo.lock b/Cargo.lock index 59c0ae19cda..e296c3aebdb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6276,7 +6276,6 @@ dependencies = [ "http", "http-serde", "lru", - "prost 0.14.4", "rand 0.8.6", "serde", "serde_json", diff --git a/packages/dashmate/docker-compose.rate_limiter.yml b/packages/dashmate/docker-compose.rate_limiter.yml index d652b7fa22d..b75836f6eda 100644 --- a/packages/dashmate/docker-compose.rate_limiter.yml +++ b/packages/dashmate/docker-compose.rate_limiter.yml @@ -42,6 +42,10 @@ services: - GRPC_MAX_CONNECTION_AGE=1h - GRPC_MAX_CONNECTION_AGE_GRACE=10m - GRPC_PORT=8081 + # Emit RateLimit-Limit / RateLimit-Remaining / RateLimit-Reset response + # headers so rs-dapi-client can read the exact reset window and ban the + # node for that duration instead of the exponential health-ban ladder. + - LIMIT_RESPONSE_HEADERS_ENABLED=true expose: - 8081 profiles: diff --git a/packages/rs-dapi-client/Cargo.toml b/packages/rs-dapi-client/Cargo.toml index 9694c6496e2..705f5ccec2f 100644 --- a/packages/rs-dapi-client/Cargo.toml +++ b/packages/rs-dapi-client/Cargo.toml @@ -60,7 +60,6 @@ lru = { version = "0.16.3" } serde = { version = "1.0.219", optional = true, features = ["derive"] } serde_json = { version = "1.0.140", optional = true } chrono = { version = "0.4.38", features = ["serde"] } -prost = { version = "0.14" } [dev-dependencies] tokio = { version = "1.40", features = ["macros"] } diff --git a/packages/rs-dapi-client/src/address_list.rs b/packages/rs-dapi-client/src/address_list.rs index e46a050cd90..44ff2a16cae 100644 --- a/packages/rs-dapi-client/src/address_list.rs +++ b/packages/rs-dapi-client/src/address_list.rs @@ -103,27 +103,18 @@ impl AddressStatus { self.ban_reason = reason; } - /// Ban the address for a fixed `duration` **without** incrementing the - /// health-ban counter ([`AddressStatus::ban_count`]). + /// Ban the address for an exact `period` (server-advertised), bypassing the + /// exponential ladder used by [`AddressStatus::ban_with_reason`]. /// - /// Used for rate-limit (`ResourceExhausted`) bans where the server - /// dictates when the node may be retried again. Because the node is - /// healthy (not failing), the exponential health-ban ladder must not be - /// touched — a future genuine failure should still get the first-offense - /// 60 s window. - /// - /// If the address already has a longer ban in effect (e.g. from a - /// previous health ban), the later expiry is kept so a health ban cannot - /// be shortened by a subsequent rate-limit. - pub fn ban_for_duration(&mut self, duration: Duration, reason: Option) { - let new_until = chrono::Utc::now() + duration; - self.banned_until = Some(match self.banned_until { - Some(existing) if existing > new_until => existing, - _ => new_until, - }); + /// Sets `ban_count = max(ban_count, 1)` so that diagnostics + /// (`is_banned()`, `ban_info()`) correctly report the node as banned, but + /// the genuine-failure exponential ladder is not inflated — a subsequent + /// health failure still escalates from the existing `ban_count`, not from an + /// artificially incremented one. + pub fn ban_for(&mut self, period: Duration, reason: Option) { + self.banned_until = Some(chrono::Utc::now() + period); + self.ban_count = self.ban_count.max(1); self.ban_reason = reason; - // ban_count intentionally NOT incremented: rate-limit is not a - // health signal and must not affect the exponential ban ladder. } /// Check if [Address] is banned. @@ -205,23 +196,18 @@ impl AddressList { true } - /// Ban the address for a fixed `duration` without touching the health-ban - /// counter. See [`AddressStatus::ban_for_duration`] for the full contract. + /// Ban the address for an exact `period` (server-advertised). + /// See [`AddressStatus::ban_for`] for the full contract. /// /// Returns `false` if the address is not in the list. - pub fn ban_for_duration( - &self, - address: &Address, - duration: Duration, - reason: Option, - ) -> bool { + pub fn ban_for(&self, address: &Address, period: Duration, reason: Option) -> bool { let mut guard = self.addresses.write().unwrap(); let Some(status) = guard.get_mut(address) else { return false; }; - status.ban_for_duration(duration, reason); + status.ban_for(period, reason); true } @@ -357,16 +343,11 @@ impl AddressList { guard .iter() .map(|(addr, status)| { - // An address is "effectively banned" when its ban window is - // still in the future. This is intentionally consistent with - // the `banned_until`-only filter in `get_live_address`: a - // rate-limit ban (`ban_count == 0`, `banned_until` in the - // future) is still a ban from the caller's perspective even - // though the health-ban counter is not incremented. - let banned = status - .banned_until - .map(|banned_until| banned_until >= now) - .unwrap_or(false); + let banned = status.ban_count > 0 + && status + .banned_until + .map(|banned_until| banned_until >= now) + .unwrap_or(false); AddressBanInfo { uri: addr.to_string(), banned, @@ -810,72 +791,67 @@ mod tests { } #[test] - fn test_address_status_ban_for_duration_sets_banned_until_no_ban_count() { + fn test_address_status_ban_for_sets_exact_window_and_min_ban_count() { let mut status = AddressStatus::default(); assert_eq!(status.ban_count, 0); assert!(status.banned_until.is_none()); let before = chrono::Utc::now(); - status.ban_for_duration(Duration::from_secs(30), Some("rate limited".into())); + status.ban_for(Duration::from_secs(45), Some("rate limited".into())); let after = chrono::Utc::now(); - // ban_count must stay at 0 — rate-limit is not a health signal. - assert_eq!( - status.ban_count, 0, - "ban_for_duration must not touch ban_count" - ); + // ban_count must be at least 1 so is_banned() / ban_info().banned are consistent. + assert_eq!(status.ban_count, 1, "ban_for sets ban_count to max(0,1)=1"); - // banned_until should be roughly now + 30 s. + // banned_until should be roughly now + 45 s. let until = status.banned_until.expect("banned_until must be set"); let lower = (until - before).num_milliseconds() as f64 / 1000.0; let upper = (until - after).num_milliseconds() as f64 / 1000.0; assert!( - lower >= 29.9, + lower >= 44.9, "banned_until lower bound too short: {lower}s" ); - assert!(upper <= 30.1, "banned_until upper bound too long: {upper}s"); + assert!(upper <= 45.1, "banned_until upper bound too long: {upper}s"); assert_eq!(status.ban_reason.as_deref(), Some("rate limited")); } #[test] - fn test_address_status_ban_for_duration_does_not_shorten_longer_ban() { + fn test_address_status_ban_for_does_not_inflate_existing_ban_count() { + // A node already health-banned (ban_count = 3) gets rate-limited. + // ban_count must stay at 3, not grow to 4. let mut status = AddressStatus::default(); - // First apply a long health ban via ban_for_duration (100 s). - status.ban_for_duration(Duration::from_secs(100), Some("node down".into())); - let long_until = status.banned_until.unwrap(); - - // Now apply a short rate-limit ban (10 s) — must NOT shorten the ban. - status.ban_for_duration(Duration::from_secs(10), Some("rate limited".into())); + let base = Duration::from_secs(60); + status.ban_with_reason(&base, None); // → 1 + status.ban_with_reason(&base, None); // → 2 + status.ban_with_reason(&base, None); // → 3 + status.ban_for(Duration::from_secs(30), Some("rl".into())); assert_eq!( - status.banned_until.unwrap(), - long_until, - "ban_for_duration must not shorten a longer existing ban" + status.ban_count, 3, + "ban_for must not inflate ban_count above its existing value" ); - // But the reason is updated. - assert_eq!(status.ban_reason.as_deref(), Some("rate limited")); } #[test] - fn test_address_list_ban_for_duration_returns_false_for_unknown() { + fn test_address_list_ban_for_returns_false_for_unknown() { let list = AddressList::new(); let addr: Address = "http://127.0.0.1:3000".parse().unwrap(); - assert!(!list.ban_for_duration(&addr, Duration::from_secs(5), None)); + assert!(!list.ban_for(&addr, Duration::from_secs(5), None)); } #[test] - fn test_address_list_ban_for_duration_bans_known_address() { + fn test_address_list_ban_for_bans_known_address() { let mut list = AddressList::new(); let addr: Address = "http://127.0.0.1:3000".parse().unwrap(); list.add(addr.clone()); - assert!(list.ban_for_duration(&addr, Duration::from_secs(60), Some("rl".into()))); + assert!(list.ban_for(&addr, Duration::from_secs(60), Some("rl".into()))); // The address must now be hidden from get_live_address. assert!(list.get_live_address().is_none()); - // ban_count is still 0 (rate-limit ban, not health ban). + // ban_count is 1 (ban_for sets max(0,1)). let info = list.ban_info(); assert_eq!(info.len(), 1); assert!(info[0].banned); - assert_eq!(info[0].ban_count, 0); + assert_eq!(info[0].ban_count, 1); } /// Invariant 1 at the ladder source: the exponential ban window is diff --git a/packages/rs-dapi-client/src/dapi_client.rs b/packages/rs-dapi-client/src/dapi_client.rs index 3ac30b0cb75..ad73fa7eecc 100644 --- a/packages/rs-dapi-client/src/dapi_client.rs +++ b/packages/rs-dapi-client/src/dapi_client.rs @@ -10,7 +10,6 @@ use tracing::Instrument; use crate::address_list::AddressListError; use crate::connection_pool::ConnectionPool; -use crate::rate_limit::{fallback_rate_limit_ban_duration, RATE_LIMIT_BAN_ENV_VAR}; use crate::request_settings::AppliedRequestSettings; use crate::transport::{self, TransportError}; use crate::{ @@ -19,13 +18,13 @@ use crate::{ RequestSettings, }; -/// Flat delay between retries for all non-rate-limited failures. -/// -/// Rate-limited (`ResourceExhausted`) failures ban the node for the -/// server-dictated window (see [`crate::rate_limit`] and -/// `DAPI_RATE_LIMIT_BAN_MS`); the retry then picks the next live node -/// immediately without an additional sleep. -const RETRY_DELAY_MS: u64 = 10; +/// Floor for the Envoy-advertised `RateLimit-Reset` ban duration. +/// Prevents re-ban thrash when the header carries a value of 0 or 1. +pub(crate) const MIN_RATE_LIMIT_BAN_SECS: u64 = 1; +/// Ceiling for the Envoy-advertised `RateLimit-Reset` ban duration. +/// Prevents a misconfigured or hostile header from parking a healthy node for +/// an unreasonably long time. +pub(crate) const MAX_RATE_LIMIT_BAN_SECS: u64 = 600; /// General DAPI request error type. #[derive(Debug, thiserror::Error, Clone)] @@ -77,18 +76,6 @@ impl CanRetry for DapiClientError { ) } - fn is_rate_limited(&self) -> bool { - use DapiClientError::*; - match self { - NoAvailableAddresses => false, - NoAvailableAddressesToRetry(_) => false, - Transport(transport_error) => transport_error.is_rate_limited(), - AddressList(_) => false, - #[cfg(feature = "mocks")] - Mock(_) => false, - } - } - fn rate_limit_ban_duration(&self) -> Option { match self { DapiClientError::Transport(te) => te.rate_limit_ban_duration(), @@ -213,51 +200,27 @@ pub fn update_address_ban_status( } Err(error) => { if error.can_retry() { - if error.is_rate_limited() { - // ResourceExhausted is a server-side rate-limit / congestion - // signal, not a node health failure. Ban the node for the - // duration Envoy embeds in `google.rpc.RetryInfo`; fall back - // to the `DAPI_RATE_LIMIT_BAN_MS` env var (default 60 s) when - // no such hint is present. Because the node is healthy, the - // health-ban counter (ban_count) is NOT incremented — the - // exponential health-ban ladder stays intact. - if let Some(address) = error.address.as_ref() { - if applied_settings.ban_failed_address { - let ban_dur = error - .rate_limit_ban_duration() - .unwrap_or_else(fallback_rate_limit_ban_duration); - if address_list.ban_for_duration( - address, - ban_dur, - Some(error.to_string()), - ) { - tracing::debug!( - ?address, - ban_ms = ban_dur.as_millis() as u64, - "rate-limited (ResourceExhausted): banning \ - {address} for {}ms (env {RATE_LIMIT_BAN_ENV_VAR})", - ban_dur.as_millis() - ); - } else { + if let Some(address) = error.address.as_ref() { + if applied_settings.ban_failed_address { + let reason = Some(error.to_string()); + let banned = match error.rate_limit_ban_duration() { + // Envoy advertised a reset window: ban for exactly that period. + // ban_count is set to max(ban_count,1) so diagnostics see the node + // as banned, but the exponential ladder is not inflated. + Some(period) => { tracing::debug!( ?address, - "unable to apply rate-limit ban to {address}: \ - not in address list (already removed?)" + ban_secs = period.as_secs(), + "rate-limited (ResourceExhausted): banning {address} \ + for {}s (from RateLimit-Reset header)", + period.as_secs() ); + address_list.ban_for(address, period, reason) } - } else { - tracing::debug!( - ?error, - ?address, - "rate-limited address {address}: ban disabled by settings" - ); - } - } else { - tracing::debug!(?error, "rate-limited error has no address; skipping ban"); - } - } else if let Some(address) = error.address.as_ref() { - if applied_settings.ban_failed_address { - if address_list.ban_with_reason(address, Some(error.to_string())) { + // No rate-limit hint: normal exponential health-ban ladder. + None => address_list.ban_with_reason(address, reason), + }; + if banned { tracing::warn!( ?address, ?error, @@ -346,134 +309,126 @@ mod tests { assert!(!err.can_retry()); } + /// `rate_limit_ban_duration` returns `Some` only when the `ratelimit-reset` + /// header is present and positive on a `ResourceExhausted` response, and + /// the value is clamped to `[MIN_RATE_LIMIT_BAN_SECS, MAX_RATE_LIMIT_BAN_SECS]`. #[test] - fn test_dapi_client_error_is_rate_limited() { - // Transport ResourceExhausted → rate-limited (but still retryable). - let rate_limited = DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::resource_exhausted("429"), - )); - assert!(rate_limited.is_rate_limited()); - assert!(rate_limited.can_retry()); + fn test_rate_limit_ban_duration_header_parse() { + use dapi_grpc::tonic::metadata::MetadataValue; + + // Helper: build a ResourceExhausted status with a ratelimit-reset header. + let make_rl_status = |header: Option<&str>| -> dapi_grpc::tonic::Status { + let mut status = dapi_grpc::tonic::Status::resource_exhausted("429"); + if let Some(v) = header { + status + .metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from(v).unwrap()); + } + status + }; - // Other transport codes are not rate-limited. - let unavailable = DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::unavailable("down"), - )); - assert!(!unavailable.is_rate_limited()); + // Normal header value: returned clamped. + let s = make_rl_status(Some("45")); + let dur = TransportError::Grpc(s).rate_limit_ban_duration(); + assert_eq!(dur, Some(Duration::from_secs(45))); + + // Value above MAX → clamped to MAX. + let s = make_rl_status(Some("9999")); + let dur = TransportError::Grpc(s).rate_limit_ban_duration(); + assert_eq!(dur, Some(Duration::from_secs(MAX_RATE_LIMIT_BAN_SECS))); + + // Value below MIN (0) → no header → None. + let s = make_rl_status(Some("0")); + assert!(TransportError::Grpc(s).rate_limit_ban_duration().is_none()); + + // Non-numeric → None. + let s = make_rl_status(Some("garbage")); + assert!(TransportError::Grpc(s).rate_limit_ban_duration().is_none()); + + // Header absent → None. + let s = make_rl_status(None); + assert!(TransportError::Grpc(s).rate_limit_ban_duration().is_none()); + + // Non-ResourceExhausted code → None regardless of header. + let mut unavail = dapi_grpc::tonic::Status::unavailable("down"); + unavail + .metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from("30").unwrap()); + assert!(TransportError::Grpc(unavail) + .rate_limit_ban_duration() + .is_none()); + } + + /// When `ResourceExhausted` carries a valid `ratelimit-reset` header, + /// `update_address_ban_status` calls `ban_for` (exact period, no ladder + /// inflation); when the header is absent it falls through to `ban_with_reason` + /// (normal exponential ladder). + #[test] + fn test_update_address_ban_status_rate_limit_ban_path() { + use dapi_grpc::tonic::metadata::MetadataValue; - // Non-transport variants are never rate-limited. - assert!(!DapiClientError::NoAvailableAddresses.is_rate_limited()); - let to_retry = DapiClientError::NoAvailableAddressesToRetry(Box::new( - TransportError::Grpc(dapi_grpc::tonic::Status::resource_exhausted("429")), - )); - assert!(!to_retry.is_rate_limited()); + let mut address_list = AddressList::new(); + let addr = mock_address(); + address_list.add(addr.clone()); + + // Build a ResourceExhausted status with ratelimit-reset: 45. + let mut status = dapi_grpc::tonic::Status::resource_exhausted("429"); + status + .metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from("45").unwrap()); + + let result: ExecutionResult = Err(ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc(status)), + retries: 0, + address: Some(addr.clone()), + }); + let before = chrono::Utc::now(); + update_address_ban_status(&address_list, &result, &make_applied_settings(true)); + let after = chrono::Utc::now(); + + let info = address_list.ban_info(); + let entry = info.iter().find(|i| i.uri == addr.to_string()).unwrap(); + + // Node is banned for ~45 s. + assert!(entry.banned, "rate-limited node must be banned"); + assert_eq!(entry.ban_count, 1, "ban_count must be 1 after ban_for"); + let until = entry.banned_until.expect("banned_until set"); + let lo = (until - before).num_milliseconds() as f64 / 1000.0; + let hi = (until - after).num_milliseconds() as f64 / 1000.0; assert!( - !DapiClientError::AddressList(AddressListError::InvalidAddressUri("bad".to_string())) - .is_rate_limited() + lo >= 44.9 && hi <= 45.1, + "ban window must be ~45 s, got lo={lo} hi={hi}" ); } + /// When `ResourceExhausted` has NO `ratelimit-reset` header, + /// `update_address_ban_status` must fall back to the normal `ban_with_reason` + /// ladder (not produce a zero-second or panic ban). #[test] - fn test_execution_error_is_rate_limited_delegates() { - let err: ExecutionError = ExecutionError { - inner: DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::resource_exhausted("429"), - )), - retries: 0, - address: Some(mock_address()), - }; - assert!(err.is_rate_limited()); + fn test_update_address_ban_status_rate_limit_no_header_uses_ladder() { + let mut address_list = AddressList::new(); + let addr = mock_address(); + address_list.add(addr.clone()); - let err: ExecutionError = ExecutionError { + let result: ExecutionResult = Err(ExecutionError { inner: DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::internal("boom"), + dapi_grpc::tonic::Status::resource_exhausted("429"), )), retries: 0, - address: Some(mock_address()), - }; - assert!(!err.is_rate_limited()); - } - - /// Property: the load-bearing `is_rate_limited() ⇒ can_retry()` invariant - /// holds for *every* gRPC status code across *every* in-crate `CanRetry` - /// implementor (`tonic::Status`, `TransportError`, `DapiClientError`, - /// `ExecutionError`). - /// - /// The rotate-don't-ban behavior depends on this: a rate-limited error that - /// is NOT retryable would skip the retry loop, silently killing rotation - /// (see the `CanRetry` docs and the `debug_assert!` in `execute`). This test - /// pins the invariant so any future widening of `is_rate_limited` (or - /// narrowing of `can_retry`) that breaks it fails loudly here. - #[test] - fn test_rate_limit_implies_retryable_invariant() { - use dapi_grpc::tonic::Code::*; - - // The full gRPC code set — keep exhaustive so a newly classified code - // can never silently dodge the invariant check. - let all_codes = [ - Ok, - Cancelled, - Unknown, - InvalidArgument, - DeadlineExceeded, - NotFound, - AlreadyExists, - PermissionDenied, - ResourceExhausted, - FailedPrecondition, - Aborted, - OutOfRange, - Unimplemented, - Internal, - Unavailable, - DataLoss, - Unauthenticated, - ]; - - let addr = mock_address(); - - for code in all_codes { - // `tonic::Status` is not `Clone`, so build a fresh one per layer. - let status = dapi_grpc::tonic::Status::new(code, "x"); - assert!( - !status.is_rate_limited() || status.can_retry(), - "tonic::Status {code:?}: is_rate_limited() must imply can_retry()" - ); - - let te = TransportError::Grpc(dapi_grpc::tonic::Status::new(code, "x")); - assert!( - !te.is_rate_limited() || te.can_retry(), - "TransportError {code:?}: is_rate_limited() must imply can_retry()" - ); - - let dce = DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::new(code, "x"), - )); - assert!( - !dce.is_rate_limited() || dce.can_retry(), - "DapiClientError {code:?}: is_rate_limited() must imply can_retry()" - ); - - let ee: ExecutionError = ExecutionError { - inner: DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::new(code, "x"), - )), - retries: 0, - address: Some(addr.clone()), - }; - assert!( - !ee.is_rate_limited() || ee.can_retry(), - "ExecutionError {code:?}: is_rate_limited() must imply can_retry()" - ); - } + address: Some(addr.clone()), + }); + update_address_ban_status(&address_list, &result, &make_applied_settings(true)); - // Guard the non-vacuous case: ResourceExhausted must actually be BOTH - // rate-limited AND retryable, otherwise the invariant could hold only - // because nothing is ever rate-limited (the feature would be dead). - let re = dapi_grpc::tonic::Status::resource_exhausted("429"); + // The ban ladder is invoked: first ban → ban_count = 1, window = 60 s. + let info = address_list.ban_info(); + let entry = info.iter().find(|i| i.uri == addr.to_string()).unwrap(); assert!( - re.is_rate_limited() && re.can_retry(), - "ResourceExhausted must be rate-limited AND retryable" + entry.banned, + "node must be banned on ResourceExhausted without header" + ); + assert_eq!( + entry.ban_count, 1, + "first health-ladder ban → ban_count = 1" ); } @@ -565,178 +520,6 @@ mod tests { ); } - /// Invariant 1: the genuine-failure health-ban ladder is unchanged. - /// - /// A genuine failure (Unavailable / Internal) must still ban the node with - /// the `60s × e^ban_count` exponential window, `ban_count` incrementing on - /// each ban — byte-for-byte the pre-existing behavior. - #[test] - fn test_invariant_genuine_failure_still_bans_with_unchanged_ladder() { - let mut address_list = AddressList::new(); // base ban period = 60s - let addr = mock_address(); - address_list.add(addr.clone()); - - let base_secs = 60.0_f64; // DEFAULT_BASE_BAN_PERIOD - - for expected_count in 1..=3usize { - // Alternate genuine-failure codes to prove the ladder is not - // code-specific (both are non-rate-limited, retryable failures). - let status = if expected_count % 2 == 1 { - dapi_grpc::tonic::Status::unavailable("node down") - } else { - dapi_grpc::tonic::Status::internal("boom") - }; - - let before = chrono::Utc::now(); - let result: ExecutionResult = Err(ExecutionError { - inner: DapiClientError::Transport(TransportError::Grpc(status)), - retries: 0, - address: Some(addr.clone()), - }); - update_address_ban_status(&address_list, &result, &make_applied_settings(true)); - let after = chrono::Utc::now(); - - let info = address_list.ban_info(); - let entry = info - .iter() - .find(|i| i.uri.contains("3000")) - .expect("address present in ban info"); - - assert!(entry.banned, "genuine failure must ban the node"); - assert_eq!( - entry.ban_count, expected_count, - "ban_count must increment on each genuine failure" - ); - - // Window must equal base × e^(ban_count - 1). Since - // banned_until = t_call + period with before <= t_call <= after: - // (banned_until - before) >= period and - // (banned_until - after) <= period. - let period = base_secs * ((expected_count - 1) as f64).exp(); - let banned_until = entry.banned_until.expect("banned_until is set"); - let lower = (banned_until - before).num_milliseconds() as f64 / 1000.0; - let upper = (banned_until - after).num_milliseconds() as f64 / 1000.0; - assert!( - lower >= period - 0.05, - "ban #{expected_count} window lower bound {lower}s < expected {period}s", - ); - assert!( - upper <= period + 0.05, - "ban #{expected_count} window upper bound {upper}s > expected {period}s", - ); - } - } - - /// Invariant 2: rate-limited bans use a fixed-duration window, not the - /// exponential health-ban ladder. - /// - /// After a `ResourceExhausted` error: - /// * the address IS banned (`banned_until` is `Some` and in the future), so - /// subsequent retries naturally pick a *different* node; - /// * `ban_count` stays at 0 — the health-ban ladder is untouched; - /// * the pool is NOT permanently empty: the ban expires after the window. - #[test] - fn test_invariant_rate_limited_bans_with_duration_not_health_ladder() { - let mut address_list = AddressList::new(); - let addresses: Vec = (3000..3003) - .map(|p| { - format!("http://127.0.0.1:{p}") - .parse() - .expect("valid address") - }) - .collect(); - for addr in &addresses { - address_list.add(addr.clone()); - } - - // Apply a ResourceExhausted to each address once (no RetryInfo → fallback duration). - for addr in &addresses { - let result: ExecutionResult = Err(ExecutionError { - inner: DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::resource_exhausted("429"), - )), - retries: 0, - address: Some(addr.clone()), - }); - update_address_ban_status(&address_list, &result, &make_applied_settings(true)); - } - - for info in address_list.ban_info() { - // Rate-limited: DOES ban (window from server or fallback). - assert!( - info.banned, - "rate-limited address {} must be temporarily banned", - info.uri - ); - // Health-ban ladder: untouched. - assert_eq!( - info.ban_count, 0, - "ban_count must stay 0 after rate-limit ban for {}", - info.uri - ); - assert!( - info.banned_until.is_some(), - "banned_until must be set after rate-limit ban for {}", - info.uri - ); - } - } - - /// Invariant 2b: a rate-limit ban must NOT increment ban_count even when - /// a genuine health ban was already applied to the same address. - #[test] - fn test_invariant_rate_limit_ban_does_not_increment_ban_count() { - let mut address_list = AddressList::new(); - let addr = mock_address(); - address_list.add(addr.clone()); - - // First: genuine health failure → ban_count = 1. - let genuine: ExecutionResult = Err(ExecutionError { - inner: DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::unavailable("node down"), - )), - retries: 0, - address: Some(addr.clone()), - }); - update_address_ban_status(&address_list, &genuine, &make_applied_settings(true)); - let count_after_health_ban = address_list - .ban_info() - .into_iter() - .find(|i| i.uri.contains("3000")) - .map(|i| i.ban_count) - .unwrap_or(0); - assert_eq!( - count_after_health_ban, 1, - "health ban must set ban_count = 1" - ); - - // Unban to simulate ban expiry. - address_list.unban(&addr); - - // Then: rate-limit → ban_count must remain at 0 (fresh after unban). - // But even starting from ban_count=0 post-unban, a rate-limit must not - // push it to 1. - let rate_limited: ExecutionResult = Err(ExecutionError { - inner: DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::resource_exhausted("429"), - )), - retries: 0, - address: Some(addr.clone()), - }); - update_address_ban_status(&address_list, &rate_limited, &make_applied_settings(true)); - - let info = address_list.ban_info(); - let entry = info - .iter() - .find(|i| i.uri == addr.to_string()) - .expect("address in ban_info"); - assert_eq!( - entry.ban_count, 0, - "rate-limit ban must NOT increment ban_count" - ); - assert!(entry.banned, "address must still be temporarily banned"); - } - #[test] fn test_update_address_ban_status_retryable_error_ban_disabled() { let mut address_list = AddressList::new(); @@ -929,16 +712,15 @@ impl DapiRequestExecutor for DapiClient { let dump_request = request.clone(); let max_retries = applied_settings.retries; + let retry_delay = Duration::from_millis(10); let mut retries: usize = 0; - // Track the last transport error for when all addresses get exhausted. + // Track the last transport error for when all addresses get exhausted let mut last_transport_error: Option = None; let result: ExecutionResult = async { loop { - // Pick a live (not-banned) address. Rate-limited nodes are - // banned for a server-dictated window by update_address_ban_status, - // so they are naturally excluded until the window expires. + // Try to get an address to initialize transport on: let Some(address) = self.address_list.get_live_address() else { // No available addresses - wrap with last meaningful error if we have one let error = if let Some(transport_error) = last_transport_error.take() { @@ -993,15 +775,6 @@ impl DapiRequestExecutor for DapiClient { // Clone error before moving it let cloned_error = transport_error.clone(); - // Invariant lock (mirror of the transport-error arm - // below): a rate-limited error MUST be retryable, else - // the rotation never fires. See `CanRetry` docs and - // `test_rate_limit_implies_retryable_invariant`. - debug_assert!( - !cloned_error.is_rate_limited() || cloned_error.can_retry(), - "is_rate_limited() must imply can_retry(); rate-limit rotation depends on it" - ); - let execution_error = ExecutionError { inner: DapiClientError::Transport(transport_error), retries, @@ -1021,10 +794,10 @@ impl DapiRequestExecutor for DapiClient { retries += 1; tracing::warn!( error = ?execution_error, - delay_ms = RETRY_DELAY_MS, - "retrying error" + "retrying error with sleeping {} secs", + retry_delay.as_secs_f32() ); - transport::sleep(Duration::from_millis(RETRY_DELAY_MS)).await; + transport::sleep(retry_delay).await; continue; } @@ -1071,14 +844,6 @@ impl DapiRequestExecutor for DapiClient { match execution_result { Ok(response) => return Ok(response), Err(error) => { - // Invariant: a rate-limited error MUST be retryable so - // update_address_ban_status applies the ban-for-duration - // path. See CanRetry docs and - // test_rate_limit_implies_retryable_invariant. - debug_assert!( - !error.is_rate_limited() || error.can_retry(), - "is_rate_limited() must imply can_retry(); ban-for-duration depends on it" - ); if error.can_retry() && retries < max_retries { // Store last transport error if let DapiClientError::Transport(ref te) = error.inner { @@ -1088,10 +853,10 @@ impl DapiRequestExecutor for DapiClient { retries += 1; tracing::warn!( ?error, - delay_ms = RETRY_DELAY_MS, - "retrying error" + "retrying error with sleeping {} secs", + retry_delay.as_secs_f32() ); - transport::sleep(Duration::from_millis(RETRY_DELAY_MS)).await; + transport::sleep(retry_delay).await; continue; } diff --git a/packages/rs-dapi-client/src/executor.rs b/packages/rs-dapi-client/src/executor.rs index b68c99961e7..cdab8c6db1a 100644 --- a/packages/rs-dapi-client/src/executor.rs +++ b/packages/rs-dapi-client/src/executor.rs @@ -84,10 +84,6 @@ impl CanRetry for ExecutionError { self.inner.is_no_available_addresses() } - fn is_rate_limited(&self) -> bool { - self.inner.is_rate_limited() - } - fn rate_limit_ban_duration(&self) -> Option { self.inner.rate_limit_ban_duration() } diff --git a/packages/rs-dapi-client/src/lib.rs b/packages/rs-dapi-client/src/lib.rs index 3eb02ab47fa..2f77b2f7fcc 100644 --- a/packages/rs-dapi-client/src/lib.rs +++ b/packages/rs-dapi-client/src/lib.rs @@ -11,7 +11,6 @@ pub mod dump; mod executor; #[cfg(feature = "mocks")] pub mod mock; -mod rate_limit; mod request_settings; pub mod transport; @@ -96,26 +95,11 @@ pub trait CanRetry { false } - /// Returns true if this error is a rate-limit / congestion signal - /// (gRPC `ResourceExhausted`). - /// - /// Rate-limit errors are retryable (see [`CanRetry::can_retry`]) but, - /// unlike genuine node ill-health, the node is healthy — just throttled. - /// `update_address_ban_status` applies a short, server-dictated ban via - /// [`CanRetry::rate_limit_ban_duration`] instead of the exponential - /// health-ban ladder, so the node re-joins the live pool as soon as the - /// rate-limit window expires. - fn is_rate_limited(&self) -> bool { - false - } - - /// Returns the server-suggested ban duration for a rate-limited error. - /// - /// When a `ResourceExhausted` status carries a `google.rpc.RetryInfo` - /// detail, this returns the embedded `retry_delay`. Returns `None` for - /// all non-rate-limited errors and for rate-limited errors that carry no - /// server hint (the caller should use a configurable fallback in that - /// case — see `DAPI_RATE_LIMIT_BAN_MS`). + /// If this error is a gRPC `ResourceExhausted` (Envoy rate-limit) that + /// carries a `RateLimit-Reset` metadata header, returns the server-advertised + /// ban duration (clamped to a safe range). Returns `None` for all other + /// errors and for rate-limit errors that carry no usable header (the caller + /// falls back to the normal exponential ban ladder in that case). fn rate_limit_ban_duration(&self) -> Option { None } diff --git a/packages/rs-dapi-client/src/rate_limit.rs b/packages/rs-dapi-client/src/rate_limit.rs deleted file mode 100644 index d17413a596a..00000000000 --- a/packages/rs-dapi-client/src/rate_limit.rs +++ /dev/null @@ -1,234 +0,0 @@ -//! Rate-limit ban-duration extraction from Envoy gRPC status details. -//! -//! When Envoy returns `ResourceExhausted` it optionally attaches a -//! `google.rpc.RetryInfo` message inside the `grpc-status-details-bin` binary -//! trailer, telling the client exactly when it may retry. This module extracts -//! that hint so the address ban can expire at the right moment. -//! -//! If the trailer is absent or carries no `RetryInfo`, a configurable fallback -//! duration is used instead. Set `DAPI_RATE_LIMIT_BAN_MS` in the process -//! environment (e.g. via dashmate node configuration) to tune the fallback -//! without code changes. - -use prost::Message; -use std::time::Duration; - -// --------------------------------------------------------------------------- -// Public API -// --------------------------------------------------------------------------- - -/// Environment variable name for the fallback rate-limit ban duration -/// (milliseconds). -/// -/// When a `ResourceExhausted` status does not carry `google.rpc.RetryInfo`, -/// the address is banned for this many milliseconds. Absent or non-numeric -/// values fall back to [`DEFAULT_RATE_LIMIT_BAN_MS`] (60 000 ms = 60 s). -pub(crate) const RATE_LIMIT_BAN_ENV_VAR: &str = "DAPI_RATE_LIMIT_BAN_MS"; - -const DEFAULT_RATE_LIMIT_BAN_MS: u64 = 60_000; // 60 s - -/// Try to extract `google.rpc.RetryInfo.retry_delay` from the -/// `grpc-status-details-bin` binary trailer of a gRPC status. -/// -/// Returns `None` when -/// * the trailer is absent or empty, -/// * the bytes are not a valid `google.rpc.Status`, -/// * no `RetryInfo` detail is present, or -/// * the embedded delay is zero. -/// -/// The caller should fall back to [`fallback_rate_limit_ban_duration`] in -/// those cases. -pub(crate) fn rate_limit_ban_duration(status: &dapi_grpc::tonic::Status) -> Option { - let bytes = status.details(); - if bytes.is_empty() { - return None; - } - - let grpc_status = GrpcStatus::decode(bytes).ok()?; - - for any in &grpc_status.details { - // Match by type URL substring; the full canonical URL is - // "type.googleapis.com/google.rpc.RetryInfo". - if !any.type_url.contains("RetryInfo") { - continue; - } - let retry_info = RetryInfo::decode(any.value.as_slice()).ok()?; - if let Some(d) = retry_info.retry_delay { - let secs = d.seconds.max(0) as u64; - let nanos = d.nanos.max(0) as u32; - let duration = Duration::new(secs, nanos); - if !duration.is_zero() { - return Some(duration); - } - } - } - None -} - -/// Return the fallback rate-limit ban duration. -/// -/// Reads [`RATE_LIMIT_BAN_ENV_VAR`] (milliseconds). Falls back to -/// `DEFAULT_RATE_LIMIT_BAN_MS` (60 s) when the variable is absent or -/// contains a non-numeric value. -/// -/// On WASM targets environment variables are unavailable; the constant default -/// is always returned. -pub(crate) fn fallback_rate_limit_ban_duration() -> Duration { - #[cfg(not(target_arch = "wasm32"))] - if let Ok(v) = std::env::var(RATE_LIMIT_BAN_ENV_VAR) { - if let Ok(ms) = v.parse::() { - return Duration::from_millis(ms); - } - } - Duration::from_millis(DEFAULT_RATE_LIMIT_BAN_MS) -} - -// --------------------------------------------------------------------------- -// Minimal proto types -// -// These mirror only the fields we need from google.rpc.Status, -// google.protobuf.Any, google.rpc.RetryInfo, and google.protobuf.Duration. -// Proto3 field tags match the canonical google.rpc proto definitions. -// --------------------------------------------------------------------------- - -#[derive(Clone, prost::Message)] -struct GrpcStatus { - #[prost(int32, tag = "1")] - code: i32, - #[prost(string, tag = "2")] - message: String, - /// repeated google.protobuf.Any details = 3; - #[prost(message, repeated, tag = "3")] - details: Vec, -} - -#[derive(Clone, prost::Message)] -struct ProtoAny { - /// string type_url = 1; - #[prost(string, tag = "1")] - type_url: String, - /// bytes value = 2; - #[prost(bytes = "vec", tag = "2")] - value: Vec, -} - -#[derive(Clone, prost::Message)] -struct RetryInfo { - /// google.protobuf.Duration retry_delay = 1; - #[prost(message, optional, tag = "1")] - retry_delay: Option, -} - -#[derive(Clone, prost::Message)] -struct ProtoDuration { - /// int64 seconds = 1; - #[prost(int64, tag = "1")] - seconds: i64, - /// int32 nanos = 2; - #[prost(int32, tag = "2")] - nanos: i32, -} - -// --------------------------------------------------------------------------- -// Test helpers shared with other test modules -// --------------------------------------------------------------------------- - -/// Build a `ResourceExhausted` tonic status carrying a `google.rpc.RetryInfo` -/// with the given delay. Available in test builds only. -#[cfg(test)] -pub(crate) fn make_retry_info_status(seconds: i64, nanos: i32) -> dapi_grpc::tonic::Status { - let retry_info = RetryInfo { - retry_delay: Some(ProtoDuration { seconds, nanos }), - }; - let mut ri_bytes = Vec::new(); - retry_info.encode(&mut ri_bytes).expect("encode RetryInfo"); - - let grpc_status = GrpcStatus { - code: 8, // RESOURCE_EXHAUSTED - message: String::new(), - details: vec![ProtoAny { - type_url: "type.googleapis.com/google.rpc.RetryInfo".to_string(), - value: ri_bytes, - }], - }; - let mut gs_bytes = Vec::new(); - grpc_status - .encode(&mut gs_bytes) - .expect("encode GrpcStatus"); - - dapi_grpc::tonic::Status::with_details_and_metadata( - dapi_grpc::tonic::Code::ResourceExhausted, - "rate limited", - gs_bytes.into(), - Default::default(), - ) -} - -// --------------------------------------------------------------------------- -// Unit tests -// --------------------------------------------------------------------------- - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_rate_limit_ban_duration_extracts_retry_info() { - let status = make_retry_info_status(5, 0); - assert_eq!( - rate_limit_ban_duration(&status), - Some(Duration::from_secs(5)) - ); - } - - #[test] - fn test_rate_limit_ban_duration_fractional_seconds() { - // 2 seconds + 500 000 000 ns = 2.5 s - let status = make_retry_info_status(2, 500_000_000); - assert_eq!( - rate_limit_ban_duration(&status), - Some(Duration::new(2, 500_000_000)) - ); - } - - #[test] - fn test_rate_limit_ban_duration_no_details_returns_none() { - let status = dapi_grpc::tonic::Status::resource_exhausted("too many requests"); - assert!(rate_limit_ban_duration(&status).is_none()); - } - - #[test] - fn test_rate_limit_ban_duration_zero_delay_returns_none() { - // A RetryInfo with delay = 0 should be treated as absent. - let status = make_retry_info_status(0, 0); - assert!(rate_limit_ban_duration(&status).is_none()); - } - - #[test] - fn test_rate_limit_ban_duration_unknown_type_url_returns_none() { - // A detail whose type_url does not contain "RetryInfo" must be ignored. - let other = GrpcStatus { - code: 8, - message: String::new(), - details: vec![ProtoAny { - type_url: "type.googleapis.com/google.rpc.QuotaFailure".to_string(), - value: vec![], - }], - }; - let mut bytes = Vec::new(); - other.encode(&mut bytes).unwrap(); - let status = dapi_grpc::tonic::Status::with_details_and_metadata( - dapi_grpc::tonic::Code::ResourceExhausted, - "quota", - bytes.into(), - Default::default(), - ); - assert!(rate_limit_ban_duration(&status).is_none()); - } - - #[test] - fn test_fallback_rate_limit_ban_duration_positive() { - // Whatever the env var is (or isn't set), the fallback is always > 0. - assert!(fallback_rate_limit_ban_duration().as_millis() > 0); - } -} diff --git a/packages/rs-dapi-client/src/transport.rs b/packages/rs-dapi-client/src/transport.rs index 04ddf3e15e9..bea0968fd2f 100644 --- a/packages/rs-dapi-client/src/transport.rs +++ b/packages/rs-dapi-client/src/transport.rs @@ -107,12 +107,6 @@ impl CanRetry for TransportError { } } - fn is_rate_limited(&self) -> bool { - match self { - TransportError::Grpc(status) => status.is_rate_limited(), - } - } - fn rate_limit_ban_duration(&self) -> Option { match self { TransportError::Grpc(status) => status.rate_limit_ban_duration(), @@ -224,53 +218,62 @@ mod tests { assert!(!non_retryable.can_retry()); } + /// `rate_limit_ban_duration` returns `Some` only for `ResourceExhausted` with + /// a parseable positive `ratelimit-reset` header. Every other code returns + /// `None` regardless of headers. #[test] - fn test_tonic_status_is_rate_limited_only_resource_exhausted() { - // Exactly one code is a rate-limit signal. - assert!(dapi_grpc::tonic::Status::new(Code::ResourceExhausted, "429").is_rate_limited()); - - // Every other code — including the *other* retryable ones and - // DeadlineExceeded — must stay false. Widening this set would - // reintroduce the reverted multi-code classifier. - let not_rate_limited = [ + fn test_tonic_status_rate_limit_ban_duration() { + use dapi_grpc::tonic::metadata::MetadataValue; + + // ResourceExhausted + valid header → Some. + let mut status = dapi_grpc::tonic::Status::new(Code::ResourceExhausted, "429"); + status + .metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from("30").unwrap()); + assert_eq!( + status.rate_limit_ban_duration(), + Some(std::time::Duration::from_secs(30)) + ); + + // ResourceExhausted without header → None. + let no_header = dapi_grpc::tonic::Status::new(Code::ResourceExhausted, "429"); + assert!(no_header.rate_limit_ban_duration().is_none()); + + // Non-ResourceExhausted codes → None regardless. + for code in [ Code::Ok, - Code::Cancelled, - Code::Unknown, - Code::InvalidArgument, - Code::DeadlineExceeded, - Code::NotFound, - Code::AlreadyExists, - Code::PermissionDenied, - Code::FailedPrecondition, - Code::Aborted, - Code::OutOfRange, - Code::Unimplemented, - Code::Internal, Code::Unavailable, - Code::DataLoss, - Code::Unauthenticated, - ]; - for code in not_rate_limited { + Code::Internal, + Code::DeadlineExceeded, + ] { + let mut s = dapi_grpc::tonic::Status::new(code, "x"); + s.metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from("30").unwrap()); assert!( - !dapi_grpc::tonic::Status::new(code, "x").is_rate_limited(), - "code {:?} must NOT be classified as rate-limited", - code + s.rate_limit_ban_duration().is_none(), + "code {code:?} must return None" ); } } #[test] - fn test_transport_error_is_rate_limited_delegates() { - let rate_limited = TransportError::Grpc(dapi_grpc::tonic::Status::new( - Code::ResourceExhausted, - "429", - )); - assert!(rate_limited.is_rate_limited()); - // Rate-limited is still retryable; only the ban decision differs. + fn test_transport_error_rate_limit_ban_duration_delegates() { + use dapi_grpc::tonic::metadata::MetadataValue; + + let mut status = dapi_grpc::tonic::Status::new(Code::ResourceExhausted, "429"); + status + .metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from("45").unwrap()); + let rate_limited = TransportError::Grpc(status); + assert_eq!( + rate_limited.rate_limit_ban_duration(), + Some(std::time::Duration::from_secs(45)) + ); + // Still retryable — rate-limit ban duration doesn't affect can_retry. assert!(rate_limited.can_retry()); let unavailable = TransportError::Grpc(dapi_grpc::tonic::Status::unavailable("down")); - assert!(!unavailable.is_rate_limited()); + assert!(unavailable.rate_limit_ban_duration().is_none()); assert!(unavailable.can_retry()); } diff --git a/packages/rs-dapi-client/src/transport/grpc.rs b/packages/rs-dapi-client/src/transport/grpc.rs index 36f113c5527..4253ea3d95a 100644 --- a/packages/rs-dapi-client/src/transport/grpc.rs +++ b/packages/rs-dapi-client/src/transport/grpc.rs @@ -147,20 +147,37 @@ impl CanRetry for dapi_grpc::tonic::Status { ) } - fn is_rate_limited(&self) -> bool { - // ResourceExhausted is the gRPC mapping of an upstream rate-limit / - // backpressure (e.g. Envoy per-IP 429). It is retryable but the node - // is healthy, so it receives a fixed-duration ban rather than the - // exponential health-ban ladder (see `update_address_ban_status`). - // Exactly one code — do NOT widen this. - self.code() == dapi_grpc::tonic::Code::ResourceExhausted - } - + /// Returns the Envoy-advertised ban duration for a `ResourceExhausted` + /// response, or `None` if this is not a rate-limit or carries no usable + /// `RateLimit-Reset` header. + /// + /// Envoy's global rate-limit filter emits `RateLimit-Reset: ` when + /// `LIMIT_RESPONSE_HEADERS_ENABLED=true` is set on the Lyft RLS container + /// (see `packages/dashmate/docker-compose.rate_limiter.yml`). The value is + /// the whole-second count until the per-IP window resets. + /// + /// Parse rules (adversarial-input safe): + /// * Non-`ResourceExhausted` code → `None`. + /// * Header absent, non-numeric, or `0` → `None` (caller uses normal ban + /// ladder). + /// * Valid positive integer → clamped to + /// [`[MIN_RATE_LIMIT_BAN_SECS, MAX_RATE_LIMIT_BAN_SECS]`] + /// (`dapi_client.rs`) and returned as `Some(Duration)`. fn rate_limit_ban_duration(&self) -> Option { - if self.code() != dapi_grpc::tonic::Code::ResourceExhausted { + use crate::dapi_client::{MAX_RATE_LIMIT_BAN_SECS, MIN_RATE_LIMIT_BAN_SECS}; + use dapi_grpc::tonic::Code; + if self.code() != Code::ResourceExhausted { return None; } - crate::rate_limit::rate_limit_ban_duration(self) + let secs = self + .metadata() + .get("ratelimit-reset") + .and_then(|v| v.to_str().ok()) + .and_then(|s| s.trim().parse::().ok()) + .filter(|&s| s > 0)?; + Some(std::time::Duration::from_secs( + secs.clamp(MIN_RATE_LIMIT_BAN_SECS, MAX_RATE_LIMIT_BAN_SECS), + )) } } diff --git a/packages/rs-dapi-client/tests/rate_limit_ban.rs b/packages/rs-dapi-client/tests/rate_limit_ban.rs new file mode 100644 index 00000000000..61a98276dd4 --- /dev/null +++ b/packages/rs-dapi-client/tests/rate_limit_ban.rs @@ -0,0 +1,208 @@ +//! Integration test: `ResourceExhausted` with a `RateLimit-Reset` header causes +//! the node to be banned for that exact period (`ban_for`), while a missing +//! header falls back to the normal exponential health-ban ladder. + +use dapi_grpc::tonic::metadata::MetadataValue; +use rs_dapi_client::transport::{AppliedRequestSettings, TransportError}; +use rs_dapi_client::{ + update_address_ban_status, AddressList, CanRetry, DapiClientError, ExecutionError, + ExecutionResult, +}; +use std::time::Duration; + +fn make_address() -> rs_dapi_client::Address { + "http://127.0.0.1:3000".parse().expect("valid address") +} + +fn applied_settings(ban: bool) -> AppliedRequestSettings { + AppliedRequestSettings { + connect_timeout: None, + timeout: Duration::from_secs(10), + retries: 5, + ban_failed_address: ban, + max_decoding_message_size: None, + #[cfg(not(target_arch = "wasm32"))] + ca_certificate: None, + } +} + +/// `ResourceExhausted` + `ratelimit-reset: 45` → `ban_for` with a ~45s window. +/// `ban_count` must be set to at least 1 (diagnostics) but NOT escalated further. +#[test] +fn test_resource_exhausted_with_header_bans_for_advertised_period() { + let mut address_list = AddressList::new(); + let addr = make_address(); + address_list.add(addr.clone()); + + let mut status = dapi_grpc::tonic::Status::resource_exhausted("429"); + status + .metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from("45").unwrap()); + + let result: ExecutionResult = Err(ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc(status)), + retries: 0, + address: Some(addr.clone()), + }); + + let before = chrono::Utc::now(); + update_address_ban_status(&address_list, &result, &applied_settings(true)); + let after = chrono::Utc::now(); + + let info = address_list.ban_info(); + let entry = info.iter().find(|i| i.uri == addr.to_string()).unwrap(); + + assert!( + entry.banned, + "node must be banned after ResourceExhausted+header" + ); + assert_eq!(entry.ban_count, 1, "ban_for sets ban_count to max(0,1)=1"); + + // Ban window must be approximately 45 s. + let until = entry.banned_until.expect("banned_until must be set"); + let lo = (until - before).num_milliseconds() as f64 / 1000.0; + let hi = (until - after).num_milliseconds() as f64 / 1000.0; + assert!( + lo >= 44.9 && hi <= 45.1, + "ban window must be ~45 s; got lo={lo:.2}s hi={hi:.2}s" + ); +} + +/// Large `ratelimit-reset` values are clamped to MAX_RATE_LIMIT_BAN_SECS (600). +#[test] +fn test_ratelimit_reset_clamped_to_max() { + let mut address_list = AddressList::new(); + let addr = make_address(); + address_list.add(addr.clone()); + + let mut status = dapi_grpc::tonic::Status::resource_exhausted("429"); + status + .metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from("9999").unwrap()); + + let result: ExecutionResult = Err(ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc(status)), + retries: 0, + address: Some(addr.clone()), + }); + + let before = chrono::Utc::now(); + update_address_ban_status(&address_list, &result, &applied_settings(true)); + let after = chrono::Utc::now(); + + let info = address_list.ban_info(); + let entry = info.iter().find(|i| i.uri == addr.to_string()).unwrap(); + + let until = entry.banned_until.expect("banned_until set"); + let lo = (until - before).num_milliseconds() as f64 / 1000.0; + let hi = (until - after).num_milliseconds() as f64 / 1000.0; + // Clamped at 600 s. + assert!( + lo >= 599.5 && hi <= 600.5, + "9999s must be clamped to 600s; got lo={lo:.2} hi={hi:.2}" + ); +} + +/// `ratelimit-reset: 0` or non-numeric → `None` → normal `ban_with_reason` ladder. +#[test] +fn test_zero_and_garbage_header_falls_back_to_ladder() { + for bad in &["0", "garbage", ""] { + let mut address_list = AddressList::new(); + let addr = make_address(); + address_list.add(addr.clone()); + + let mut status = dapi_grpc::tonic::Status::resource_exhausted("429"); + if !bad.is_empty() { + status + .metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from(*bad).unwrap()); + } + + let result: ExecutionResult = Err(ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc(status)), + retries: 0, + address: Some(addr.clone()), + }); + update_address_ban_status(&address_list, &result, &applied_settings(true)); + + let info = address_list.ban_info(); + let entry = info.iter().find(|i| i.uri == addr.to_string()).unwrap(); + assert!( + entry.banned, + "bad header '{bad}' must still result in a ban via the ladder" + ); + assert_eq!( + entry.ban_count, 1, + "ladder ban → ban_count = 1 for header '{bad}'" + ); + } +} + +/// Missing `ratelimit-reset` header → `None` → normal exponential health-ban ladder. +#[test] +fn test_missing_header_falls_back_to_ladder() { + let mut address_list = AddressList::new(); + let addr = make_address(); + address_list.add(addr.clone()); + + let result: ExecutionResult = Err(ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc( + dapi_grpc::tonic::Status::resource_exhausted("429"), + )), + retries: 0, + address: Some(addr.clone()), + }); + update_address_ban_status(&address_list, &result, &applied_settings(true)); + + let info = address_list.ban_info(); + let entry = info.iter().find(|i| i.uri == addr.to_string()).unwrap(); + assert!( + entry.banned, + "missing header must still result in a ladder ban" + ); + assert_eq!(entry.ban_count, 1, "first ladder ban → ban_count = 1"); +} + +/// `rate_limit_ban_duration` on `CanRetry` returns `Some` only for +/// `ResourceExhausted` with a parseable positive `ratelimit-reset`. +#[test] +fn test_rate_limit_ban_duration_trait_delegation() { + use rs_dapi_client::ExecutionError; + + // With header → Some(45s). + let mut s = dapi_grpc::tonic::Status::resource_exhausted("429"); + s.metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from("45").unwrap()); + let te = TransportError::Grpc(s); + assert_eq!(te.rate_limit_ban_duration(), Some(Duration::from_secs(45))); + + // Unavailable → None. + let unavail = TransportError::Grpc(dapi_grpc::tonic::Status::unavailable("down")); + assert!(unavail.rate_limit_ban_duration().is_none()); + + // ResourceExhausted without header → None. + let re_no_header = TransportError::Grpc(dapi_grpc::tonic::Status::resource_exhausted("429")); + assert!(re_no_header.rate_limit_ban_duration().is_none()); + + // DapiClientError delegates. + let dce = DapiClientError::Transport(TransportError::Grpc({ + let mut s2 = dapi_grpc::tonic::Status::resource_exhausted("429"); + s2.metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from("30").unwrap()); + s2 + })); + assert_eq!(dce.rate_limit_ban_duration(), Some(Duration::from_secs(30))); + + // ExecutionError delegates. + let ee: ExecutionError = ExecutionError { + inner: DapiClientError::Transport(TransportError::Grpc({ + let mut s3 = dapi_grpc::tonic::Status::resource_exhausted("429"); + s3.metadata_mut() + .insert("ratelimit-reset", MetadataValue::try_from("20").unwrap()); + s3 + })), + retries: 0, + address: Some(make_address()), + }; + assert_eq!(ee.rate_limit_ban_duration(), Some(Duration::from_secs(20))); +} diff --git a/packages/rs-sdk/src/error.rs b/packages/rs-sdk/src/error.rs index a523271cc49..64be94bc53e 100644 --- a/packages/rs-sdk/src/error.rs +++ b/packages/rs-sdk/src/error.rs @@ -353,29 +353,6 @@ impl CanRetry for Error { | Error::DapiClientError(DapiClientError::NoAvailableAddressesToRetry(_)) ) } - - /// Delegate rate-limit classification to the wrapped [`DapiClientError`]. - /// - /// When a `ResourceExhausted` status carries a `google.rpc.RetryInfo` hint, - /// `update_address_ban_status` applies a server-dictated ban duration rather - /// than the exponential health-ban ladder. Delegating here keeps the SDK - /// layer transparent to that mechanism. - fn is_rate_limited(&self) -> bool { - matches!(self, Error::DapiClientError(inner) if inner.is_rate_limited()) - } - - /// Delegate the server-dictated ban duration to the wrapped error. - /// - /// Returns the `google.rpc.RetryInfo` retry delay when the inner transport - /// error is `ResourceExhausted` and the status carries that detail; `None` - /// otherwise (caller should fall back to `DAPI_RATE_LIMIT_BAN_MS`). - fn rate_limit_ban_duration(&self) -> Option { - if let Error::DapiClientError(inner) = self { - inner.rate_limit_ban_duration() - } else { - None - } - } } /// Server returned stale metadata @@ -589,66 +566,4 @@ mod tests { assert!(super::extract_drive_error_message(&payload).is_none()); } } - - /// Regression: the SDK `Error` must propagate the wrapped - /// `DapiClientError`'s rate-limit classification so a throttled node is - /// banned for the server-dictated window rather than the health-ban ladder. - /// Locks the delegation so a future refactor cannot silently drop it. - #[test] - fn test_is_rate_limited_delegates_to_inner_dapi_client_error() { - // A ResourceExhausted transport error wrapped by the SDK is rate-limited. - let rate_limited: Error = DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::resource_exhausted("429"), - )) - .into(); - assert!( - rate_limited.is_rate_limited(), - "SDK Error must delegate rate-limit classification to the wrapped DapiClientError" - ); - - // A non-rate-limited transport error must not be classified as such. - let unavailable: Error = DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::unavailable("down"), - )) - .into(); - assert!(!unavailable.is_rate_limited()); - - // Non-DAPI errors fall back to the trait default (false). - assert!(!Error::Config("misconfigured".to_string()).is_rate_limited()); - } - - #[test] - fn test_rate_limit_ban_duration_delegates_to_inner() { - // Non-rate-limited transport error → always None. - let unavailable: Error = DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::unavailable("down"), - )) - .into(); - assert!( - unavailable.rate_limit_ban_duration().is_none(), - "non-rate-limited error must not have a ban duration" - ); - - // Non-DAPI error → None (default impl). - assert!( - Error::Config("bad".into()) - .rate_limit_ban_duration() - .is_none(), - "non-DAPI SDK errors must return None" - ); - - // ResourceExhausted with no RetryInfo → None (caller uses fallback). - // This validates the delegation path without requiring the internal - // test helper from rs-dapi-client; the RetryInfo extraction is tested - // exhaustively in rs-dapi-client's rate_limit module unit tests. - let rate_limited: Error = DapiClientError::Transport(TransportError::Grpc( - dapi_grpc::tonic::Status::resource_exhausted("429"), - )) - .into(); - // No RetryInfo attached → delegation returns None (fallback will be used). - assert!( - rate_limited.rate_limit_ban_duration().is_none(), - "ResourceExhausted without RetryInfo must return None (caller uses fallback)" - ); - } } From dade457a2ee45f99c5d923387ebb2b7a888ab57f Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:30:55 +0200 Subject: [PATCH 6/6] test(rs-dapi-client): apply QA-001..005 doc-accuracy and test-honesty fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QA-001: Correct `AddressStatus::ban_for` / `AddressList::ban_for` docstrings — the "no ladder inflation" claim was false for fresh nodes (ban_count 0 → 1 raises the next genuine health ban from ~60 s to ~163 s). Docs now describe the floor/side-effect accurately. New test pins the 0→1 ladder-floor behaviour. QA-002: Fix `MIN_RATE_LIMIT_BAN_SECS` comment — the `> 0` filter before the clamp already rejects 0, so MIN=1 is a documentation/intent marker, not an active lower clamp. QA-003: De-tautologize "bad/zero/missing header → ladder" tests. Previous assertions (ban_count == 1) passed for BOTH ban_for and ban_with_reason. Now also assert the banned_until window is ~60 s (first ladder rung), which would fail if a bad header were wrongly routed to ban_for. QA-004: Add clamp-edge coverage: 1 → 1 s, 600 → 600 s, 601 → 600 s (was only 45 and 9999). QA-005: Add test that a ban_for'd address re-enters rotation via get_live_address after its window expires (zero-duration re-ban simulates an expired window). Co-Authored-By: Claude Sonnet 4.6 --- packages/rs-dapi-client/src/address_list.rs | 83 +++++++++++++++++-- packages/rs-dapi-client/src/dapi_client.rs | 30 ++++++- .../rs-dapi-client/tests/rate_limit_ban.rs | 49 +++++++++++ 3 files changed, 152 insertions(+), 10 deletions(-) diff --git a/packages/rs-dapi-client/src/address_list.rs b/packages/rs-dapi-client/src/address_list.rs index 44ff2a16cae..2b17a27212f 100644 --- a/packages/rs-dapi-client/src/address_list.rs +++ b/packages/rs-dapi-client/src/address_list.rs @@ -106,11 +106,13 @@ impl AddressStatus { /// Ban the address for an exact `period` (server-advertised), bypassing the /// exponential ladder used by [`AddressStatus::ban_with_reason`]. /// - /// Sets `ban_count = max(ban_count, 1)` so that diagnostics - /// (`is_banned()`, `ban_info()`) correctly report the node as banned, but - /// the genuine-failure exponential ladder is not inflated — a subsequent - /// health failure still escalates from the existing `ban_count`, not from an - /// artificially incremented one. + /// The ban window is flat (not exponential), but `ban_count` is raised to + /// `max(ban_count, 1)` so that `is_banned()` and `ban_info()` correctly + /// report the node as banned. Side-effect: a previously-clean node + /// (ban_count 0) enters the ladder at floor 1, meaning its *next* genuine + /// health failure via [`AddressStatus::ban_with_reason`] uses + /// `60 s × e¹ ≈ 163 s` rather than the first-rung `60 s × e⁰ = 60 s`. + /// The counter resets to 0 on [`AddressStatus::unban`]. pub fn ban_for(&mut self, period: Duration, reason: Option) { self.banned_until = Some(chrono::Utc::now() + period); self.ban_count = self.ban_count.max(1); @@ -196,8 +198,9 @@ impl AddressList { true } - /// Ban the address for an exact `period` (server-advertised). - /// See [`AddressStatus::ban_for`] for the full contract. + /// Ban the address for an exact `period` (server-advertised); delegates to + /// [`AddressStatus::ban_for`] — see that method for the full contract + /// including the `ban_count` floor and ladder side-effect. /// /// Returns `false` if the address is not in the list. pub fn ban_for(&self, address: &Address, period: Duration, reason: Option) -> bool { @@ -815,6 +818,44 @@ mod tests { assert_eq!(status.ban_reason.as_deref(), Some("rate limited")); } + /// `ban_for` on a fresh node (ban_count = 0) raises ban_count to 1 (the + /// ladder floor). That means the *next* genuine health ban will escalate + /// from position 1 (~163 s) instead of position 0 (~60 s). This pins the + /// documented side-effect so regressions are caught. + #[test] + fn test_ban_for_raises_fresh_node_to_ladder_floor() { + let mut status = AddressStatus::default(); + assert_eq!(status.ban_count, 0, "starts clean"); + + // Rate-limit ban on a never-before-banned node. + status.ban_for(Duration::from_secs(10), Some("rl".into())); + assert_eq!( + status.ban_count, 1, + "ban_for must raise ban_count 0 → 1 (ladder floor)" + ); + + // Subsequent genuine health failure must escalate from the floor (1), + // yielding ~60 s × e^1 ≈ 163 s, NOT the first-rung ~60 s × e^0 = 60 s. + let base = Duration::from_secs(60); + let before = chrono::Utc::now(); + status.ban_with_reason(&base, None); // ban_count 1 → 2; window = 60s × e^1 + let after = chrono::Utc::now(); + assert_eq!(status.ban_count, 2); + + let until = status.banned_until.expect("banned_until set"); + let lo = (until - before).num_milliseconds() as f64 / 1000.0; + let hi = (until - after).num_milliseconds() as f64 / 1000.0; + let expected = 60.0_f64 * std::f64::consts::E; // ≈ 163 s + assert!( + lo >= expected - 0.5, + "window lower {lo:.1}s < expected {expected:.1}s (should escalate from floor 1)" + ); + assert!( + hi <= expected + 0.5, + "window upper {hi:.1}s > expected {expected:.1}s" + ); + } + #[test] fn test_address_status_ban_for_does_not_inflate_existing_ban_count() { // A node already health-banned (ban_count = 3) gets rate-limited. @@ -854,6 +895,34 @@ mod tests { assert_eq!(info[0].ban_count, 1); } + /// After `ban_for`'s window expires the address re-enters rotation via + /// `get_live_address`. We verify both directions: the node is hidden during + /// an active window, and becomes live once the window passes. + /// + /// A zero-duration window means `banned_until = Utc::now()` at call time; + /// `get_live_address` samples `Utc::now()` fresh, so at least one clock tick + /// separates the two calls and `banned_until < new_now` holds. + #[test] + fn test_ban_for_address_re_enters_rotation_after_window_expires() { + let mut list = AddressList::new(); + let addr: Address = "http://127.0.0.1:3000".parse().unwrap(); + list.add(addr.clone()); + + // Active 300-second window → node hidden. + assert!(list.ban_for(&addr, Duration::from_secs(300), Some("rl".into()))); + assert!( + list.get_live_address().is_none(), + "node must be hidden during active ban window" + ); + + // Re-ban with a zero window (already expired when get_live_address runs). + assert!(list.ban_for(&addr, Duration::ZERO, None)); + assert!( + list.get_live_address().is_some(), + "address must re-enter rotation after ban_for window expires" + ); + } + /// Invariant 1 at the ladder source: the exponential ban window is /// `base × e^ban_count`, `ban_count` incrementing on each ban. This pins the /// exact formula independently of the `update_address_ban_status` entrypoint. diff --git a/packages/rs-dapi-client/src/dapi_client.rs b/packages/rs-dapi-client/src/dapi_client.rs index ad73fa7eecc..1d0138747f8 100644 --- a/packages/rs-dapi-client/src/dapi_client.rs +++ b/packages/rs-dapi-client/src/dapi_client.rs @@ -18,8 +18,11 @@ use crate::{ RequestSettings, }; -/// Floor for the Envoy-advertised `RateLimit-Reset` ban duration. -/// Prevents re-ban thrash when the header carries a value of 0 or 1. +/// Intended minimum for the Envoy-advertised `RateLimit-Reset` ban duration. +/// Note: the `> 0` filter applied before the clamp already rejects 0 → `None`, +/// so this constant never actively clamps the lower bound — it documents intent +/// (the smallest meaningful reset is 1 s) and acts as the `.clamp(MIN, MAX)` +/// lower argument for clarity. pub(crate) const MIN_RATE_LIMIT_BAN_SECS: u64 = 1; /// Ceiling for the Envoy-advertised `RateLimit-Reset` ban duration. /// Prevents a misconfigured or hostile header from parking a healthy node for @@ -337,7 +340,28 @@ mod tests { let dur = TransportError::Grpc(s).rate_limit_ban_duration(); assert_eq!(dur, Some(Duration::from_secs(MAX_RATE_LIMIT_BAN_SECS))); - // Value below MIN (0) → no header → None. + // Clamp edge: exactly MIN (1) → 1 s (passes through unchanged). + let s = make_rl_status(Some("1")); + assert_eq!( + TransportError::Grpc(s).rate_limit_ban_duration(), + Some(Duration::from_secs(1)) + ); + + // Clamp edge: exactly MAX (600) → 600 s (not clamped). + let s = make_rl_status(Some("600")); + assert_eq!( + TransportError::Grpc(s).rate_limit_ban_duration(), + Some(Duration::from_secs(600)) + ); + + // One above MAX (601) → clamped to 600 s. + let s = make_rl_status(Some("601")); + assert_eq!( + TransportError::Grpc(s).rate_limit_ban_duration(), + Some(Duration::from_secs(600)) + ); + + // Value below MIN (0) → filtered to None before clamp. let s = make_rl_status(Some("0")); assert!(TransportError::Grpc(s).rate_limit_ban_duration().is_none()); diff --git a/packages/rs-dapi-client/tests/rate_limit_ban.rs b/packages/rs-dapi-client/tests/rate_limit_ban.rs index 61a98276dd4..266db5044eb 100644 --- a/packages/rs-dapi-client/tests/rate_limit_ban.rs +++ b/packages/rs-dapi-client/tests/rate_limit_ban.rs @@ -104,8 +104,16 @@ fn test_ratelimit_reset_clamped_to_max() { } /// `ratelimit-reset: 0` or non-numeric → `None` → normal `ban_with_reason` ladder. +/// +/// The key assertion is the resulting `banned_until` window: the ladder's first +/// ban is `60 s × e^0 = 60 s`, **not** some header-derived value. Checking only +/// `ban_count == 1` would pass even if the wrong path (ban_for) were taken. #[test] fn test_zero_and_garbage_header_falls_back_to_ladder() { + // Default AddressList uses DEFAULT_BASE_BAN_PERIOD = 60 s. + // First ladder ban: coefficient = e^0 = 1.0, window = 60 s. + const EXPECTED_WINDOW_SECS: f64 = 60.0; + for bad in &["0", "garbage", ""] { let mut address_list = AddressList::new(); let addr = make_address(); @@ -123,10 +131,14 @@ fn test_zero_and_garbage_header_falls_back_to_ladder() { retries: 0, address: Some(addr.clone()), }); + + let before = chrono::Utc::now(); update_address_ban_status(&address_list, &result, &applied_settings(true)); + let after = chrono::Utc::now(); let info = address_list.ban_info(); let entry = info.iter().find(|i| i.uri == addr.to_string()).unwrap(); + assert!( entry.banned, "bad header '{bad}' must still result in a ban via the ladder" @@ -135,10 +147,31 @@ fn test_zero_and_garbage_header_falls_back_to_ladder() { entry.ban_count, 1, "ladder ban → ban_count = 1 for header '{bad}'" ); + + // The ban window must be the exponential ladder's first rung (~60 s), + // NOT a header-derived value. This assertion fails if ban_for were + // mistakenly called instead of ban_with_reason. + let until = entry + .banned_until + .expect("banned_until set for header '{bad}'"); + let lo = (until - before).num_milliseconds() as f64 / 1000.0; + let hi = (until - after).num_milliseconds() as f64 / 1000.0; + assert!( + lo >= EXPECTED_WINDOW_SECS - 0.5, + "bad header '{bad}': ban window lower {lo:.1}s < expected ~{EXPECTED_WINDOW_SECS}s (ladder path)" + ); + assert!( + hi <= EXPECTED_WINDOW_SECS + 0.5, + "bad header '{bad}': ban window upper {hi:.1}s > expected ~{EXPECTED_WINDOW_SECS}s (should be ladder, not ban_for)" + ); } } /// Missing `ratelimit-reset` header → `None` → normal exponential health-ban ladder. +/// +/// Asserts the `banned_until` window is ~60 s (first ladder rung), NOT a +/// header-derived value. A mere `ban_count == 1` check would pass even if +/// `ban_for` were wrongly invoked (both paths yield ban_count 1 on first ban). #[test] fn test_missing_header_falls_back_to_ladder() { let mut address_list = AddressList::new(); @@ -152,7 +185,10 @@ fn test_missing_header_falls_back_to_ladder() { retries: 0, address: Some(addr.clone()), }); + + let before = chrono::Utc::now(); update_address_ban_status(&address_list, &result, &applied_settings(true)); + let after = chrono::Utc::now(); let info = address_list.ban_info(); let entry = info.iter().find(|i| i.uri == addr.to_string()).unwrap(); @@ -161,6 +197,19 @@ fn test_missing_header_falls_back_to_ladder() { "missing header must still result in a ladder ban" ); assert_eq!(entry.ban_count, 1, "first ladder ban → ban_count = 1"); + + // Window must be the first exponential rung: 60 s × e^0 = 60 s. + let until = entry.banned_until.expect("banned_until set"); + let lo = (until - before).num_milliseconds() as f64 / 1000.0; + let hi = (until - after).num_milliseconds() as f64 / 1000.0; + assert!( + lo >= 59.5, + "ladder window lower {lo:.1}s < expected ~60 s (missing header must use ladder)" + ); + assert!( + hi <= 60.5, + "ladder window upper {hi:.1}s > expected ~60 s (should be ladder, not ban_for)" + ); } /// `rate_limit_ban_duration` on `CanRetry` returns `Some` only for