From f76fa7fbc011f4c3498d0c90b80b4ecd38edabbf Mon Sep 17 00:00:00 2001 From: Nick Marden Date: Tue, 16 Jun 2026 12:07:42 -0400 Subject: [PATCH 1/2] fix: ensure recovery loop can reclaim stuck messages before expiry DefaultWebhookExpiry was 30s but pendingIdleTimeout was 60s, meaning messages stuck as pending after a server pod restart would always expire before the recovery loop (which runs every 30s) could reclaim them. Fix the invariant: DefaultWebhookExpiry (2m) > pendingIdleTimeout (30s) + defaultRecoveryInterval (30s), so stuck messages are reclaimed with ~60s to spare. --- internal/relay/manager.go | 6 ++++-- internal/relay/redis_manager.go | 2 +- internal/relay/redis_manager_test.go | 6 +++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/internal/relay/manager.go b/internal/relay/manager.go index cf81502..162768b 100644 --- a/internal/relay/manager.go +++ b/internal/relay/manager.go @@ -17,8 +17,10 @@ var ( ) const ( - // DefaultWebhookExpiry is the default time after which a queued webhook expires - DefaultWebhookExpiry = 30 * time.Second + // DefaultWebhookExpiry is the default time after which a queued webhook expires. + // Must be greater than pendingIdleTimeout + defaultRecoveryInterval so that the + // recovery loop can reclaim stuck messages before they expire. 
+ DefaultWebhookExpiry = 2 * time.Minute ) // Webhook represents a webhook request to be delivered via relay diff --git a/internal/relay/redis_manager.go b/internal/relay/redis_manager.go index 5ace163..2b22741 100644 --- a/internal/relay/redis_manager.go +++ b/internal/relay/redis_manager.go @@ -35,7 +35,7 @@ const ( // Recovery settings defaultRecoveryInterval = 30 * time.Second // How often to check for stuck messages - pendingIdleTimeout = 60 * time.Second // How long a message can be pending before reclaim + pendingIdleTimeout = 30 * time.Second // How long a message can be pending before reclaim maxDeliveryAttempts = 3 // Max retries before dead letter ) diff --git a/internal/relay/redis_manager_test.go b/internal/relay/redis_manager_test.go index 2e0feb9..f377562 100644 --- a/internal/relay/redis_manager_test.go +++ b/internal/relay/redis_manager_test.go @@ -908,9 +908,9 @@ func TestRedisManager_Deliver_SetsExpiry(t *testing.T) { if webhook.ExpiresAt == 0 { t.Error("expected ExpiresAt to be set") } - // Should be about 30 seconds in the future - expectedMin := time.Now().Add(25 * time.Second).Unix() - expectedMax := time.Now().Add(35 * time.Second).Unix() + // Should be about 2 minutes in the future + expectedMin := time.Now().Add(115 * time.Second).Unix() + expectedMax := time.Now().Add(125 * time.Second).Unix() if webhook.ExpiresAt < expectedMin || webhook.ExpiresAt > expectedMax { t.Errorf("ExpiresAt %d not in expected range [%d, %d]", webhook.ExpiresAt, expectedMin, expectedMax) } From 7786b5e6eb4e4349e36078e1e25aff277b72915a Mon Sep 17 00:00:00 2001 From: Nick Marden Date: Tue, 16 Jun 2026 13:30:22 -0400 Subject: [PATCH 2/2] docs: add CHANGELOG entry for relay recovery expiry fix --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ad70b1..777eebe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] 
+### Fixed +- Relay recovery loop could not reclaim stuck messages after a pod restart because `DefaultWebhookExpiry` (30s) was shorter than `pendingIdleTimeout` (60s). Fixed by raising `DefaultWebhookExpiry` to 2 minutes and lowering `pendingIdleTimeout` to 30s, ensuring the recovery loop always has time to reclaim a stuck message before it expires. + ## [0.2.13] - 2026-06-11 ### Changed