From adb6fbb16e9a9b88521ece2429d5a1f377891f9f Mon Sep 17 00:00:00 2001
From: Juan Sugg <juanpedrosugg@gmail.com>
Date: Wed, 17 Jun 2026 15:33:07 -0300
Subject: [PATCH] fix(ci): ride out transient registry blips in nightly image
 pulls

The Linux nightly fresh-host job failed intermittently when 'docker pull' could not reach registry-1.docker.io ('context deadline exceeded' / 'request canceled while waiting for connection'). These registry network timeouts were not classified as transient, so the pull helper retried them with the short generic backoff (2s, 4s) and burned its retry budget within seconds.

Classify registry connectivity timeouts as a distinct transient failure and apply a longer, capped exponential backoff (10s, 20s, 40s, 45s) so a brief upstream outage is ridden out across the retry budget. Raise the Linux fresh-host pull attempt budget from 3 to 5 (i.e. up to four retries, one per backoff step above) so the backoff has room to work. Add unit coverage for the new backoff schedule.
---
 .github/workflows/fresh-host-core.yml        |  3 +-
 tests/suites/unit/ci/test_hosted_docker.py   | 67 ++++++++++++++++++++
 tests/utils/helpers/_hosted_docker/images.py | 42 +++++++++++-
 tests/utils/helpers/_hosted_docker/models.py |  8 +++
 4 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/fresh-host-core.yml b/.github/workflows/fresh-host-core.yml
index d081fca..d3b6fa2 100644
--- a/.github/workflows/fresh-host-core.yml
+++ b/.github/workflows/fresh-host-core.yml
@@ -22,7 +22,8 @@ permissions:
 env:
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
   FRESH_HOST_DOCKER_PULL_PARALLELISM: "4"
-  FRESH_HOST_DOCKER_PULL_MAX_ATTEMPTS: "3"
+  # 5 attempts (4 retries) let registry backoff survive transient Docker Hub issues
+  FRESH_HOST_DOCKER_PULL_MAX_ATTEMPTS: "5"
   FRESH_HOST_CACHE_ROOT: ${{ github.workspace }}/.github-cache
   UV_CACHE_DIR: ${{ github.workspace }}/.github-cache/uv
   npm_config_cache: ${{ github.workspace }}/.github-cache/npm
diff --git a/tests/suites/unit/ci/test_hosted_docker.py b/tests/suites/unit/ci/test_hosted_docker.py
index dafd36a..b619d91 100644
--- a/tests/suites/unit/ci/test_hosted_docker.py
+++ b/tests/suites/unit/ci/test_hosted_docker.py
@@ -238,6 +238,73 @@ def fake_log(message: str) -> None:
     )
 
 
+def test_pull_images_uses_extended_backoff_for_transient_registry_failure(
+    tmp_path: Path,
+    test_context: TestContext,
+) -> None:
+    """Transient registry timeouts should retry with a longer, capped exponential backoff."""
+    attempts: dict[str, int] = {"qdrant/qdrant:v1": 0}
+    sleeps: list[float] = []
+    logs: list[str] = []
+
+    def fake_pull_one_image(image: str, timeout_seconds: int) -> tuple[str, int, float, str]:
+        assert timeout_seconds == hosted_docker.DEFAULT_DOCKER_PULL_TIMEOUT_SECONDS
+        attempts[image] += 1
+        if attempts[image] <= 4:
+            return (
+                image,
+                1,
+                15.0,
+                (
+                    'Error response from daemon: Get "https://registry-1.docker.io/v2/": '
+                    "context deadline exceeded (Client.Timeout exceeded while awaiting headers)"
+                ),
+            )
+        return image, 0, 0.1, ""
+
+    def fake_wait_for_docker_ready(
+        *,
+        cwd: Path,
+        env: dict[str, str],
+        max_attempts: int = 60,
+    ) -> None:
+        del cwd, env, max_attempts
+
+    def capture_sleep(seconds: float) -> None:
+        sleeps.append(seconds)
+
+    def fake_log(message: str) -> None:
+        logs.append(message)
+
+    test_context.patch.patch_object(hosted_docker_images, "pull_one_image", new=fake_pull_one_image)
+    test_context.patch.patch_object(
+        hosted_docker_images,
+        "wait_for_docker_ready",
+        new=fake_wait_for_docker_ready,
+    )
+    test_context.patch.patch_object(hosted_docker_images, "log", new=fake_log)
+    test_context.patch.patch_object(hosted_docker_images.time, "sleep", new=capture_sleep)
+
+    report = hosted_docker.pull_images(
+        ["qdrant/qdrant:v1"],
+        parallelism=2,
+        max_attempts=5,
+        recovery_cwd=tmp_path,
+        recovery_env={"DOCKER_HOST": "unix:///tmp/docker.sock"},
+    )
+
+    assert report.exit_code == 0
+    assert report.attempt_count == 5
+    assert report.retried_images == ["qdrant/qdrant:v1"]
+    # Delays double from the 10s base (10, 20, 40); the fourth would be 80s but is capped at
+    # 45s — far longer than the 2s/4s generic backoff, so a brief Docker Hub outage is ridden out.
+    assert sleeps == [10, 20, 40, 45]
+    assert any(
+        "Detected transient registry connectivity failure; backing off before retry." in entry
+        for entry in logs
+    )
+
+
 def test_pull_images_requires_recovery_cwd_and_env_together() -> None:
     """Pull recovery wiring should reject partial recovery configuration."""
     with pytest.raises(fresh_host.FreshHostError, match="recovery_cwd and recovery_env"):
diff --git a/tests/utils/helpers/_hosted_docker/images.py b/tests/utils/helpers/_hosted_docker/images.py
index 5cfb721..f94fe3f 100644
--- a/tests/utils/helpers/_hosted_docker/images.py
+++ b/tests/utils/helpers/_hosted_docker/images.py
@@ -18,6 +18,9 @@
 from tests.utils.helpers._hosted_docker.models import (
     DEFAULT_DOCKER_PULL_TIMEOUT_SECONDS,
     PULL_HEARTBEAT_SECONDS,
+    PULL_RETRY_BACKOFF_CAP_SECONDS,
+    REGISTRY_RETRY_BACKOFF_BASE_SECONDS,
+    REGISTRY_RETRY_BACKOFF_CAP_SECONDS,
     ImageEnsureReport,
     PullReport,
 )
@@ -33,6 +36,17 @@
     "error during connect",
     "unexpected eof",
 )
+# Lowercase markers of transient registry errors (not daemon failures); retry with longer backoff
+REGISTRY_TRANSIENT_FAILURE_MARKERS: Final[tuple[str, ...]] = (
+    "context deadline exceeded",
+    "request canceled while waiting for connection",
+    "client.timeout exceeded while awaiting headers",
+    "timeout exceeded while awaiting headers",
+    "i/o timeout",
+    "tls handshake timeout",
+    "temporary failure in name resolution",
+    "no such host",
+)
 
 
 def compose_probe_env(
@@ -128,6 +142,24 @@ def _is_daemon_connectivity_failure(output: str) -> bool:
     return "docker.sock" in lowered and "eof" in lowered
 
 
+def _is_transient_registry_failure(output: str) -> bool:
+    """Return whether pull output contains a transient registry marker (case-insensitive)."""
+    lowered = output.lower()
+    return any(marker in lowered for marker in REGISTRY_TRANSIENT_FAILURE_MARKERS)
+
+
+def _retry_backoff_seconds(*, attempt_count: int, registry_failure: bool) -> int:
+    """Return the number of seconds to sleep before the next pull attempt.
+
+    Registry failures double the base delay per attempt already made, up to the registry cap,
+    riding out brief upstream outages; other failures sleep 2s per attempt up to the generic cap.
+    """
+    if registry_failure:
+        growth = REGISTRY_RETRY_BACKOFF_BASE_SECONDS << max(0, attempt_count - 1)
+        return min(REGISTRY_RETRY_BACKOFF_CAP_SECONDS, growth)
+    return min(PULL_RETRY_BACKOFF_CAP_SECONDS, 2 * attempt_count)
+
+
 def pull_images(
     images: Sequence[str],
     *,
@@ -156,6 +188,7 @@ def pull_images(
         )
         failures: list[str] = []
         daemon_connectivity_failure = False
+        transient_registry_failure = False
         with concurrent.futures.ThreadPoolExecutor(
             max_workers=min(attempt_parallelism, len(outstanding))
         ) as executor:
@@ -184,6 +217,8 @@ def pull_images(
                         print(output, flush=True)
                         if _is_daemon_connectivity_failure(output):
                             daemon_connectivity_failure = True
+                        elif _is_transient_registry_failure(output):
+                            transient_registry_failure = True
                     failures.append(image)
         outstanding = failures
         if not outstanding or attempt_count >= max_attempts:
@@ -195,6 +230,8 @@ def pull_images(
         if recovery_cwd is not None and recovery_env is not None:
             if daemon_connectivity_failure:
                 log("Detected Docker daemon connectivity failure; waiting for runtime recovery.")
+            elif transient_registry_failure:
+                log("Detected transient registry connectivity failure; backing off before retry.")
             else:
                 log("Retrying image pull after failure; probing Docker runtime before retry.")
             try:
@@ -212,7 +249,10 @@ def pull_images(
         if next_parallelism != attempt_parallelism:
             log(f"Reducing retry parallelism {attempt_parallelism}->{next_parallelism}.")
         attempt_parallelism = next_parallelism
-        backoff_seconds = min(10, 2 * attempt_count)
+        backoff_seconds = _retry_backoff_seconds(
+            attempt_count=attempt_count,
+            registry_failure=transient_registry_failure and not daemon_connectivity_failure,
+        )
         log(f"Retrying {len(outstanding)} image(s) after {backoff_seconds}s.")
         time.sleep(backoff_seconds)
     return PullReport(
diff --git a/tests/utils/helpers/_hosted_docker/models.py b/tests/utils/helpers/_hosted_docker/models.py
index 3848588..889d45e 100644
--- a/tests/utils/helpers/_hosted_docker/models.py
+++ b/tests/utils/helpers/_hosted_docker/models.py
@@ -8,6 +8,14 @@
 LOG_PREFIX: Final[str] = "[hosted-docker]"
 DEFAULT_DOCKER_PULL_TIMEOUT_SECONDS: Final[int] = 1800
 PULL_HEARTBEAT_SECONDS: Final[int] = 30
+# Cap on the generic 2s-per-attempt pull backoff (daemon hiccups, transient tooling errors).
+PULL_RETRY_BACKOFF_CAP_SECONDS: Final[int] = 10
+# Transient Docker registry connectivity failures (e.g. Docker Hub "context deadline
+# exceeded" while reaching registry-1.docker.io) back off from BASE, doubling per attempt
+# up to CAP, so a brief registry outage is ridden out across the bounded retry budget
+# instead of burning every attempt within a few seconds.
+REGISTRY_RETRY_BACKOFF_BASE_SECONDS: Final[int] = 10
+REGISTRY_RETRY_BACKOFF_CAP_SECONDS: Final[int] = 45
 
 
 @dataclass(slots=True)