diff --git a/.github/workflows/fresh-host-core.yml b/.github/workflows/fresh-host-core.yml index d081fca..d3b6fa2 100644 --- a/.github/workflows/fresh-host-core.yml +++ b/.github/workflows/fresh-host-core.yml @@ -22,7 +22,8 @@ permissions: env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true" FRESH_HOST_DOCKER_PULL_PARALLELISM: "4" - FRESH_HOST_DOCKER_PULL_MAX_ATTEMPTS: "3" + # 5 attempts (up to 4 retries) let the registry backoff ride out transient Docker Hub issues + FRESH_HOST_DOCKER_PULL_MAX_ATTEMPTS: "5" FRESH_HOST_CACHE_ROOT: ${{ github.workspace }}/.github-cache UV_CACHE_DIR: ${{ github.workspace }}/.github-cache/uv npm_config_cache: ${{ github.workspace }}/.github-cache/npm diff --git a/tests/suites/unit/ci/test_hosted_docker.py b/tests/suites/unit/ci/test_hosted_docker.py index dafd36a..b619d91 100644 --- a/tests/suites/unit/ci/test_hosted_docker.py +++ b/tests/suites/unit/ci/test_hosted_docker.py @@ -238,6 +238,73 @@ def fake_log(message: str) -> None: ) +def test_pull_images_uses_extended_backoff_for_transient_registry_failure( + tmp_path: Path, + test_context: TestContext, +) -> None: + """Transient registry timeouts should retry with a longer, capped exponential backoff.""" + attempts: dict[str, int] = {"qdrant/qdrant:v1": 0} + sleeps: list[float] = [] + logs: list[str] = [] + + def fake_pull_one_image(image: str, timeout_seconds: int) -> tuple[str, int, float, str]: + assert timeout_seconds == hosted_docker.DEFAULT_DOCKER_PULL_TIMEOUT_SECONDS + attempts[image] += 1 + if attempts[image] <= 4: + return ( + image, + 1, + 15.0, + ( + 'Error response from daemon: Get "https://registry-1.docker.io/v2/": ' + "context deadline exceeded (Client.Timeout exceeded while awaiting headers)" + ), + ) + return image, 0, 0.1, "" + + def fake_wait_for_docker_ready( + *, + cwd: Path, + env: dict[str, str], + max_attempts: int = 60, + ) -> None: + del cwd, env, max_attempts + + def capture_sleep(seconds: float) -> None: + sleeps.append(seconds) + + def fake_log(message: str) -> None: + 
logs.append(message) + + test_context.patch.patch_object(hosted_docker_images, "pull_one_image", new=fake_pull_one_image) + test_context.patch.patch_object( + hosted_docker_images, + "wait_for_docker_ready", + new=fake_wait_for_docker_ready, + ) + test_context.patch.patch_object(hosted_docker_images, "log", new=fake_log) + test_context.patch.patch_object(hosted_docker_images.time, "sleep", new=capture_sleep) + + report = hosted_docker.pull_images( + ["qdrant/qdrant:v1"], + parallelism=2, + max_attempts=5, + recovery_cwd=tmp_path, + recovery_env={"DOCKER_HOST": "unix:///tmp/docker.sock"}, + ) + + assert report.exit_code == 0 + assert report.attempt_count == 5 + assert report.retried_images == ["qdrant/qdrant:v1"] + # Exponential growth from the 10s base, capped at 45s — far longer than the 2s/4s + # generic backoff so a brief Docker Hub outage is ridden out across the retry budget. + assert sleeps == [10, 20, 40, 45] + assert any( + "Detected transient registry connectivity failure; backing off before retry." 
in entry + for entry in logs + ) + + def test_pull_images_requires_recovery_cwd_and_env_together() -> None: """Pull recovery wiring should reject partial recovery configuration.""" with pytest.raises(fresh_host.FreshHostError, match="recovery_cwd and recovery_env"): diff --git a/tests/utils/helpers/_hosted_docker/images.py b/tests/utils/helpers/_hosted_docker/images.py index 5cfb721..f94fe3f 100644 --- a/tests/utils/helpers/_hosted_docker/images.py +++ b/tests/utils/helpers/_hosted_docker/images.py @@ -18,6 +18,9 @@ from tests.utils.helpers._hosted_docker.models import ( DEFAULT_DOCKER_PULL_TIMEOUT_SECONDS, PULL_HEARTBEAT_SECONDS, + PULL_RETRY_BACKOFF_CAP_SECONDS, + REGISTRY_RETRY_BACKOFF_BASE_SECONDS, + REGISTRY_RETRY_BACKOFF_CAP_SECONDS, ImageEnsureReport, PullReport, ) @@ -33,6 +36,17 @@ "error during connect", "unexpected eof", ) +# Transient registry network errors (not daemon failures); retry with longer backoff +REGISTRY_TRANSIENT_FAILURE_MARKERS: Final[tuple[str, ...]] = ( + "context deadline exceeded", + "request canceled while waiting for connection", + "client.timeout exceeded while awaiting headers", + "timeout exceeded while awaiting headers", + "i/o timeout", + "tls handshake timeout", + "temporary failure in name resolution", + "no such host", +) def compose_probe_env( @@ -128,6 +142,24 @@ def _is_daemon_connectivity_failure(output: str) -> bool: return "docker.sock" in lowered and "eof" in lowered +def _is_transient_registry_failure(output: str) -> bool: + """Return whether a pull failure output indicates a transient registry network blip.""" + lowered = output.lower() + return any(marker in lowered for marker in REGISTRY_TRANSIENT_FAILURE_MARKERS) + + +def _retry_backoff_seconds(*, attempt_count: int, registry_failure: bool) -> int: + """Return the backoff before the next pull attempt. 
+ + Transient registry connectivity failures use a longer exponential backoff so a brief + upstream outage is ridden out instead of exhausting the retry budget in seconds. + """ + if registry_failure: + growth = REGISTRY_RETRY_BACKOFF_BASE_SECONDS << max(0, attempt_count - 1) + return min(REGISTRY_RETRY_BACKOFF_CAP_SECONDS, growth) + return min(PULL_RETRY_BACKOFF_CAP_SECONDS, 2 * attempt_count) + + def pull_images( images: Sequence[str], *, @@ -156,6 +188,7 @@ def pull_images( ) failures: list[str] = [] daemon_connectivity_failure = False + transient_registry_failure = False with concurrent.futures.ThreadPoolExecutor( max_workers=min(attempt_parallelism, len(outstanding)) ) as executor: @@ -184,6 +217,8 @@ def pull_images( print(output, flush=True) if _is_daemon_connectivity_failure(output): daemon_connectivity_failure = True + elif _is_transient_registry_failure(output): + transient_registry_failure = True failures.append(image) outstanding = failures if not outstanding or attempt_count >= max_attempts: @@ -195,6 +230,8 @@ def pull_images( if recovery_cwd is not None and recovery_env is not None: if daemon_connectivity_failure: log("Detected Docker daemon connectivity failure; waiting for runtime recovery.") + elif transient_registry_failure: + log("Detected transient registry connectivity failure; backing off before retry.") else: log("Retrying image pull after failure; probing Docker runtime before retry.") try: @@ -212,7 +249,10 @@ def pull_images( if next_parallelism != attempt_parallelism: log(f"Reducing retry parallelism {attempt_parallelism}->{next_parallelism}.") attempt_parallelism = next_parallelism - backoff_seconds = min(10, 2 * attempt_count) + backoff_seconds = _retry_backoff_seconds( + attempt_count=attempt_count, + registry_failure=transient_registry_failure and not daemon_connectivity_failure, + ) log(f"Retrying {len(outstanding)} image(s) after {backoff_seconds}s.") time.sleep(backoff_seconds) return PullReport( diff --git 
a/tests/utils/helpers/_hosted_docker/models.py b/tests/utils/helpers/_hosted_docker/models.py index 3848588..889d45e 100644 --- a/tests/utils/helpers/_hosted_docker/models.py +++ b/tests/utils/helpers/_hosted_docker/models.py @@ -8,6 +8,14 @@ LOG_PREFIX: Final[str] = "[hosted-docker]" DEFAULT_DOCKER_PULL_TIMEOUT_SECONDS: Final[int] = 1800 PULL_HEARTBEAT_SECONDS: Final[int] = 30 +# Backoff cap for generic pull failures (daemon hiccups, transient tooling errors). +PULL_RETRY_BACKOFF_CAP_SECONDS: Final[int] = 10 +# Transient Docker registry connectivity failures (e.g. Docker Hub "context deadline +# exceeded" while reaching registry-1.docker.io) need a longer, exponential backoff so a +# brief registry outage is ridden out across the bounded retry budget instead of burning +# every attempt within a few seconds. +REGISTRY_RETRY_BACKOFF_BASE_SECONDS: Final[int] = 10 +REGISTRY_RETRY_BACKOFF_CAP_SECONDS: Final[int] = 45 @dataclass(slots=True)