Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/fresh-host-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ permissions:
env:
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
FRESH_HOST_DOCKER_PULL_PARALLELISM: "4"
FRESH_HOST_DOCKER_PULL_MAX_ATTEMPTS: "3"
# 5 attempts (up to 4 retries) let the registry backoff ride out transient Docker Hub issues
FRESH_HOST_DOCKER_PULL_MAX_ATTEMPTS: "5"
FRESH_HOST_CACHE_ROOT: ${{ github.workspace }}/.github-cache
UV_CACHE_DIR: ${{ github.workspace }}/.github-cache/uv
npm_config_cache: ${{ github.workspace }}/.github-cache/npm
Expand Down
67 changes: 67 additions & 0 deletions tests/suites/unit/ci/test_hosted_docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,73 @@ def fake_log(message: str) -> None:
)


def test_pull_images_uses_extended_backoff_for_transient_registry_failure(
    tmp_path: Path,
    test_context: TestContext,
) -> None:
    """Registry timeouts that clear on the fifth attempt are retried with capped backoff.

    The fake puller fails the first four attempts with a Docker Hub timeout message and
    succeeds on the fifth, so exactly four backoff sleeps are expected.
    """
    image_ref = "qdrant/qdrant:v1"
    registry_timeout_output = (
        'Error response from daemon: Get "https://registry-1.docker.io/v2/": '
        "context deadline exceeded (Client.Timeout exceeded while awaiting headers)"
    )
    expected_log = "Detected transient registry connectivity failure; backing off before retry."

    pull_calls: dict[str, int] = {image_ref: 0}
    recorded_sleeps: list[float] = []
    recorded_logs: list[str] = []

    def fake_pull_one_image(image: str, timeout_seconds: int) -> tuple[str, int, float, str]:
        # The helper must forward the default timeout untouched.
        assert timeout_seconds == hosted_docker.DEFAULT_DOCKER_PULL_TIMEOUT_SECONDS
        pull_calls[image] += 1
        if pull_calls[image] > 4:
            # Fifth call onwards: the registry is reachable again.
            return image, 0, 0.1, ""
        return image, 1, 15.0, registry_timeout_output

    def fake_wait_for_docker_ready(
        *,
        cwd: Path,
        env: dict[str, str],
        max_attempts: int = 60,
    ) -> None:
        # The runtime probe is a no-op here; only the retry/backoff path is under test.
        del cwd, env, max_attempts

    def record_sleep(seconds: float) -> None:
        recorded_sleeps.append(seconds)

    def record_log(message: str) -> None:
        recorded_logs.append(message)

    patcher = test_context.patch
    patcher.patch_object(hosted_docker_images, "pull_one_image", new=fake_pull_one_image)
    patcher.patch_object(
        hosted_docker_images,
        "wait_for_docker_ready",
        new=fake_wait_for_docker_ready,
    )
    patcher.patch_object(hosted_docker_images, "log", new=record_log)
    patcher.patch_object(hosted_docker_images.time, "sleep", new=record_sleep)

    report = hosted_docker.pull_images(
        [image_ref],
        parallelism=2,
        max_attempts=5,
        recovery_cwd=tmp_path,
        recovery_env={"DOCKER_HOST": "unix:///tmp/docker.sock"},
    )

    assert report.exit_code == 0
    assert report.attempt_count == 5
    assert report.retried_images == [image_ref]
    # Doubling from the 10s base (10, 20, 40) and then the 45s cap: much longer than the
    # 2s/4s generic backoff, so a brief Docker Hub outage fits inside the retry budget.
    assert recorded_sleeps == [10, 20, 40, 45]
    assert any(expected_log in line for line in recorded_logs)


def test_pull_images_requires_recovery_cwd_and_env_together() -> None:
"""Pull recovery wiring should reject partial recovery configuration."""
with pytest.raises(fresh_host.FreshHostError, match="recovery_cwd and recovery_env"):
Expand Down
42 changes: 41 additions & 1 deletion tests/utils/helpers/_hosted_docker/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
from tests.utils.helpers._hosted_docker.models import (
DEFAULT_DOCKER_PULL_TIMEOUT_SECONDS,
PULL_HEARTBEAT_SECONDS,
PULL_RETRY_BACKOFF_CAP_SECONDS,
REGISTRY_RETRY_BACKOFF_BASE_SECONDS,
REGISTRY_RETRY_BACKOFF_CAP_SECONDS,
ImageEnsureReport,
PullReport,
)
Expand All @@ -33,6 +36,17 @@
"error during connect",
"unexpected eof",
)
# Substrings that mark a pull failure as a transient registry network error (not a
# Docker daemon failure); such failures are retried with a longer backoff.
# The matcher lowercases the pull output before the substring test, so every marker
# here must itself be lowercase.
REGISTRY_TRANSIENT_FAILURE_MARKERS: Final[tuple[str, ...]] = (
"context deadline exceeded",
"request canceled while waiting for connection",
# NOTE(review): redundant for substring matching; the next marker is contained in this one.
"client.timeout exceeded while awaiting headers",
"timeout exceeded while awaiting headers",
"i/o timeout",
"tls handshake timeout",
# DNS resolution failures.
"temporary failure in name resolution",
"no such host",
)


def compose_probe_env(
Expand Down Expand Up @@ -128,6 +142,24 @@ def _is_daemon_connectivity_failure(output: str) -> bool:
return "docker.sock" in lowered and "eof" in lowered


def _is_transient_registry_failure(output: str) -> bool:
    """Report whether pull output contains any transient registry failure marker.

    The comparison is case-insensitive: the output is lowercased once and each entry of
    ``REGISTRY_TRANSIENT_FAILURE_MARKERS`` is tested as a substring.
    """
    haystack = output.lower()
    for marker in REGISTRY_TRANSIENT_FAILURE_MARKERS:
        if marker in haystack:
            return True
    return False


def _retry_backoff_seconds(*, attempt_count: int, registry_failure: bool) -> int:
    """Compute how many seconds to wait before the next pull attempt.

    Generic failures back off linearly (two seconds per completed attempt) up to
    ``PULL_RETRY_BACKOFF_CAP_SECONDS``. Transient registry failures start at
    ``REGISTRY_RETRY_BACKOFF_BASE_SECONDS`` and double with each further attempt, up to
    ``REGISTRY_RETRY_BACKOFF_CAP_SECONDS``, so a short upstream outage does not burn
    the whole retry budget within seconds.
    """
    if not registry_failure:
        linear_delay = 2 * attempt_count
        return min(PULL_RETRY_BACKOFF_CAP_SECONDS, linear_delay)

    # The first attempt uses the base delay unchanged; each later attempt doubles it.
    doublings = max(0, attempt_count - 1)
    exponential_delay = REGISTRY_RETRY_BACKOFF_BASE_SECONDS << doublings
    return min(REGISTRY_RETRY_BACKOFF_CAP_SECONDS, exponential_delay)


def pull_images(
images: Sequence[str],
*,
Expand Down Expand Up @@ -156,6 +188,7 @@ def pull_images(
)
failures: list[str] = []
daemon_connectivity_failure = False
transient_registry_failure = False
with concurrent.futures.ThreadPoolExecutor(
max_workers=min(attempt_parallelism, len(outstanding))
) as executor:
Expand Down Expand Up @@ -184,6 +217,8 @@ def pull_images(
print(output, flush=True)
if _is_daemon_connectivity_failure(output):
daemon_connectivity_failure = True
elif _is_transient_registry_failure(output):
transient_registry_failure = True
failures.append(image)
outstanding = failures
if not outstanding or attempt_count >= max_attempts:
Expand All @@ -195,6 +230,8 @@ def pull_images(
if recovery_cwd is not None and recovery_env is not None:
if daemon_connectivity_failure:
log("Detected Docker daemon connectivity failure; waiting for runtime recovery.")
elif transient_registry_failure:
log("Detected transient registry connectivity failure; backing off before retry.")
else:
log("Retrying image pull after failure; probing Docker runtime before retry.")
try:
Expand All @@ -212,7 +249,10 @@ def pull_images(
if next_parallelism != attempt_parallelism:
log(f"Reducing retry parallelism {attempt_parallelism}->{next_parallelism}.")
attempt_parallelism = next_parallelism
backoff_seconds = min(10, 2 * attempt_count)
backoff_seconds = _retry_backoff_seconds(
attempt_count=attempt_count,
registry_failure=transient_registry_failure and not daemon_connectivity_failure,
)
log(f"Retrying {len(outstanding)} image(s) after {backoff_seconds}s.")
time.sleep(backoff_seconds)
return PullReport(
Expand Down
8 changes: 8 additions & 0 deletions tests/utils/helpers/_hosted_docker/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@
# NOTE(review): presumably the tag prepended to hosted-docker helper log lines; confirm
# against the log helper.
LOG_PREFIX: Final[str] = "[hosted-docker]"
# Default timeout handed to each image pull: 1800 s (30 minutes).
DEFAULT_DOCKER_PULL_TIMEOUT_SECONDS: Final[int] = 1800
# NOTE(review): presumably the interval, in seconds, between progress heartbeats while a
# pull is in flight; confirm against the pull implementation.
PULL_HEARTBEAT_SECONDS: Final[int] = 30
# Backoff cap for generic pull failures (daemon hiccups, transient tooling errors).
# The generic delay grows linearly at 2 s per completed attempt until it reaches this cap.
PULL_RETRY_BACKOFF_CAP_SECONDS: Final[int] = 10
# Transient Docker registry connectivity failures (e.g. Docker Hub "context deadline
# exceeded" while reaching registry-1.docker.io) need a longer, exponential backoff so a
# brief registry outage is ridden out across the bounded retry budget instead of burning
# every attempt within a few seconds.
# With these values the delays are 10 s, 20 s, 40 s, then 45 s for every later retry.
REGISTRY_RETRY_BACKOFF_BASE_SECONDS: Final[int] = 10
REGISTRY_RETRY_BACKOFF_CAP_SECONDS: Final[int] = 45


@dataclass(slots=True)
Expand Down