Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 154 additions & 11 deletions tools/ec2_worker_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,18 +214,32 @@ def build_network_interfaces(
available ENI slots.

`associate_public_ip` (off by default) sets AssociatePublicIpAddress
on the primary ENI only — the entry with DeviceIndex=0 and (when
multi-card) NetworkCardIndex=0. AWS rejects the field on secondaries.
on the primary ENI — but ONLY when the resulting payload has exactly
one NetworkInterface entry. AWS hard-rejects RunInstances with
`InvalidParameterCombination — The associatePublicIPAddress
parameter cannot be specified when launching with multiple network
interfaces` whenever len(NetworkInterfaces) > 1, even if the field
appears only on the primary entry. Multi-ENI launches that need a
public IP must allocate+associate an Elastic IP post-launch — see
`Ec2WorkerManager._associate_public_ip_post_launch`.

Needed because RunInstances ignores the subnet's MapPublicIpOnLaunch
when an explicit NetworkInterfaces[] is passed; without this opt-in
the launch comes up with no public IP. See PR #65
issuecomment-4338158487 (anygpt-48) for the failure mode.
issuecomment-4338158487 (anygpt-48) for the original failure mode
and PR #65 issuecomment-4339242358 (anygpt-52) for the multi-ENI
rejection that motivated the post-launch path.
"""
if target_count < 1:
raise ValueError("target_count must be >= 1")
if not subnet_ids:
raise ValueError("subnet_ids must contain at least one subnet")
placements = distribute_enis_across_cards(target_count, network_cards)
# AWS rejects AssociatePublicIpAddress whenever more than one
# NetworkInterface entry is supplied. Only emit it inline when the
# resulting payload is single-NIC; multi-ENI launches must rely on
# post-launch allocate-address + associate-address.
inline_associate_public_ip = associate_public_ip and len(placements) == 1
interfaces: list[dict[str, Any]] = []
primary_assigned = False
for sequence_index, (card_index, device_index) in enumerate(placements):
Expand All @@ -241,13 +255,7 @@ def build_network_interfaces(
spec["NetworkCardIndex"] = card_index
if security_group_ids:
spec["Groups"] = list(security_group_ids)
# AssociatePublicIpAddress is only valid on the primary ENI.
# Set it on exactly one entry: DeviceIndex=0 plus (when we're
# emitting NetworkCardIndex) NetworkCardIndex=0. The placement
# helper guarantees the primary ENI lands first, so we only
# need to flip the flag the first time we see a DeviceIndex=0
# entry on card 0.
if associate_public_ip and not primary_assigned and device_index == 0:
if inline_associate_public_ip and not primary_assigned and device_index == 0:
on_primary_card = (not network_cards) or card_index == 0
if on_primary_card:
spec["AssociatePublicIpAddress"] = True
Expand Down Expand Up @@ -918,6 +926,12 @@ def _resolve_eni_subnet_pool(self) -> list[str]:
def recreate_instance(self) -> dict[str, Any]:
current_id = self.current_instance_id()
terminated = None
# If a previous run allocated an Elastic IP for the multi-ENI
# post-launch path, release it before terminating so the
# AllocationId doesn't leak. Terminating the instance auto-
# disassociates the EIP but leaves it allocated to the account
# (charged hourly until released).
released_eip = self._release_recorded_eip()
if current_id:
terminated = self.ec2.terminate_instances(InstanceIds=[current_id])
ssh_rule = self.ensure_ssh_access()
Expand Down Expand Up @@ -1004,13 +1018,142 @@ def recreate_instance(self) -> dict[str, Any]:
self.state["launched_at_epoch"] = launch_time.timestamp()
except Exception:
pass

# Multi-ENI launches cannot carry AssociatePublicIpAddress on the
# NetworkInterfaces[] payload (AWS hard-rejects with
# InvalidParameterCombination), so we have to allocate an Elastic
# IP and associate it with the primary ENI post-launch when the
# operator opted into ANYSCAN_EC2_ASSOCIATE_PUBLIC_IP. Any failure
# is recorded in eni_attach but does not abort the recreate:
# operators can retry the EIP step manually and the worker is
# still usable on private IPs in the meantime. See PR #65
# issuecomment-4339242358 (anygpt-52) for the failure mode.
if target_count > 1 and self.config.associate_public_ip:
eni_attach_info["public_ip"] = self._associate_public_ip_post_launch(
instance
)

self._save_state()
return {
result: dict[str, Any] = {
"terminated": terminated,
"launched": instance,
"ssh_rule": ssh_rule,
"eni_attach": eni_attach_info,
}
if released_eip is not None:
result["released_eip"] = released_eip
return result

def _release_recorded_eip(self) -> dict[str, Any] | None:
"""Release the EIP recorded by the previous post-launch association.

Returns None when no EIP was recorded; otherwise returns a status
dict suitable for embedding in recreate_instance's result. Errors
are logged in the dict but do not raise: a stuck release should
not block the worker recreate.
"""
allocation_id = self.state.get("eip_allocation_id")
if not allocation_id:
return None
association_id = self.state.get("eip_association_id")
disassoc_error: dict[str, str] | None = None
if association_id:
try:
self.ec2.disassociate_address(AssociationId=association_id)
except botocore.exceptions.ClientError as exc:
disassoc_error = {
Comment on lines +1061 to +1064
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Catch non-ClientError exceptions in EIP release path

_release_recorded_eip is documented to log cleanup errors without blocking recreate, but both AWS calls only catch botocore.exceptions.ClientError. Transient transport failures (for example EndpointConnectionError/ReadTimeoutError) will escape this method, and run_once only handles ClientError, so the daemon can crash before terminating/relaunching the worker. This makes recreate fragile exactly when AWS/network is degraded.

Useful? React with 👍 / 👎.

"code": exc.response["Error"].get("Code") or "",
"message": exc.response["Error"].get("Message") or "",
}
try:
self.ec2.release_address(AllocationId=allocation_id)
except botocore.exceptions.ClientError as exc:
return {
"status": "release_address_failed",
"allocation_id": allocation_id,
"code": exc.response["Error"].get("Code"),
"error": exc.response["Error"].get("Message"),
"disassociate_error": disassoc_error,
}
self.state.pop("eip_allocation_id", None)
self.state.pop("eip_association_id", None)
return {
"status": "released",
"allocation_id": allocation_id,
"disassociate_error": disassoc_error,
}

def _associate_public_ip_post_launch(
self, instance: dict[str, Any]
) -> dict[str, Any]:
"""Allocate an Elastic IP and associate it with the primary ENI.

Used after a multi-ENI RunInstances call where the
AssociatePublicIpAddress field on NetworkInterfaces[] would have
been rejected by AWS. Failures are returned as a status dict
rather than raised so the worker still comes up on private IPs.

The AllocationId/AssociationId are recorded in self.state so the
next recreate_instance can release the EIP before launching a
replacement (otherwise we leak EIPs on every recreate).
"""
primary_eni_id: str | None = None
for nic in instance.get("NetworkInterfaces", []) or []:
attachment = nic.get("Attachment") or {}
if attachment.get("DeviceIndex") == 0:
primary_eni_id = nic.get("NetworkInterfaceId")
break
if not primary_eni_id:
return {
"status": "no_primary_eni",
"error": "no DeviceIndex=0 ENI found in RunInstances response",
}

try:
alloc = self.ec2.allocate_address(Domain="vpc")
except botocore.exceptions.ClientError as exc:
return {
"status": "allocate_address_failed",
"code": exc.response["Error"].get("Code"),
"error": exc.response["Error"].get("Message"),
}
allocation_id = alloc.get("AllocationId")
public_ip = alloc.get("PublicIp")
if not allocation_id:
return {
"status": "allocate_address_no_id",
"error": "AllocateAddress did not return AllocationId",
}

try:
assoc = self.ec2.associate_address(
AllocationId=allocation_id,
NetworkInterfaceId=primary_eni_id,
AllowReassociation=True,
)
except botocore.exceptions.ClientError as exc:
# Allocation succeeded but association failed; record the
# AllocationId so the next recreate can release it, otherwise
# we leak an unattached EIP on every retry.
self.state["eip_allocation_id"] = allocation_id
return {
"status": "associate_address_failed",
"code": exc.response["Error"].get("Code"),
"error": exc.response["Error"].get("Message"),
"allocation_id": allocation_id,
"public_ip": public_ip,
}
association_id = assoc.get("AssociationId")
self.state["eip_allocation_id"] = allocation_id
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve old allocation ID when prior EIP release failed

If _release_recorded_eip fails to release the previously recorded EIP, recreate_instance still proceeds and _associate_public_ip_post_launch overwrites self.state["eip_allocation_id"] with the new allocation. That drops the only pointer to the unreleased old allocation, so the next recreate cannot clean it up and the old EIP leaks permanently in this failure scenario.

Useful? React with 👍 / 👎.

if association_id:
self.state["eip_association_id"] = association_id
return {
"status": "associated",
"allocation_id": allocation_id,
"association_id": association_id,
"network_interface_id": primary_eni_id,
"public_ip": public_ip,
}

def run_once(self) -> dict[str, Any]:
snapshot = self.health_snapshot()
Expand Down
Loading