Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 57 additions & 2 deletions cmd/ateom-gvisor/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,36 @@ func NewService(interiorNetNS netns.NsHandle, eth0LinkInfo *SaveLinkInfo, actorL
return svc
}

func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkloadRequest) (*ateompb.RunWorkloadResponse, error) {
// ensureEth0InPodNetns moves eth0 back to the pod netns if a prior
// Run/Restore left it stuck in the interior netns. Idempotent: returns
// nil if eth0 is already in the pod netns or absent from both.
func ensureEth0InPodNetns(ctx context.Context, s *AteomService) error {
if _, err := netlink.LinkByName("eth0"); err == nil {
return nil
}
podNetNS, err := netns.Get()
if err != nil {
return fmt.Errorf("while getting pod netns: %w", err)
}
var moved bool
err = netNSDo(ctx, s.interiorNetNS, func(_ context.Context) error {
link, lookupErr := netlink.LinkByName("eth0")
if lookupErr != nil {
return nil
}
if mvErr := netlink.LinkSetNsFd(link, int(podNetNS)); mvErr != nil {
return fmt.Errorf("while moving eth0 to pod netns: %w", mvErr)
}
moved = true
return nil
})
if moved {
slog.WarnContext(ctx, "Recovered eth0 from interior netns to pod netns")
}
return err
}

func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkloadRequest) (resp *ateompb.RunWorkloadResponse, retErr error) {
s.lock.Lock()
defer s.lock.Unlock()

Expand All @@ -206,6 +235,10 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload
// * Correct runsc version is downloaded and placed on disk.
// * All OCI bundles are set up, including for "pause" container.

if err := ensureEth0InPodNetns(ctx, s); err != nil {
return nil, fmt.Errorf("while recovering eth0 from prior failure: %w", err)
}

// Move pod eth0 into interior netns
eth0Link, err := netlink.LinkByName("eth0")
if err != nil {
Expand All @@ -214,6 +247,15 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload
if err := netlink.LinkSetNsFd(eth0Link, int(s.interiorNetNS)); err != nil {
return nil, fmt.Errorf("while moving eth0 into interior network namespace: %w", err)
}
// Roll eth0 back to the pod netns if any subsequent step errors,
// otherwise the worker pod is bricked for the next actor.
defer func() {
if retErr != nil {
if cleanupErr := ensureEth0InPodNetns(ctx, s); cleanupErr != nil {
slog.WarnContext(ctx, "Failed to roll back eth0 after Run failure", "err", cleanupErr)
}
}
}()

slog.InfoContext(ctx, "Restoring eth0 routes/addresses in interior netns")
err = netNSDo(ctx, s.interiorNetNS, func(ctx context.Context) error {
Expand Down Expand Up @@ -356,7 +398,7 @@ func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.Chec
return nil, nil
}

func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.RestoreWorkloadRequest) (*ateompb.RestoreWorkloadResponse, error) {
func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.RestoreWorkloadRequest) (resp *ateompb.RestoreWorkloadResponse, retErr error) {
s.lock.Lock()
defer s.lock.Unlock()

Expand All @@ -368,6 +410,10 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore
// * All OCI bundles are set up, including for "pause" container.
// * Checkpoint downloaded and placed on disk

if err := ensureEth0InPodNetns(ctx, s); err != nil {
return nil, fmt.Errorf("while recovering eth0 from prior failure: %w", err)
}

// Move pod eth0 into interior netns
eth0Link, err := netlink.LinkByName("eth0")
if err != nil {
Expand All @@ -376,6 +422,15 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore
if err := netlink.LinkSetNsFd(eth0Link, int(s.interiorNetNS)); err != nil {
return nil, fmt.Errorf("while moving eth0 into interior network namespace: %w", err)
}
// Roll eth0 back to the pod netns if any subsequent step errors,
// otherwise the worker pod is bricked for the next actor.
defer func() {
if retErr != nil {
if cleanupErr := ensureEth0InPodNetns(ctx, s); cleanupErr != nil {
slog.WarnContext(ctx, "Failed to roll back eth0 after Restore failure", "err", cleanupErr)
}
}
}()

// Restore route and IP information from save onto eth0.
slog.InfoContext(ctx, "Restoring eth0 routes/addresses in interior netns")
Expand Down
Loading