Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions deploy/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ full-clean:
clean-spoke:
@./openshift-clusters/scripts/clean-spoke.sh

# Run the clean-mutable-topology.sh helper script (described in `make help`
# as the sno-to-3node cleanup of the master-1/2 VMs, disks and DHCP/DNS entries).
clean-mutable-topology:
@./openshift-clusters/scripts/clean-mutable-topology.sh

ssh:
@./aws-hypervisor/scripts/ssh.sh

Expand All @@ -73,6 +76,12 @@ arbiter-ipi:
arbiter-agent:
@./openshift-clusters/scripts/deploy-cluster.sh --topology arbiter --method agent

# Invoke deploy-cluster.sh with the "sno" topology and the "ipi" install method.
sno-ipi:
@./openshift-clusters/scripts/deploy-cluster.sh --topology sno --method ipi

# Invoke deploy-cluster.sh with the "sno" topology and the "agent" install method.
sno-agent:
@./openshift-clusters/scripts/deploy-cluster.sh --topology sno --method agent

arbiter-kcli:
@./openshift-clusters/scripts/deploy-cluster.sh --topology arbiter --method kcli

Expand All @@ -83,6 +92,9 @@ fencing-assisted:
@$(MAKE) fencing-ipi
@./openshift-clusters/scripts/deploy-fencing-assisted.sh

# Run the sno-to-3node.sh helper script (described in `make help` as
# transitioning an existing SNO cluster to 3-node HA).
sno-to-3node:
@./openshift-clusters/scripts/sno-to-3node.sh

patch-nodes:
@./openshift-clusters/scripts/patch-nodes.sh
get-tnf-logs:
Expand Down Expand Up @@ -115,6 +127,7 @@ help:
@echo " arbiter-kcli - Deploy arbiter cluster using kcli (non-interactive)"
@echo " fencing-kcli - Deploy fencing cluster using kcli (non-interactive)"
@echo " fencing-assisted - Deploy hub + spoke TNF cluster via assisted installer"
@echo " sno-to-3node - Transition existing SNO cluster to 3-node HA (platform:none)"
@echo ""
@echo "OpenShift Cluster Management:"
@echo " redeploy-cluster - Redeploy OpenShift cluster using dev-scripts make redeploy"
Expand All @@ -123,6 +136,7 @@ help:
@echo " clean - Clean OpenShift cluster using dev-scripts clean target"
@echo " full-clean - Fully clean instance cache and OpenShift cluster using dev-scripts realclean target"
@echo " clean-spoke - Clean spoke cluster resources (VMs, network, auth) from assisted installer"
@echo " clean-mutable-topology - Remove master-1/2 VMs, disks, DHCP/DNS entries (sno-to-3node cleanup)"
@echo " patch-nodes - Build resource-agents RPM and patch cluster nodes (default version: 4.11)"
@echo ""
@echo "Cluster Utilities:"
Expand Down
43 changes: 43 additions & 0 deletions deploy/openshift-clusters/clean-mutable-topology.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
---
# Playbook: clean up the resources created for the extra control-plane nodes
# (master-1 / master-2) by delegating to the sno-to-3node role's clean.yml.
- hosts: metal_machine
  gather_facts: true

  pre_tasks:
    # Interactive safety prompt; skipped when interactive_mode is set to false.
    - name: Confirm mutable topology cleanup
      ansible.builtin.pause:
        prompt: >-
          This will destroy the master-1 and master-2 VMs, their disks, DHCP reservations,
          and DNS entries. master-0 (if still present) will be unaffected.
          Press Enter to proceed or Ctrl+C to abort.
      delegate_to: localhost
      run_once: true
      when: interactive_mode | default(true) | bool

    # Best-effort: the cluster (or its kubeconfig) may already be gone, so this
    # task is never allowed to fail; an empty stdout simply means "not detected".
    - name: Detect cluster domain from kubeconfig (best-effort)
      ansible.builtin.shell: |
        KUBECONFIG={{ dev_scripts_path | default('openshift-metal3/dev-scripts') }}/ocp/{{ sno_cluster_name | default('ostest') }}/auth/kubeconfig \
        oc get infrastructure cluster -o jsonpath='{.status.apiServerInternalURI}' 2>/dev/null \
        | sed 's|^https://api-int\.||; s|:6443$||'
      register: detected_domain
      changed_when: false
      failed_when: false

    # Precedence: explicit non-empty sno_cluster_domain, then the detected value,
    # then "<cluster name>.test.metalkube.org" (cluster name defaults to "ostest",
    # matching the kubeconfig path used by the detection task).
    - name: Set cluster domain (use detected, variable override, or default)
      ansible.builtin.set_fact:
        sno_cluster_domain: >-
          {{ sno_cluster_domain
          if (sno_cluster_domain is defined and sno_cluster_domain)
          else (detected_domain.stdout | trim
          if (detected_domain.stdout is defined and detected_domain.stdout | trim)
          else (sno_cluster_name | default('ostest')) ~ '.test.metalkube.org') }}

  tasks:
    # NOTE(review): this role is imported unconditionally. Per the prompt above it
    # destroys master-1/master-2; if the cluster has already been scaled to three
    # control-plane nodes this presumably costs etcd quorum. Consider gating on the
    # current topology or an explicit force flag - confirm with the wrapper script.
    - name: Run mutable topology cleanup
      ansible.builtin.import_role:
        name: mutable-topology/sno-to-3node
        tasks_from: clean.yml
Comment on lines +35 to +39

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🩺 Stability & Availability | 🟠 Major | ⚡ Quick win

Refuse cleanup once the cluster is already HA unless the caller forces it.

This imports tasks/clean.yml unconditionally, and that role deletes master-1/master-2 plus their disks and DHCP state. On a successful 3-node cluster, removing two control-plane nodes here can drop etcd from 3 members to 1 and take the cluster down. Please gate this on the current control-plane topology, or require an explicit force_cleanup=true override before running the destructive role.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@deploy/openshift-clusters/clean-mutable-topology.yml` around lines 35 - 39,
The cleanup playbook currently imports mutable-topology/sno-to-3node
tasks/clean.yml unconditionally, which can destroy a healthy HA control plane.
Add a topology check in clean-mutable-topology.yml before the import_role so it
only runs when the cluster is not already HA, or require an explicit
force_cleanup=true override to proceed. Use the existing import_role task as the
entry point and gate it with a condition or precheck that prevents destructive
cleanup unless the override is set.


# Final informational message once the cleanup role has finished.
- name: Cleanup complete
  ansible.builtin.debug:
    msg: >-
      Mutable topology cleanup complete.
      master-1 and master-2 VMs have been removed.
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ ci_token
clusterbot-ci_token
config_arbiter.sh
config_fencing.sh
config_sno.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash


# Example configuration for the "sno" topology.
# Please copy one of the config values below for IPI or Agent based installs into your
# config.
## BEGIN IPI Specific Install Config Variables
export IP_STACK="v4"
export NUM_WORKERS=0
export MASTER_MEMORY=32768
export MASTER_DISK=100
export MASTER_VCPU=4
export NUM_MASTERS=1
## END IPI Specific Install Config Variables

## BEGIN Agent Specific Install Config Variables
export AGENT_E2E_TEST_SCENARIO="SNO_IPV4"
# Sets the install-config.yaml's platform type.
# The default is 'baremetal'.
# See https://github.com/openshift-metal3/dev-scripts/blob/master/config_example.sh for more details on this variable and its effects.
#export AGENT_PLATFORM_TYPE=none
## END Agent Specific Install Config Variables
####

# TechPreview FeatureSet not needed for 4.20 and above OCP
# export FEATURE_SET="TechPreviewNoUpgrade"

# OPENSHIFT_CI is enabled here to avoid using the CI_TOKEN; note that it has side effects.
# You can read more on this here: https://github.com/openshift-metal3/dev-scripts/blob/3f070cfd36977381a186cadfb44887856d652bed/config_example.sh#L21
export OPENSHIFT_CI="true"

# You can find the latest public images in https://quay.io/repository/openshift-release-dev/ocp-release?tab=tags
# and select your preferred version. Public sources can be found at https://mirror.openshift.com/pub/openshift-v4/

export OPENSHIFT_RELEASE_IMAGE=quay.io/openshift-release-dev/ocp-release:4.21.0-x86_64
# Unless you need to override the installer image, this is not needed
# export OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE=""

# Disable sigstore image verification during installation
export OPENSHIFT_INSTALL_EXPERIMENTAL_DISABLE_IMAGE_POLICY=true
Comment thread
eggfoobar marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
- kubeconfig_path is defined
- kubeconfig_stat.stat.exists | default(false)
changed_when: false
register: oc_project_result
retries: 5
delay: 15
until: oc_project_result.rc == 0
failed_when: false
listen: Set OCP project

- name: Warn about missing kubeconfig
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
expected_prefix:
arbiter: "TNA"
fencing: "TNF"
sno: "SNO"
fail:
msg: >-
Config file {{ config_file[method] }} has AGENT_E2E_TEST_SCENARIO="{{ config_scenario }}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ supported_methods:
supported_topologies:
- arbiter
- fencing
- sno
config_file:
ipi: config_{{topology}}.sh
agent: config_{{topology}}.sh
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
# --- Cluster identity -------------------------------------------------------
# Empty values are auto-detected from the cluster.
sno_cluster_name: 'ostest'
sno_cluster_domain: ''
sno_infra_id: ''

# --- Existing node ----------------------------------------------------------
# master-0 address; auto-detected from the cluster when left empty.
sno_master0_ip: ''

# --- New nodes --------------------------------------------------------------
# Static assignments within the dev-scripts DHCP range.
sno_master1_ip: '192.168.111.21'
sno_master2_ip: '192.168.111.22'

# --- VM sizing --------------------------------------------------------------
sno_vm_vcpus: 6
sno_vm_ram_mb: 16384
sno_vm_disk_gb: 50

# --- Libvirt networking (dev-scripts baremetal network) ---------------------
sno_libvirt_network: 'ostestbm'
sno_libvirt_bridge: 'ostestbm'

# --- Install media ----------------------------------------------------------
# Path of the RHCOS live ISO on the hypervisor; auto-detected from the release
# image when empty.
sno_rhcos_live_iso: ''

# --- Timeouts (minutes) -----------------------------------------------------
sno_mco_timeout_minutes: 45
sno_node_join_timeout_minutes: 20
sno_etcd_timeout_minutes: 15

# --- Behaviour toggles ------------------------------------------------------
# Auto-fix the MCO drain deadlock during the topology transition.
sno_auto_fix_drain: true

# --- Paths ------------------------------------------------------------------
# Override if dev-scripts is in a non-standard location.
sno_kubeconfig: ''

# Directory holding the VM images.
sno_vm_image_dir: '/var/lib/libvirt/images'
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
---
# Probe for an already-staged live ISO; the registered result gates the
# discovery task that follows.
- name: "[boot] Check if RHCOS live ISO exists on hypervisor"
  register: iso_stat
  stat:
    path: "/var/lib/libvirt/images/rhcos-live.iso"

# Runs only when no ISO is staged and no explicit path was supplied. Looks in
# the libvirt images directory first, then under sno_dev_scripts_path, and
# symlinks the first hit to the well-known rhcos-live.iso path.
- name: "[boot] Find RHCOS live ISO from dev-scripts cache"
  when: not iso_stat.stat.exists and sno_rhcos_live_iso == ""
  shell: |
    FOUND_ISO=$(find /var/lib/libvirt/images -name 'rhcos-*-live*.iso' 2>/dev/null | head -1)
    if [ -n "$FOUND_ISO" ]; then
      echo "Using existing ISO: $FOUND_ISO"
    else
      FOUND_ISO=$(find {{ sno_dev_scripts_path }}/ -name 'rhcos-*-live*.iso' 2>/dev/null | head -1)
      if [ -z "$FOUND_ISO" ]; then
        echo "ERROR: No RHCOS live ISO found. Set sno_rhcos_live_iso variable."
        exit 1
      fi
      echo "Using dev-scripts cached ISO: $FOUND_ISO"
    fi

    # Same outcome for either search location: link, then report success.
    sudo ln -sf "$FOUND_ISO" /var/lib/libvirt/images/rhcos-live.iso
    exit 0

# An explicit sno_rhcos_live_iso wins; otherwise use the staged symlink path.
- name: "[boot] Set ISO path"
  set_fact:
    sno_iso_path: "{{ sno_rhcos_live_iso | ternary(sno_rhcos_live_iso, '/var/lib/libvirt/images/rhcos-live.iso') }}"

# slurp returns the file base64-encoded in the registered result's `content`.
- name: "[boot] Read master.ign content"
  register: master_ign_content
  slurp:
    src: "/tmp/master.ign"

# Expose the slurped (already base64) payload under its own fact name.
- name: "[boot] Set base64-encoded master.ign"
  set_fact:
    sno_master_ign_b64: "{{ master_ign_content['content'] }}"

# Render the auto-install ignition that is embedded into the per-node ISOs below.
- name: "[boot] Generate auto-install ignition"
  template:
    dest: "/tmp/auto-install.ign"
    src: "auto-install.ign.j2"
    mode: "0644"

# Copy the live ISO once per new node and embed the auto-install ignition in it.
- name: "[boot] Create per-node ISO with embedded ignition"
  shell: |
    # Abort on the first failure: without this a failed copy is ignored and the
    # embed step could succeed against a stale per-node ISO from an earlier run.
    set -e
    NODE_ISO="/var/lib/libvirt/images/rhcos-{{ item.hostname }}.iso"
    sudo cp "{{ sno_iso_path }}" "$NODE_ISO"
    # -f overwrites any ignition already embedded in the image.
    sudo coreos-installer iso ignition embed -i /tmp/auto-install.ign "$NODE_ISO" -f
  loop: "{{ sno_new_nodes }}"

# (Re)create each new node's VM with UEFI firmware, its existing qcow2 disk and
# the per-node install ISO attached as a CD-ROM.
- name: "[boot] Boot each VM with ignition-embedded ISO"
  shell: |
    VM_NAME="{{ item.name }}"
    MAC="{{ sno_node_macs[item.hostname] }}"

    # Clear any previous definition so virt-install can reuse the name.
    # Both commands are best-effort: the domain may not exist yet.
    sudo virsh destroy "$VM_NAME" 2>/dev/null || true
    # --nvram is required for UEFI domains: libvirt refuses to undefine a domain
    # that still has an NVRAM file, and that refusal would be hidden by "|| true",
    # leaving the old definition in place and making virt-install fail on re-run.
    sudo virsh undefine --nvram "$VM_NAME" 2>/dev/null || true

    sudo virt-install \
      --name "$VM_NAME" \
      --ram {{ sno_vm_ram_mb }} \
      --vcpus {{ sno_vm_vcpus }} \
      --disk "{{ sno_vm_image_dir }}/${VM_NAME}.qcow2,bus=virtio" \
      --network "network={{ sno_libvirt_network }},model=virtio,mac=${MAC}" \
      --cdrom "/var/lib/libvirt/images/rhcos-{{ item.hostname }}.iso" \
      --os-variant rhel9.0 \
      --graphics none \
      --noautoconsole \
      --boot loader=/usr/share/edk2/ovmf/OVMF_CODE.fd,loader_ro=yes,loader_type=pflash,nvram_template=/usr/share/edk2/ovmf/OVMF_VARS.fd,loader_secure=no \
      --boot hd,cdrom \
      --tpm none
  loop: "{{ sno_new_nodes }}"

# Fail fast if any freshly created VM is not reported as running by libvirt.
- name: "[boot] Verify VMs are running (initial install boot)"
  shell: "sudo virsh domstate {{ item.name }}"
  loop: "{{ sno_new_nodes }}"
  register: vm_state
  failed_when: "'running' not in vm_state.stdout"
  changed_when: false

- name: "[boot] Wait for coreos-installer to complete (VM will power off)"
  # coreos-installer runs ExecStartPost=systemctl reboot, but RHCOS live issues
  # an ACPI poweroff rather than a reset. libvirt fires on_poweroff=destroy so
  # the VM shuts off. We poll until shut off, then boot from disk below.
  #
  # Polls up to 50 times, 20s apart (1000s per node; nodes are checked in turn).
  shell: |
    for i in $(seq 50); do
      STATE=$(sudo virsh domstate {{ item.name }} 2>/dev/null || echo "unknown")
      if echo "$STATE" | grep -q "shut off"; then
        # (i - 1) sleeps have elapsed when check number i runs.
        echo "{{ item.name }} shut off after $(( (i - 1) * 20 ))s - install complete"
        exit 0
      fi
      sleep 20
    done
    echo "Timeout: {{ item.name }} did not shut off within 1000s"
    exit 1
  loop: "{{ sno_new_nodes }}"
  changed_when: false

- name: "[boot] Remove CDROM from boot order after install"
  # Prevent coreos-installer loop: strip the cdrom boot entry so UEFI only
  # tries the hard disk on subsequent boots.
  shell: |
    # Stop at the first failing step. Previously the trailing "rm -f" always
    # returned 0, so a failed dumpxml/sed/define was reported as success.
    set -e
    TMPXML=$(mktemp /tmp/vm-XXXXXX.xml)
    # Remove the scratch file on every exit path, including failures.
    trap 'sudo rm -f "$TMPXML"' EXIT
    sudo virsh dumpxml {{ item.name }} > "$TMPXML"
    sudo sed -i "/<boot dev='cdrom'\/>/d" "$TMPXML"
    sudo virsh define "$TMPXML"
  loop: "{{ sno_new_nodes }}"
  changed_when: true

# Power the (now shut off) VMs back on so they boot from disk.
- name: "[boot] Start VMs to boot from installed RHCOS"
  shell: "sudo virsh start {{ item.name }}"
  changed_when: true
  loop: "{{ sno_new_nodes }}"

# Second state check, after the restart: every VM must be reported as running.
- name: "[boot] Verify VMs are running from installed disk"
  shell: "sudo virsh domstate {{ item.name }}"
  loop: "{{ sno_new_nodes }}"
  register: vm_state_disk
  failed_when: "'running' not in vm_state_disk.stdout"
  changed_when: false

# Progress message closing the boot phase.
- name: "[boot] VMs booted with RHCOS"
  debug:
    msg: "{{ sno_new_nodes | length }} VMs installed and started from disk. Waiting for nodes to join the cluster..."
Loading