diff --git a/deploy/Makefile b/deploy/Makefile
index 18959e94..68efb32b 100644
--- a/deploy/Makefile
+++ b/deploy/Makefile
@@ -52,6 +52,9 @@ full-clean:
clean-spoke:
@./openshift-clusters/scripts/clean-spoke.sh
+clean-mutable-topology:
+ @./openshift-clusters/scripts/clean-mutable-topology.sh
+
ssh:
@./aws-hypervisor/scripts/ssh.sh
@@ -73,6 +76,12 @@ arbiter-ipi:
arbiter-agent:
@./openshift-clusters/scripts/deploy-cluster.sh --topology arbiter --method agent
+sno-ipi:
+ @./openshift-clusters/scripts/deploy-cluster.sh --topology sno --method ipi
+
+sno-agent:
+ @./openshift-clusters/scripts/deploy-cluster.sh --topology sno --method agent
+
arbiter-kcli:
@./openshift-clusters/scripts/deploy-cluster.sh --topology arbiter --method kcli
@@ -83,6 +92,9 @@ fencing-assisted:
@$(MAKE) fencing-ipi
@./openshift-clusters/scripts/deploy-fencing-assisted.sh
+sno-to-3node:
+ @./openshift-clusters/scripts/sno-to-3node.sh
+
patch-nodes:
@./openshift-clusters/scripts/patch-nodes.sh
get-tnf-logs:
@@ -115,6 +127,7 @@ help:
@echo " arbiter-kcli - Deploy arbiter cluster using kcli (non-interactive)"
@echo " fencing-kcli - Deploy fencing cluster using kcli (non-interactive)"
@echo " fencing-assisted - Deploy hub + spoke TNF cluster via assisted installer"
+ @echo " sno-to-3node - Transition existing SNO cluster to 3-node HA (platform:none)"
@echo ""
@echo "OpenShift Cluster Management:"
@echo " redeploy-cluster - Redeploy OpenShift cluster using dev-scripts make redeploy"
@@ -123,6 +136,7 @@ help:
@echo " clean - Clean OpenShift cluster using dev-scripts clean target"
@echo " full-clean - Fully clean instance cache and OpenShift cluster using dev-scripts realclean target"
@echo " clean-spoke - Clean spoke cluster resources (VMs, network, auth) from assisted installer"
+ @echo " clean-mutable-topology - Remove master-1/2 VMs, disks, DHCP/DNS entries (sno-to-3node cleanup)"
@echo " patch-nodes - Build resource-agents RPM and patch cluster nodes (default version: 4.11)"
@echo ""
@echo "Cluster Utilities:"
diff --git a/deploy/openshift-clusters/clean-mutable-topology.yml b/deploy/openshift-clusters/clean-mutable-topology.yml
new file mode 100644
index 00000000..c5113c3c
--- /dev/null
+++ b/deploy/openshift-clusters/clean-mutable-topology.yml
@@ -0,0 +1,43 @@
+---
+- hosts: metal_machine
+ gather_facts: yes
+
+ pre_tasks:
+ - name: Confirm mutable topology cleanup
+ ansible.builtin.pause:
+ prompt: >-
+ This will destroy the master-1 and master-2 VMs, their disks, DHCP reservations,
+ and DNS entries. master-0 (if still present) will be unaffected.
+ Press Enter to proceed or Ctrl+C to abort.
+ delegate_to: localhost
+ run_once: true
+ when: interactive_mode | default(true) | bool
+
+ - name: Detect cluster domain from kubeconfig (best-effort)
+ shell: |
+ KUBECONFIG={{ dev_scripts_path | default('openshift-metal3/dev-scripts') }}/ocp/{{ sno_cluster_name | default('ostest') }}/auth/kubeconfig \
+ oc get infrastructure cluster -o jsonpath='{.status.apiServerInternalURI}' 2>/dev/null \
+ | sed 's|^https://api-int\.||; s|:6443$||'
+ register: detected_domain
+ changed_when: false
+ failed_when: false
+ ignore_errors: true
+
+ - name: Set cluster domain (use detected, variable override, or default)
+ set_fact:
+ sno_cluster_domain: >-
+ {{ sno_cluster_domain
+ if (sno_cluster_domain is defined and sno_cluster_domain)
+ else (detected_domain.stdout | trim
+ if (detected_domain.stdout is defined and detected_domain.stdout | trim)
+ else 'ostest.test.metalkube.org') }}
+
+ tasks:
+ - name: Run mutable topology cleanup
+ import_role:
+ name: mutable-topology/sno-to-3node
+ tasks_from: clean.yml
+
+ - name: Cleanup complete
+ ansible.builtin.debug:
+ msg: "Mutable topology cleanup complete. master-1 and master-2 VMs have been removed."
diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/.gitignore b/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/.gitignore
index 6ad1577e..0818832e 100644
--- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/.gitignore
+++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/.gitignore
@@ -3,3 +3,4 @@ ci_token
clusterbot-ci_token
config_arbiter.sh
config_fencing.sh
+config_sno.sh
\ No newline at end of file
diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/config_sno_example.sh b/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/config_sno_example.sh
new file mode 100644
index 00000000..f690e6f8
--- /dev/null
+++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/config_sno_example.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+
+# Please copy one of the config values below for IPI or Agent based installs into your
+# config.
+# BEGIN IPI Specific Install Config Variables
+export IP_STACK="v4"
+export NUM_WORKERS=0
+export MASTER_MEMORY=32768
+export MASTER_DISK=100
+export MASTER_VCPU=4
+export NUM_MASTERS=1
+## END IPI Specific Install Config Variables
+
+## BEGIN Agent Specific Install Config Variables
+export AGENT_E2E_TEST_SCENARIO="SNO_IPV4"
+# Sets the install-config.yaml's platform type.
+# The default is 'baremetal'.
+# See https://github.com/openshift-metal3/dev-scripts/blob/master/config_example.sh for more details on this variable and its effects.
+#export AGENT_PLATFORM_TYPE=none
+## END Agent Specific Install Config Variables
+####
+
+# TechPreview FeatureSet not needed for 4.20 and above OCP
+# export FEATURE_SET="TechPreviewNoUpgrade"
+export OPENSHIFT_CI="true"
+
+# OPENSHIFT_CI="true" (exported above) lets you avoid using the CI_TOKEN, but it has side effects.
+# You can read more on this here: https://github.com/openshift-metal3/dev-scripts/blob/3f070cfd36977381a186cadfb44887856d652bed/config_example.sh#L21
+# If those side effects are a problem, comment out the export above (you will presumably need a CI_TOKEN instead).
+
+# You can find the latest public images in https://quay.io/repository/openshift-release-dev/ocp-release?tab=tags
+# and select your preferred version. Public sources can be found at https://mirror.openshift.com/pub/openshift-v4/
+
+export OPENSHIFT_RELEASE_IMAGE=quay.io/openshift-release-dev/ocp-release:4.21.0-x86_64
+# Unless you need to override the installer image, this is not needed
+# export OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE=""
+
+# Disable sigstore image verification during installation
+export OPENSHIFT_INSTALL_EXPERIMENTAL_DISABLE_IMAGE_POLICY=true
diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml b/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml
index 4f3761e0..d0a6ea98 100644
--- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml
+++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml
@@ -12,6 +12,11 @@
- kubeconfig_path is defined
- kubeconfig_stat.stat.exists | default(false)
changed_when: false
+ register: oc_project_result
+ retries: 5
+ delay: 15
+ until: oc_project_result.rc == 0
+ failed_when: false
listen: Set OCP project
- name: Warn about missing kubeconfig
diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml b/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml
index 84114407..c973118b 100644
--- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml
+++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml
@@ -40,6 +40,7 @@
expected_prefix:
arbiter: "TNA"
fencing: "TNF"
+ sno: "SNO"
fail:
msg: >-
Config file {{ config_file[method] }} has AGENT_E2E_TEST_SCENARIO="{{ config_scenario }}"
diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/vars/main.yml b/deploy/openshift-clusters/roles/dev-scripts/install-dev/vars/main.yml
index 35a6bad0..9d2fb7cf 100644
--- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/vars/main.yml
+++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/vars/main.yml
@@ -8,6 +8,7 @@ supported_methods:
supported_topologies:
- arbiter
- fencing
+ - sno
config_file:
ipi: config_{{topology}}.sh
agent: config_{{topology}}.sh
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/defaults/main.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/defaults/main.yml
new file mode 100644
index 00000000..202f3a77
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/defaults/main.yml
@@ -0,0 +1,38 @@
+---
+# Cluster identity (auto-detected from cluster if not set)
+sno_cluster_name: ostest
+sno_cluster_domain: ""
+sno_infra_id: ""
+
+# Existing master-0 (auto-detected from cluster)
+sno_master0_ip: ""
+
+# New node IPs (static assignments within the dev-scripts DHCP range)
+sno_master1_ip: "192.168.111.21"
+sno_master2_ip: "192.168.111.22"
+
+# VM specs
+sno_vm_vcpus: 6
+sno_vm_ram_mb: 16384
+sno_vm_disk_gb: 50
+
+# Libvirt network (dev-scripts baremetal network)
+sno_libvirt_network: ostestbm
+sno_libvirt_bridge: ostestbm
+
+# RHCOS live ISO path on hypervisor (auto-detected from release image if empty)
+sno_rhcos_live_iso: ""
+
+# Timeouts
+sno_mco_timeout_minutes: 45
+sno_node_join_timeout_minutes: 20
+sno_etcd_timeout_minutes: 15
+
+# Auto-fix MCO drain deadlock during topology transition
+sno_auto_fix_drain: true
+
+# Paths (override if dev-scripts is in a non-standard location)
+sno_kubeconfig: ""
+
+# VM image directory
+sno_vm_image_dir: "/var/lib/libvirt/images"
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/boot-nodes.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/boot-nodes.yml
new file mode 100644
index 00000000..963943d7
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/boot-nodes.yml
@@ -0,0 +1,131 @@
+---
+- name: "[boot] Check if RHCOS live ISO exists on hypervisor"
+ stat:
+ path: "/var/lib/libvirt/images/rhcos-live.iso"
+ register: iso_stat
+
+- name: "[boot] Find RHCOS live ISO from dev-scripts cache"
+ shell: |
+ DEVSCRIPTS_ISO=$(find /var/lib/libvirt/images -name 'rhcos-*-live*.iso' 2>/dev/null | head -1)
+ if [ -n "$DEVSCRIPTS_ISO" ]; then
+ echo "Using existing ISO: $DEVSCRIPTS_ISO"
+ sudo ln -sf "$DEVSCRIPTS_ISO" /var/lib/libvirt/images/rhcos-live.iso
+ exit 0
+ fi
+
+ CACHE_ISO=$(find {{ sno_dev_scripts_path }}/ -name 'rhcos-*-live*.iso' 2>/dev/null | head -1)
+ if [ -n "$CACHE_ISO" ]; then
+ echo "Using dev-scripts cached ISO: $CACHE_ISO"
+ sudo ln -sf "$CACHE_ISO" /var/lib/libvirt/images/rhcos-live.iso
+ exit 0
+ fi
+
+ echo "ERROR: No RHCOS live ISO found. Set sno_rhcos_live_iso variable."
+ exit 1
+ when: not iso_stat.stat.exists and sno_rhcos_live_iso == ""
+
+- name: "[boot] Set ISO path"
+ set_fact:
+ sno_iso_path: "{{ sno_rhcos_live_iso if sno_rhcos_live_iso else '/var/lib/libvirt/images/rhcos-live.iso' }}"
+
+- name: "[boot] Read master.ign content"
+ slurp:
+ src: /tmp/master.ign
+ register: master_ign_content
+
+- name: "[boot] Set base64-encoded master.ign"
+ set_fact:
+ sno_master_ign_b64: "{{ master_ign_content.content }}"
+
+- name: "[boot] Generate auto-install ignition"
+ template:
+ src: auto-install.ign.j2
+ dest: /tmp/auto-install.ign
+ mode: '0644'
+
+- name: "[boot] Create per-node ISO with embedded ignition"
+ shell: |
+ sudo cp {{ sno_iso_path }} /var/lib/libvirt/images/rhcos-{{ item.hostname }}.iso
+ sudo coreos-installer iso ignition embed -i /tmp/auto-install.ign /var/lib/libvirt/images/rhcos-{{ item.hostname }}.iso -f
+ loop: "{{ sno_new_nodes }}"
+
+- name: "[boot] Boot each VM with ignition-embedded ISO"  # runs once per entry in sno_new_nodes
+  shell: |
+    VM_NAME="{{ item.name }}"
+    MAC="{{ sno_node_macs[item.hostname] }}"
+    # Drop any existing domain of the same name first; failures are ignored on purpose.
+    sudo virsh destroy "$VM_NAME" 2>/dev/null || true
+    sudo virsh undefine "$VM_NAME" --nvram 2>/dev/null || true
+    # --nvram above: virsh refuses to undefine a UEFI domain that still owns an NVRAM file.
+    sudo virt-install \
+      --name "$VM_NAME" \
+      --ram {{ sno_vm_ram_mb }} \
+      --vcpus {{ sno_vm_vcpus }} \
+      --disk {{ sno_vm_image_dir }}/${VM_NAME}.qcow2,bus=virtio \
+      --network network={{ sno_libvirt_network }},model=virtio,mac=${MAC} \
+      --cdrom /var/lib/libvirt/images/rhcos-{{ item.hostname }}.iso \
+      --os-variant rhel9.0 \
+      --graphics none \
+      --noautoconsole \
+      --boot loader=/usr/share/edk2/ovmf/OVMF_CODE.fd,loader_ro=yes,loader_type=pflash,nvram_template=/usr/share/edk2/ovmf/OVMF_VARS.fd,loader_secure=no \
+      --boot hd,cdrom \
+      --tpm none
+  loop: "{{ sno_new_nodes }}"
+
+- name: "[boot] Verify VMs are running (initial install boot)"
+ shell: |
+ sudo virsh domstate {{ item.name }}
+ register: vm_state
+ loop: "{{ sno_new_nodes }}"
+ changed_when: false
+ failed_when: "'running' not in vm_state.stdout"
+
+- name: "[boot] Wait for coreos-installer to complete (VM will power off)"
+ # coreos-installer runs ExecStartPost=systemctl reboot, but RHCOS live issues
+ # an ACPI poweroff rather than a reset. libvirt fires on_poweroff=destroy so
+ # the VM shuts off. We poll until shut off, then boot from disk below.
+ shell: |
+ for i in $(seq 50); do
+ STATE=$(sudo virsh domstate {{ item.name }} 2>/dev/null || echo "unknown")
+ if echo "$STATE" | grep -q "shut off"; then
+ echo "{{ item.name }} shut off after $((i * 20))s - install complete"
+ exit 0
+ fi
+ sleep 20
+ done
+ echo "Timeout: {{ item.name }} did not shut off within 1000s"
+ exit 1
+ loop: "{{ sno_new_nodes }}"
+ changed_when: false
+
+- name: "[boot] Remove CDROM from boot order after install"
+  # Prevent coreos-installer loop: delete the <boot dev='cdrom'/> line from the
+  # dumped domain XML and redefine it, so UEFI only tries the hard disk afterwards.
+  shell: |
+    TMPXML=$(mktemp /tmp/vm-XXXXXX.xml)
+    sudo virsh dumpxml {{ item.name }} > "$TMPXML"
+    sudo sed -i "/<boot dev='cdrom'\/>/d" "$TMPXML"
+    sudo virsh define "$TMPXML"
+    sudo rm -f "$TMPXML"
+  loop: "{{ sno_new_nodes }}"
+  changed_when: true
+
+- name: "[boot] Start VMs to boot from installed RHCOS"
+ shell: |
+ sudo virsh start {{ item.name }}
+ loop: "{{ sno_new_nodes }}"
+ changed_when: true
+
+- name: "[boot] Verify VMs are running from installed disk"
+ shell: |
+ sudo virsh domstate {{ item.name }}
+ register: vm_state_disk
+ loop: "{{ sno_new_nodes }}"
+ changed_when: false
+ failed_when: "'running' not in vm_state_disk.stdout"
+
+- name: "[boot] VMs booted with RHCOS"
+ debug:
+ msg: >-
+ {{ sno_new_nodes | length }} VMs installed and started from disk.
+ Waiting for nodes to join the cluster...
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/clean.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/clean.yml
new file mode 100644
index 00000000..5c7d4112
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/clean.yml
@@ -0,0 +1,124 @@
+---
+- name: "[clean] Stop and undefine new VMs"  # best-effort: a VM that is already gone must not fail the run
+  shell: |  # --nvram is required or undefine refuses UEFI domains that own an NVRAM file
+    sudo virsh destroy {{ item.name }} 2>/dev/null || true
+    sudo virsh undefine {{ item.name }} --nvram --remove-all-storage 2>/dev/null || true
+  loop: "{{ sno_new_nodes }}"
+  failed_when: false
+  changed_when: true
+
+- name: "[clean] Remove disk images"
+ file:
+ path: "{{ sno_vm_image_dir }}/{{ item.name }}.qcow2"
+ state: absent
+ loop: "{{ sno_new_nodes }}"
+ become: true
+ failed_when: false
+
+- name: "[clean] Remove NVRAM files"
+ file:
+ path: "/var/lib/libvirt/qemu/nvram/{{ item.name }}_VARS.fd"
+ state: absent
+ loop: "{{ sno_new_nodes }}"
+ become: true
+ failed_when: false
+
+- name: "[clean] Remove per-node RHCOS ISOs"
+ file:
+ path: "/var/lib/libvirt/images/rhcos-{{ item.hostname }}.iso"
+ state: absent
+ loop: "{{ sno_new_nodes }}"
+ become: true
+ failed_when: false
+
+- name: "[clean] Remove DHCP reservations from libvirt network"
+  shell: |
+    NET_XML=$(sudo virsh net-dumpxml {{ sno_libvirt_network }} 2>/dev/null || true)
+    EXISTING_MAC=$(echo "$NET_XML" | python3 -c "
+    import xml.etree.ElementTree as ET, sys
+    root = ET.parse(sys.stdin).getroot()
+    for host in root.findall('.//dhcp/host'):
+        if host.get('ip') == '{{ item.ip }}':
+            print(host.get('mac',''))
+    " 2>/dev/null || true)
+    # net-update needs the <host/> element itself; text not starting with '<' is read as a file name.
+    if [ -n "$EXISTING_MAC" ]; then
+      sudo virsh net-update {{ sno_libvirt_network }} delete ip-dhcp-host \
+        "<host mac='${EXISTING_MAC}' ip='{{ item.ip }}'/>" --live --config 2>/dev/null || true
+      echo "Removed DHCP reservation for {{ item.ip }} (mac=${EXISTING_MAC})"
+    else
+      echo "No DHCP reservation found for {{ item.ip }}"
+    fi
+  loop: "{{ sno_new_nodes }}"
+  failed_when: false
+  changed_when: true
+
+- name: "[clean] Remove DNS host entries from libvirt network"  # tries the FQDN form, then the short-name form
+  shell: |
+    sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \
+      "<host ip='{{ item.ip }}'><hostname>{{ item.hostname }}.{{ sno_cluster_domain }}</hostname></host>" \
+      --live --config 2>/dev/null || true
+    sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \
+      "<host ip='{{ item.ip }}'><hostname>{{ item.hostname }}</hostname></host>" \
+      --live --config 2>/dev/null || true
+  loop: "{{ sno_new_nodes }}"
+  failed_when: false  # a missing entry is not an error during cleanup
+  changed_when: true
+
+- name: "[clean] Remove api-int from libvirt network DNS"  # NOTE(review): matches by hostname only; confirm libvirt accepts a delete without ip
+  shell: |
+    sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \
+      "<host><hostname>api-int.{{ sno_cluster_domain }}</hostname></host>" \
+      --live --config 2>/dev/null || true
+  failed_when: false
+  changed_when: true
+
+- name: "[clean] Remove entries from addnhosts"
+ lineinfile:
+ path: "{{ sno_addnhosts_path }}"
+ regexp: "^{{ item.ip }}\\s"
+ state: absent
+ loop: "{{ sno_new_nodes }}"
+ become: true
+ failed_when: false
+
+- name: "[clean] Remove hostname entries from addnhosts"
+ lineinfile:
+ path: "{{ sno_addnhosts_path }}"
+ regexp: "\\s{{ item.hostname }}\\.{{ sno_cluster_domain }}(\\s|$)"
+ state: absent
+ loop: "{{ sno_new_nodes }}"
+ become: true
+ failed_when: false
+
+- name: "[clean] Flush DHCP leases for new node IPs"
+  shell: |  # rewrites the JSON lease file without the entries for sno_master1_ip / sno_master2_ip
+    python3 -c "
+    import json, os
+    lf = '{{ sno_lease_file }}'
+    if not os.path.exists(lf) or os.path.getsize(lf) == 0:
+        exit(0)
+    with open(lf) as f:
+        leases = json.load(f)
+    reserved = ['{{ sno_master1_ip }}', '{{ sno_master2_ip }}']  # single quotes only: a double quote would end the shell string
+    before = len(leases)
+    leases = [l for l in leases if l.get('ip-address') not in reserved]
+    with open(lf, 'w') as f:
+        json.dump(leases, f, indent=2)
+    print(f'Flushed {before - len(leases)} leases')
+    "
+  become: true
+  failed_when: false
+  changed_when: true
+
+- name: "[clean] Reload libvirt dnsmasq"
+ shell: |
+ sudo kill -HUP $(cat /run/libvirt/network/{{ sno_libvirt_bridge }}.pid) 2>/dev/null || true
+ changed_when: true
+ failed_when: false
+
+- name: "[clean] Cleanup complete"
+ debug:
+ msg: >-
+ Removed VMs: {{ sno_new_nodes | map(attribute='name') | list | join(', ') }}.
+ Disk images, NVRAM, ISOs, DHCP reservations, and DNS entries removed.
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/create-vms.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/create-vms.yml
new file mode 100644
index 00000000..1779afde
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/create-vms.yml
@@ -0,0 +1,162 @@
+---
+- name: "[create-vms] Cleanup stale VMs"  # leftovers from an earlier run; every step is best-effort
+  shell: |  # --nvram is required or undefine refuses UEFI domains that own an NVRAM file
+    sudo virsh destroy {{ item.name }} 2>/dev/null || true
+    sudo virsh undefine {{ item.name }} --nvram --remove-all-storage 2>/dev/null || true
+    sudo rm -f {{ sno_vm_image_dir }}/{{ item.name }}.qcow2
+  loop: "{{ sno_new_nodes }}"
+  failed_when: false
+  changed_when: true
+
+- name: "[create-vms] Remove stale DHCP entries from libvirt network"
+  shell: |
+    NET_XML=$(sudo virsh net-dumpxml {{ sno_libvirt_network }})
+    EXISTING_MAC=$(echo "$NET_XML" | python3 -c "
+    import xml.etree.ElementTree as ET, sys
+    root = ET.parse(sys.stdin).getroot()
+    for host in root.findall('.//dhcp/host'):
+        if host.get('ip') == '{{ item.ip }}':
+            print(host.get('mac',''))
+    " 2>/dev/null || true)
+    # net-update needs the <host/> element itself; text not starting with '<' is read as a file name.
+    if [ -n "$EXISTING_MAC" ]; then
+      sudo virsh net-update {{ sno_libvirt_network }} delete ip-dhcp-host \
+        "<host mac='${EXISTING_MAC}' ip='{{ item.ip }}'/>" --live --config 2>/dev/null || true
+    fi
+    # Also drop DNS host records for this node: FQDN form first, then short-name form.
+    sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \
+      "<host ip='{{ item.ip }}'><hostname>{{ item.hostname }}.{{ sno_cluster_domain }}</hostname></host>" \
+      --live --config 2>/dev/null || true
+    sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \
+      "<host ip='{{ item.ip }}'><hostname>{{ item.hostname }}</hostname></host>" \
+      --live --config 2>/dev/null || true
+  loop: "{{ sno_new_nodes }}"
+  failed_when: false
+  changed_when: true
+
+- name: "[create-vms] Clean stale entries from addnhosts"
+ lineinfile:
+ path: "{{ sno_addnhosts_path }}"
+ regexp: "^{{ item.ip }}\\s"
+ state: absent
+ loop: "{{ sno_new_nodes }}"
+ become: true
+ failed_when: false
+
+- name: "[create-vms] Also clean stale hostname entries from addnhosts"
+ lineinfile:
+ path: "{{ sno_addnhosts_path }}"
+ regexp: "\\s{{ item.hostname }}\\.{{ sno_cluster_domain }}(\\s|$)"
+ state: absent
+ loop: "{{ sno_new_nodes }}"
+ become: true
+ failed_when: false
+
+- name: "[create-vms] Flush stale DHCP leases"
+  shell: |  # rewrites the JSON lease file without the entries for sno_master1_ip / sno_master2_ip
+    python3 -c "
+    import json, os
+    lf = '{{ sno_lease_file }}'
+    if not os.path.exists(lf):
+        print('Lease file does not exist, skipping')
+        exit(0)
+    if os.path.getsize(lf) == 0:
+        print('Lease file is empty, skipping')
+        exit(0)
+    with open(lf) as f:
+        leases = json.load(f)
+    reserved = ['{{ sno_master1_ip }}', '{{ sno_master2_ip }}']  # single quotes only: a double quote would end the shell string
+    before = len(leases)
+    leases = [l for l in leases if l.get('ip-address') not in reserved]
+    after = len(leases)
+    with open(lf, 'w') as f:
+        json.dump(leases, f, indent=2)
+    print(f'Flushed {before - after} stale leases')
+    "
+  become: true
+  changed_when: true
+  failed_when: false
+
+- name: "[create-vms] Generate random MAC addresses"
+ shell: |
+ printf '52:54:00:%02x:%02x:%02x\n' $((RANDOM%256)) $((RANDOM%256)) $((RANDOM%256))
+ register: mac_gen
+ loop: "{{ sno_new_nodes }}"
+ changed_when: false
+
+- name: "[create-vms] Store MAC addresses"
+ set_fact:
+ sno_node_macs: "{{ sno_node_macs | default({}) | combine({item.item.hostname: item.stdout | trim}) }}"
+ loop: "{{ mac_gen.results }}"
+
+- name: "[create-vms] Display VM details"
+ debug:
+ msg: "{{ item.hostname }}: IP={{ item.ip }}, MAC={{ sno_node_macs[item.hostname] }}"
+ loop: "{{ sno_new_nodes }}"
+
+- name: "[create-vms] Create VM disk images"
+ shell: |
+ sudo qemu-img create -f qcow2 {{ sno_vm_image_dir }}/{{ item.name }}.qcow2 {{ sno_vm_disk_gb }}G
+ args:
+ creates: "{{ sno_vm_image_dir }}/{{ item.name }}.qcow2"
+ loop: "{{ sno_new_nodes }}"
+
+- name: "[create-vms] Define VMs"
+ shell: |
+ sudo virt-install \
+ --name {{ item.name }} \
+ --ram {{ sno_vm_ram_mb }} \
+ --vcpus {{ sno_vm_vcpus }} \
+ --disk {{ sno_vm_image_dir }}/{{ item.name }}.qcow2,bus=virtio \
+ --network network={{ sno_libvirt_network }},model=virtio,mac={{ sno_node_macs[item.hostname] }} \
+ --os-variant rhel9.0 \
+ --graphics none \
+ --noautoconsole \
+ --boot loader=/usr/share/edk2/ovmf/OVMF_CODE.fd,loader_ro=yes,loader_type=pflash,nvram_template=/usr/share/edk2/ovmf/OVMF_VARS.fd,loader_secure=no \
+ --boot hd \
+ --tpm none \
+ --noreboot \
+ --import \
+ --print-xml | sudo virsh define /dev/stdin
+ loop: "{{ sno_new_nodes }}"
+
+- name: "[create-vms] Add DHCP reservations to libvirt network"  # binds each generated MAC to the node's static IP
+  shell: |
+    sudo virsh net-update {{ sno_libvirt_network }} add ip-dhcp-host \
+      "<host mac='{{ sno_node_macs[item.hostname] }}' name='{{ item.hostname }}' ip='{{ item.ip }}'/>" \
+      --live --config
+  loop: "{{ sno_new_nodes }}"
+
+- name: "[create-vms] Add DNS entries to libvirt network"  # one <host> record per new node, keyed by its static IP
+  shell: |
+    sudo virsh net-update {{ sno_libvirt_network }} add dns-host \
+      "<host ip='{{ item.ip }}'><hostname>{{ item.hostname }}.{{ sno_cluster_domain }}</hostname></host>" \
+      --live --config
+  loop: "{{ sno_new_nodes }}"
+  failed_when: false  # tolerated so an already-present record does not abort the run
+
+- name: "[create-vms] Add entries to addnhosts"
+ lineinfile:
+ path: "{{ sno_addnhosts_path }}"
+ line: "{{ item.ip }} {{ item.hostname }}.{{ sno_cluster_domain }} {{ item.hostname }}"
+ regexp: "^{{ item.ip }}\\s"
+ state: present
+ loop: "{{ sno_new_nodes }}"
+ become: true
+
+- name: "[create-vms] Reload libvirt dnsmasq"
+ shell: |
+ sudo kill -HUP $(cat /run/libvirt/network/{{ sno_libvirt_bridge }}.pid) 2>/dev/null || true
+ changed_when: true
+
+- name: "[create-vms] Verify DNS for new nodes"
+ shell: |
+ dig +short {{ item.hostname }}.{{ sno_cluster_domain }} @192.168.111.1
+ register: dns_verify
+ changed_when: false
+ failed_when: dns_verify.stdout | trim == ""
+ loop: "{{ sno_new_nodes }}"
+
+- name: "[create-vms] VMs created and DNS configured"
+ debug:
+ msg: "{{ sno_new_nodes | length }} VMs created with DNS entries."
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/ignition.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/ignition.yml
new file mode 100644
index 00000000..5ed52064
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/ignition.yml
@@ -0,0 +1,32 @@
+---
+- name: "[ignition] Extract MCS CA certificate"
+ shell: |
+ oc get configmap -n openshift-machine-config-operator machine-config-server-ca -o jsonpath='{.data.ca-bundle\.crt}'
+ environment: "{{ sno_oc_env }}"
+ register: mcs_ca_raw
+ changed_when: false
+
+- name: "[ignition] Base64-encode MCS CA"
+ set_fact:
+ sno_mcs_ca_b64: "{{ mcs_ca_raw.stdout | b64encode }}"
+
+- name: "[ignition] Generate master.ign from template"
+ template:
+ src: master.ign.j2
+ dest: /tmp/master.ign
+ mode: '0644'
+
+- name: "[ignition] Verify MCS reachability"
+ shell: |
+ curl -sk https://api-int.{{ sno_cluster_domain }}:22623/healthz
+ register: mcs_health
+ changed_when: false
+ failed_when: false
+
+- name: "[ignition] Display MCS status"
+  debug:  # default(..., true) also replaces an empty string, which is what a failed curl leaves in stdout
+    msg: "MCS healthz: {{ mcs_health.stdout | default('unreachable', true) }}"
+
+- name: "[ignition] Ignition config generated"
+ debug:
+ msg: "master.ign written to /tmp/master.ign for domain {{ sno_cluster_domain }}"
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/main.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/main.yml
new file mode 100644
index 00000000..2fb77402
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/main.yml
@@ -0,0 +1,30 @@
+---
+- name: "Phase 0: Preflight checks"
+ import_tasks: preflight.yml
+
+- name: "Phase 1: Generate ignition config"
+ import_tasks: ignition.yml
+
+- name: "Phase 2: Create new VMs"
+ import_tasks: create-vms.yml
+
+- name: "Phase 3: Boot nodes with RHCOS"
+ import_tasks: boot-nodes.yml
+
+- name: "Phase 4: Wait for nodes to join"
+ import_tasks: wait-nodes.yml
+
+- name: "Phase 5: Wait for etcd scaling"
+ import_tasks: wait-etcd.yml
+
+- name: "Phase 6: Topology transition"
+ import_tasks: topology.yml
+
+- name: "Phase 7: MCO rollout"
+ import_tasks: mco-rollout.yml
+
+- name: "Phase 8: Update DNS for all nodes"
+ import_tasks: update-dns.yml
+
+- name: "Phase 9: Final verification"
+ import_tasks: verify.yml
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/mco-rollout.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/mco-rollout.yml
new file mode 100644
index 00000000..77772746
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/mco-rollout.yml
@@ -0,0 +1,82 @@
+---
+- name: "[mco] Display MCO rollout monitoring start"
+ debug:
+ msg: "Beginning MCO rollout monitoring (timeout: {{ sno_mco_timeout_minutes }} min)"
+
+- name: "[mco] Poll MCP master and auto-fix drain deadlock"
+ shell: |
+ MCP_JSON=$(oc get mcp master -o json 2>/dev/null) || {
+ echo "STATUS=api_unreachable"
+ echo "RESULT=pending"
+ exit 0
+ }
+
+ UPDATED=$(echo "$MCP_JSON" | python3 -c "
+ import json,sys
+ data=json.load(sys.stdin)
+ conds={c['type']:c['status'] for c in data.get('status',{}).get('conditions',[])}
+ print(conds.get('Updated','Unknown'))
+ ")
+ UPDATING=$(echo "$MCP_JSON" | python3 -c "
+ import json,sys
+ data=json.load(sys.stdin)
+ conds={c['type']:c['status'] for c in data.get('status',{}).get('conditions',[])}
+ print(conds.get('Updating','Unknown'))
+ ")
+ DEGRADED=$(echo "$MCP_JSON" | python3 -c "
+ import json,sys
+ data=json.load(sys.stdin)
+ conds={c['type']:c['status'] for c in data.get('status',{}).get('conditions',[])}
+ print(conds.get('Degraded','Unknown'))
+ ")
+
+ CORDONED_NODES=$(oc get nodes --no-headers 2>/dev/null | grep SchedulingDisabled | awk '{print $1}' || true)
+
+ echo "STATUS=updated:${UPDATED},updating:${UPDATING},degraded:${DEGRADED},cordoned:${CORDONED_NODES:-none}"
+
+ if [ "$UPDATED" = "True" ] && [ "$UPDATING" = "False" ] && [ "$DEGRADED" = "False" ]; then
+ echo "RESULT=done"
+ exit 0
+ fi
+
+ # Auto-fix drain deadlock for any cordoned node.
+ # On SNO→3node, MCD cordons a node before draining. If MCC cannot reach
+ # the node (api-int flap, RBAC timing) the drain annotation never advances
+ # and the node stays cordoned forever. Fix: uncordon, set lastAppliedDrain
+ # to the drain- value MCC is waiting for, restart MCC.
+ if [ -n "$CORDONED_NODES" ] && [ "{{ sno_auto_fix_drain | default(true) | bool }}" = "True" ]; then
+ DESIRED=$(oc get mcp master -o jsonpath='{.spec.configuration.name}' 2>/dev/null || echo "")
+ for NODE_NAME in $CORDONED_NODES; do
+ echo "DRAIN_FIX: Fixing drain deadlock on $NODE_NAME (desired=${DESIRED})..."
+
+ oc adm uncordon "$NODE_NAME" 2>&1 || true
+
+ if [ -n "$DESIRED" ]; then
+ oc annotate node "$NODE_NAME" \
+ "machineconfiguration.openshift.io/lastAppliedDrain=drain-${DESIRED}" \
+ --overwrite 2>&1 || true
+ echo "DRAIN_FIX: Set lastAppliedDrain=drain-${DESIRED} on $NODE_NAME"
+ fi
+ done
+
+ oc delete pod -n openshift-machine-config-operator \
+ -l k8s-app=machine-config-controller 2>&1 || true
+ echo "DRAIN_FIX: Restarted MCC pod"
+ fi
+
+ echo "RESULT=pending"
+ environment: "{{ sno_oc_env }}"
+ register: mcp_poll
+ until: "'RESULT=done' in mcp_poll.stdout"
+ retries: "{{ (sno_mco_timeout_minutes | int * 60 / 30) | int }}"
+ delay: 30
+ changed_when: false
+ failed_when: false
+
+- name: "[mco] MCO rollout result"
+ debug:
+ msg: >-
+ {{ 'MCO rollout completed successfully.'
+ if 'RESULT=done' in mcp_poll.stdout
+ else 'MCO rollout did not complete within timeout. Continuing anyway.' }}
+
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/preflight.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/preflight.yml
new file mode 100644
index 00000000..cf599701
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/preflight.yml
@@ -0,0 +1,158 @@
+---
+# Fetch the cluster Infrastructure CR as JSON and parse it once into
+# sno_infra_data; the tasks that follow read the platform type, topology,
+# infrastructure name and api-int URI from that fact.
+- name: "[preflight] Query Infrastructure CR"
+ shell: |
+ oc get infrastructure cluster -o json
+ environment: "{{ sno_oc_env }}"
+ register: infra_cr_raw
+ changed_when: false
+
+- name: "[preflight] Parse infrastructure data"
+ set_fact:
+ sno_infra_data: "{{ infra_cr_raw.stdout | from_json }}"
+
+# Derive cluster identity facts from the Infrastructure CR.
+# sno_infra_id and sno_cluster_domain keep a caller-supplied (truthy) value and
+# are only auto-detected otherwise. The domain is the api-int URI with its
+# "https://api-int." prefix and ":6443" suffix stripped.
+# NOTE(review): both variables must already be defined (presumably defaulted to
+# an empty string in role defaults) for the inline "if" to evaluate - confirm.
+- name: "[preflight] Extract cluster identity"
+ set_fact:
+ sno_platform: "{{ sno_infra_data.spec.platformSpec.type }}"
+ sno_cp_topology: "{{ sno_infra_data.status.controlPlaneTopology }}"
+ sno_infra_topology: "{{ sno_infra_data.status.infrastructureTopology }}"
+ sno_infra_id: "{{ sno_infra_id if sno_infra_id else sno_infra_data.status.infrastructureName }}"
+ sno_cluster_domain: >-
+ {{ sno_cluster_domain if sno_cluster_domain
+ else sno_infra_data.status.apiServerInternalURI
+ | regex_replace('^https://api-int\.', '')
+ | regex_replace(':6443$', '') }}
+
+# Guard rails: the transition only proceeds on a platform "None" cluster.
+- name: "[preflight] Assert platform is None"
+ assert:
+ that: sno_platform == "None"
+ fail_msg: "Expected platform None, got {{ sno_platform }}"
+
+# The SingleReplica check is skipped when sno_skip_topology_check is true.
+- name: "[preflight] Assert topology is SingleReplica"
+ assert:
+ that: sno_cp_topology == "SingleReplica"
+ fail_msg: "Expected SingleReplica topology, got {{ sno_cp_topology }}. Cluster may already be HA."
+ when: not (sno_skip_topology_check | default(false) | bool)
+
+
+# Log the detected identity so the operator can confirm the target cluster.
+- name: "[preflight] Display detected cluster info"
+ debug:
+ msg: >-
+ Cluster: {{ sno_cluster_name }} | Domain: {{ sno_cluster_domain }} |
+ InfraID: {{ sno_infra_id }} | Platform: {{ sno_platform }} |
+ Topology: {{ sno_cp_topology }}
+
+# Load the node list into sno_nodes and require that the cluster has exactly
+# one node before continuing.
+- name: "[preflight] Get cluster nodes"
+ shell: |
+ oc get nodes -o json
+ environment: "{{ sno_oc_env }}"
+ register: nodes_raw
+ changed_when: false
+
+- name: "[preflight] Parse node data"
+ set_fact:
+ sno_nodes: "{{ (nodes_raw.stdout | from_json)['items'] }}"
+
+- name: "[preflight] Assert exactly 1 node exists"
+ assert:
+ that: sno_nodes | length == 1
+ fail_msg: "Expected 1 node for SNO, found {{ sno_nodes | length }}"
+
+# Keep a caller-supplied sno_master0_ip; otherwise use the first InternalIP
+# address reported in the single node's status.
+- name: "[preflight] Auto-detect master-0 IP"
+ set_fact:
+ sno_master0_ip: >-
+ {{ sno_master0_ip if sno_master0_ip
+ else (sno_nodes[0].status.addresses
+ | selectattr('type', 'equalto', 'InternalIP')
+ | map(attribute='address') | first) }}
+
+- name: "[preflight] Display master-0 IP"
+ debug:
+ msg: "master-0 IP: {{ sno_master0_ip }}"
+
+# Fails the play unless the node's Ready condition has status 'True'.
+- name: "[preflight] Check node is Ready"
+ assert:
+ that: >-
+ sno_nodes[0].status.conditions
+ | selectattr('type', 'equalto', 'Ready')
+ | map(attribute='status') | first == 'True'
+ fail_msg: "master-0 is not Ready"
+
+# Informational only: list etcd pods with per-container readiness. stderr is
+# discarded and "|| true" forces success, so this never fails the play.
+- name: "[preflight] Check etcd pods"
+ shell: |
+ oc get pods -n openshift-etcd -l app=etcd --no-headers \
+ -o custom-columns=NAME:.metadata.name,READY:.status.containerStatuses[*].ready 2>/dev/null || true
+ environment: "{{ sno_oc_env }}"
+ register: etcd_pods_raw
+ changed_when: false
+
+- name: "[preflight] Display etcd status"
+ debug:
+ msg: "{{ etcd_pods_raw.stdout }}"
+
+# Scan all ClusterOperators for a Degraded=True condition. The inline Python
+# exits 1 when any are degraded; failed_when: false turns that into a
+# non-fatal signal that the warning task below reads through co_check.rc.
+- name: "[preflight] Check for degraded cluster operators"
+ shell: |
+ oc get co -o json | python3 -c "
+ import json, sys
+ data = json.load(sys.stdin)
+ degraded = []
+ for co in data['items']:
+ name = co['metadata']['name']
+ for cond in co.get('status', {}).get('conditions', []):
+ if cond['type'] == 'Degraded' and cond['status'] == 'True':
+ degraded.append(name)
+ if degraded:
+ print('Degraded COs: ' + ', '.join(degraded))
+ sys.exit(1)
+ print('All cluster operators healthy')
+ "
+ environment: "{{ sno_oc_env }}"
+ register: co_check
+ changed_when: false
+ failed_when: false
+
+- name: "[preflight] Display CO status"
+ debug:
+ msg: "{{ co_check.stdout }}"
+
+# Degraded operators are reported but do not block the transition.
+- name: "[preflight] Warn if COs degraded"
+ debug:
+ msg: "WARNING: Some COs are degraded. Proceeding anyway."
+ when: co_check.rc != 0
+
+# Resolve api-int against the resolver at 192.168.111.1. The result is never
+# fatal here; it only drives the "when" conditions of the two tasks that follow.
+# NOTE(review): 192.168.111.1 is hard-coded - presumably the libvirt network's
+# dnsmasq address on the hypervisor; confirm it matches sno_libvirt_network.
+- name: "[preflight] Verify api-int DNS on hypervisor"
+ shell: |
+ dig +short api-int.{{ sno_cluster_domain }} @192.168.111.1
+ register: dns_apiint
+ changed_when: false
+ failed_when: false
+
+- name: "[preflight] Add api-int to libvirt network DNS"
+  # Using virsh net-update (not lineinfile) so that api-int survives addnhosts
+  # regeneration when create-vms.yml calls virsh net-update for master-1/2.
+  # libvirt regenerates ostestbm.addnhosts from the network XML on every net-update.
+  #
+  # virsh net-update takes the dns-host section as an XML <host> element (or a
+  # file containing one), not a bare hostname. The add must carry the IP the
+  # name resolves to (master-0); the delete matches on hostname alone so a
+  # stale record pointing at a different IP is removed as well.
+  # Runs only when api-int does not already resolve to sno_master0_ip.
+  shell: |
+    # Remove any stale api-int entry (idempotent, ignore errors)
+    sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \
+      "<host><hostname>api-int.{{ sno_cluster_domain }}</hostname></host>" \
+      --live --config 2>/dev/null || true
+    # Add the correct entry - triggers addnhosts regeneration with api-int included
+    sudo virsh net-update {{ sno_libvirt_network }} add dns-host \
+      "<host ip='{{ sno_master0_ip }}'><hostname>api-int.{{ sno_cluster_domain }}</hostname></host>" \
+      --live --config
+  when: dns_apiint.stdout | trim == "" or sno_master0_ip not in dns_apiint.stdout
+  changed_when: true
+
+# Re-check api-int resolution after the libvirt DNS update. Runs only when the
+# update task ran (same "when" condition); retries up to 3 times, 2 seconds
+# apart, and fails if the answer is still empty.
+- name: "[preflight] Verify api-int DNS after fix"
+ shell: |
+ dig +short api-int.{{ sno_cluster_domain }} @192.168.111.1
+ register: dns_apiint_verify
+ changed_when: false
+ failed_when: dns_apiint_verify.stdout | trim == ""
+ retries: 3
+ delay: 2
+ until: dns_apiint_verify.stdout | trim != ""
+ when: dns_apiint.stdout | trim == "" or sno_master0_ip not in dns_apiint.stdout
+
+- name: "[preflight] Preflight checks passed"
+ debug:
+ msg: "All preflight checks passed. Ready for topology transition."
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/topology.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/topology.yml
new file mode 100644
index 00000000..4b99f1db
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/topology.yml
@@ -0,0 +1,38 @@
+---
+# Stop the cluster-version-operator by scaling its Deployment to zero before
+# the Infrastructure status is patched later in this file.
+# NOTE(review): presumably done so CVO does not interfere with the topology
+# change - confirm. Nothing in this file scales CVO back up.
+- name: "[topology] Scale CVO to 0 replicas"
+ shell: |
+ oc scale deployment cluster-version-operator -n openshift-cluster-version --replicas=0
+ environment: "{{ sno_oc_env }}"
+
+# Poll the CVO Deployment until it reports no replicas (30 tries, 5 s apart).
+# An empty jsonpath result (status.replicas absent) converts to 0 via "| int",
+# so it counts as scaled down. A failed "oc get" is NOT treated as 0 replicas:
+# "until" also requires rc == 0, so API errors are retried and the task fails
+# if they persist, instead of silently passing the wait.
+- name: "[topology] Wait for CVO to scale down"
+  shell: |
+    oc get deployment cluster-version-operator -n openshift-cluster-version \
+      -o jsonpath='{.status.replicas}'
+  environment: "{{ sno_oc_env }}"
+  register: cvo_replicas
+  until: cvo_replicas.rc == 0 and (cvo_replicas.stdout | trim | int == 0)
+  retries: 30
+  delay: 5
+  changed_when: false
+
+- name: "[topology] CVO scaled down"
+ debug:
+ msg: "CVO scaled to 0 replicas"
+
+# Merge-patch the status subresource of the Infrastructure CR so both
+# controlPlaneTopology and infrastructureTopology read HighlyAvailable.
+- name: "[topology] Patch Infrastructure CR - topology to HighlyAvailable"
+ shell: |
+ oc patch infrastructure cluster --type merge --subresource status \
+ -p '{"status":{"controlPlaneTopology":"HighlyAvailable","infrastructureTopology":"HighlyAvailable"}}'
+ environment: "{{ sno_oc_env }}"
+
+# Read the value back and fail unless controlPlaneTopology is HighlyAvailable.
+- name: "[topology] Verify topology patch"
+ shell: |
+ oc get infrastructure cluster -o jsonpath='{.status.controlPlaneTopology}'
+ environment: "{{ sno_oc_env }}"
+ register: topo_verify
+ changed_when: false
+ failed_when: topo_verify.stdout != "HighlyAvailable"
+
+- name: "[topology] Topology transition complete"
+ debug:
+ msg: "Topology changed to HighlyAvailable. MCO rollout will begin."
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/update-dns.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/update-dns.yml
new file mode 100644
index 00000000..bcfba13e
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/update-dns.yml
@@ -0,0 +1,75 @@
+---
+# Collect every node's InternalIP address (one per output line) and store the
+# list in sno_all_node_ips for the dnsmasq configuration built below.
+- name: "[dns] Get all node InternalIPs"
+ shell: |
+ oc get nodes -o json | python3 -c "
+ import json, sys
+ data = json.load(sys.stdin)
+ for node in data['items']:
+ for addr in node['status']['addresses']:
+ if addr['type'] == 'InternalIP':
+ print(addr['address'])
+ "
+ environment: "{{ sno_oc_env }}"
+ register: all_node_ips_raw
+ changed_when: false
+
+- name: "[dns] Store node IPs"
+ set_fact:
+ sno_all_node_ips: "{{ all_node_ips_raw.stdout_lines }}"
+
+- name: "[dns] Display node IPs for DNS"
+ debug:
+ msg: "Node IPs: {{ sno_all_node_ips | join(', ') }}"
+
+# Render the dnsmasq snippet: one address= record per node IP for both api and
+# api-int, plus a single *.apps wildcard record pointing at the first node IP.
+- name: "[dns] Build NM dnsmasq config"
+ set_fact:
+ sno_dnsmasq_lines: |
+ {% for ip in sno_all_node_ips %}
+ address=/api.{{ sno_cluster_domain }}/{{ ip }}
+ address=/api-int.{{ sno_cluster_domain }}/{{ ip }}
+ {% endfor %}
+ address=/.apps.{{ sno_cluster_domain }}/{{ sno_all_node_ips[0] }}
+
+# Write the snippet to sno_nm_dnsmasq_conf (needs root).
+- name: "[dns] Write NM dnsmasq config"
+ copy:
+ content: "{{ sno_dnsmasq_lines }}"
+ dest: "{{ sno_nm_dnsmasq_conf }}"
+ mode: '0644'
+ become: true
+
+# Remove /etc/hosts lines matching apps.<cluster domain> (domain regex-escaped).
+- name: "[dns] Clean stale apps entries from /etc/hosts"
+ lineinfile:
+ path: /etc/hosts
+ regexp: "apps\\.{{ sno_cluster_domain | regex_escape() }}"
+ state: absent
+ become: true
+
+# Restart NetworkManager so the new dnsmasq snippet is loaded, wait a fixed
+# 5 seconds, then check resolution.
+- name: "[dns] Restart NetworkManager to pick up dnsmasq changes"
+ systemd:
+ name: NetworkManager
+ state: restarted
+ become: true
+
+- name: "[dns] Wait for DNS to stabilize"
+ pause:
+ seconds: 5
+
+# Mandatory check: an empty answer for api.<domain> fails the play. It is a
+# single attempt with no retries.
+- name: "[dns] Verify API DNS resolution"
+ shell: |
+ dig +short api.{{ sno_cluster_domain }}
+ register: dns_api_verify
+ changed_when: false
+ failed_when: dns_api_verify.stdout | trim == ""
+
+# Best-effort check: the apps wildcard lookup never fails the play.
+- name: "[dns] Verify apps DNS resolution"
+ shell: |
+ dig +short test.apps.{{ sno_cluster_domain }}
+ register: dns_apps_verify
+ changed_when: false
+ failed_when: false
+
+- name: "[dns] DNS update complete"
+ debug:
+ msg: >-
+ API resolves to: {{ dns_api_verify.stdout_lines | join(', ') }}.
+ Apps resolves to: {{ dns_apps_verify.stdout_lines | default(['not yet']) | join(', ') }}.
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/verify.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/verify.yml
new file mode 100644
index 00000000..1e9d9c96
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/verify.yml
@@ -0,0 +1,111 @@
+---
+# Collect and print the post-transition state (nodes, topology, etcd pod count,
+# CVO replicas). These tasks only gather data; the assertions that consume
+# verify_nodes, verify_topo and verify_etcd come at the end of this file.
+- name: "[verify] Get node count and status"
+ shell: |
+ oc get nodes --no-headers
+ environment: "{{ sno_oc_env }}"
+ register: verify_nodes
+ changed_when: false
+
+- name: "[verify] Nodes"
+ debug:
+ msg: "{{ verify_nodes.stdout }}"
+
+- name: "[verify] Get topology"
+ shell: |
+ oc get infrastructure cluster -o jsonpath='{.status.controlPlaneTopology}'
+ environment: "{{ sno_oc_env }}"
+ register: verify_topo
+ changed_when: false
+
+- name: "[verify] Topology"
+ debug:
+ msg: "Control plane topology: {{ verify_topo.stdout }}"
+
+# Counts pods labelled app=etcd regardless of their phase or readiness.
+- name: "[verify] Get etcd member count"
+ shell: |
+ oc get pods -n openshift-etcd -l app=etcd --no-headers | wc -l
+ environment: "{{ sno_oc_env }}"
+ register: verify_etcd
+ changed_when: false
+
+- name: "[verify] etcd members"
+ debug:
+ msg: "etcd pods: {{ verify_etcd.stdout | trim }}"
+
+# Reads spec.replicas of the CVO Deployment; prints "unknown" if the query fails.
+- name: "[verify] Get CVO replica count"
+ shell: |
+ oc get deployment cluster-version-operator -n openshift-cluster-version \
+ -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "unknown"
+ environment: "{{ sno_oc_env }}"
+ register: verify_cvo
+ changed_when: false
+
+- name: "[verify] CVO status"
+ debug:
+ msg: "CVO replicas: {{ verify_cvo.stdout }} (expected: 0 - CVO intentionally scaled down)"
+
+# Summarise ClusterOperators: total count, those with Degraded=True and those
+# with Available=False. Purely informational - the script always exits 0.
+# chr(44) is a comma, so names are joined with "," and no space.
+# NOTE(review): presumably spelled as chr(44) to avoid a literal comma inside
+# the inline script - confirm.
+- name: "[verify] Get cluster operator summary"
+ shell: |
+ oc get co -o json | python3 -c "
+ import json, sys
+ data = json.load(sys.stdin)
+ total = len(data['items'])
+ degraded = []
+ unavailable = []
+ for co in data['items']:
+ name = co['metadata']['name']
+ conds = {c['type']: c['status'] for c in co.get('status', {}).get('conditions', [])}
+ if conds.get('Degraded') == 'True':
+ degraded.append(name)
+ if conds.get('Available') == 'False':
+ unavailable.append(name)
+ print(f'Total COs: {total}')
+ if degraded:
+ print(f'Degraded: {chr(44).join(degraded)}')
+ else:
+ print('Degraded: none')
+ if unavailable:
+ print(f'Unavailable: {chr(44).join(unavailable)}')
+ else:
+ print('Unavailable: none')
+ "
+ environment: "{{ sno_oc_env }}"
+ register: verify_co
+ changed_when: false
+
+- name: "[verify] Cluster operators"
+ debug:
+ msg: "{{ verify_co.stdout }}"
+
+- name: "[verify] Get MCP master status"
+ shell: |
+ oc get mcp master --no-headers
+ environment: "{{ sno_oc_env }}"
+ register: verify_mcp
+ changed_when: false
+
+- name: "[verify] MCP status"
+ debug:
+ msg: "{{ verify_mcp.stdout }}"
+
+# Final gates: at least 3 node lines (node status is not inspected here),
+# HighlyAvailable control-plane topology, and at least 3 etcd pods.
+- name: "[verify] Assert 3 nodes"
+ assert:
+ that: (verify_nodes.stdout_lines | length) >= 3
+ fail_msg: "Expected 3+ nodes, found {{ verify_nodes.stdout_lines | length }}"
+
+- name: "[verify] Assert HA topology"
+ assert:
+ that: verify_topo.stdout == "HighlyAvailable"
+ fail_msg: "Expected HighlyAvailable, got {{ verify_topo.stdout }}"
+
+- name: "[verify] Assert 3 etcd members"
+ assert:
+ that: verify_etcd.stdout | trim | int >= 3
+ fail_msg: "Expected 3+ etcd pods, found {{ verify_etcd.stdout | trim }}"
+
+- name: "[verify] SNO to 3-node transition verified successfully"
+ debug:
+ msg: >-
+ MIGRATION COMPLETE: {{ verify_nodes.stdout_lines | length }} nodes,
+ {{ verify_etcd.stdout | trim }} etcd members,
+ topology={{ verify_topo.stdout }}, CVO={{ verify_cvo.stdout }} replicas.
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/wait-etcd.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/wait-etcd.yml
new file mode 100644
index 00000000..e92737a8
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/wait-etcd.yml
@@ -0,0 +1,52 @@
+---
+# Poll until at least 3 etcd pods have every container reporting ready.
+# A pod with no containerStatuses does not count. If the oc/python pipeline
+# fails, the count falls back to 0 and the poll simply reports pending.
+# Attempts run every 20 s; retries = sno_etcd_timeout_minutes * 60 / 20.
+- name: "[wait-etcd] Poll for etcd scaling to 3 members"
+ shell: |
+ FULLY_READY=$(oc get pods -n openshift-etcd -l app=etcd -o json 2>/dev/null | python3 -c "
+ import json, sys
+ data = json.load(sys.stdin)
+ count = 0
+ for pod in data.get('items', []):
+ containers = pod.get('status', {}).get('containerStatuses', [])
+ if containers and all(c.get('ready', False) for c in containers):
+ count += 1
+ print(count)
+ " 2>/dev/null || echo "0")
+ echo "ETCD_FULLY_READY=$FULLY_READY"
+
+ if [ "$FULLY_READY" -ge 3 ]; then
+ echo "RESULT=done"
+ else
+ echo "RESULT=pending"
+ fi
+ environment: "{{ sno_oc_env }}"
+ register: etcd_poll
+ until: "'RESULT=done' in etcd_poll.stdout"
+ retries: "{{ (sno_etcd_timeout_minutes | int * 60 / 20) | int }}"
+ delay: 20
+ changed_when: false
+
+# Diagnostics only (failed_when: false): run etcdctl in the "etcdctl" container
+# of the first pod returned for app=etcd and print the member table. A fallback
+# message is printed when the exec fails.
+- name: "[etcd] Display etcd member list"
+ shell: |
+ ETCD_POD=$(oc get pods -n openshift-etcd -l app=etcd -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
+ oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl member list -w table 2>/dev/null || echo "Could not query etcd members"
+ environment: "{{ sno_oc_env }}"
+ register: etcd_members
+ changed_when: false
+ failed_when: false
+
+- name: "[etcd] Members"
+ debug:
+ msg: "{{ etcd_members.stdout }}"
+
+# Same approach as above, for cluster-wide endpoint health.
+- name: "[etcd] Display etcd endpoint health"
+ shell: |
+ ETCD_POD=$(oc get pods -n openshift-etcd -l app=etcd -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
+ oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint health --cluster -w table 2>/dev/null || echo "Could not query endpoint health"
+ environment: "{{ sno_oc_env }}"
+ register: etcd_health
+ changed_when: false
+ failed_when: false
+
+- name: "[etcd] Health"
+ debug:
+ msg: "{{ etcd_health.stdout }}"
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/wait-nodes.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/wait-nodes.yml
new file mode 100644
index 00000000..f02335f8
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/wait-nodes.yml
@@ -0,0 +1,48 @@
+---
+# Each attempt approves every CSR that has no status conditions yet (i.e. is
+# still pending), then counts nodes whose STATUS column starts with "Ready".
+# The poll finishes once 3 or more nodes are Ready. Attempts run every 20 s;
+# retries = sno_node_join_timeout_minutes * 60 / 20.
+# NOTE(review): all pending CSRs are approved without inspecting the requester;
+# acceptable for a lab hypervisor, not for a shared cluster.
+- name: "[wait-nodes] Poll for CSR approval and node join"
+  shell: |
+    PENDING=$(oc get csr -o json 2>/dev/null | python3 -c "
+    import json, sys
+    data = json.load(sys.stdin)
+    for csr in data.get('items', []):
+        if not csr.get('status', {}).get('conditions'):
+            print(csr['metadata']['name'])
+    " 2>/dev/null || true)
+
+    for CSR in $PENDING; do
+      oc adm certificate approve "$CSR" 2>/dev/null || true
+      echo "Approved CSR: $CSR"
+    done
+
+    # awk always prints exactly one integer, even with no input or no match.
+    # The previous grep -c ' Ready' || echo "0" printed "0" twice when nothing
+    # matched (grep -c prints 0 AND exits 1), which broke the -ge test below.
+    READY_COUNT=$(oc get nodes --no-headers 2>/dev/null \
+      | awk '$2 ~ /^Ready/ { n++ } END { print n + 0 }')
+    echo "READY_NODES=$READY_COUNT"
+
+    if [ "$READY_COUNT" -ge 3 ]; then
+      echo "RESULT=done"
+    else
+      echo "RESULT=pending"
+    fi
+  environment: "{{ sno_oc_env }}"
+  register: node_poll
+  until: "'RESULT=done' in node_poll.stdout"
+  retries: "{{ (sno_node_join_timeout_minutes | int * 60 / 20) | int }}"
+  delay: 20
+  changed_when: false
+
+# Print the wide node listing once the poll above has finished.
+- name: "[wait-nodes] Display final node list"
+ shell: |
+ oc get nodes -o wide
+ environment: "{{ sno_oc_env }}"
+ register: node_list
+ changed_when: false
+
+- name: "[wait-nodes] Nodes joined"
+ debug:
+ msg: "{{ node_list.stdout }}"
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/templates/auto-install.ign.j2 b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/templates/auto-install.ign.j2
new file mode 100644
index 00000000..5bdd58e0
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/templates/auto-install.ign.j2
@@ -0,0 +1,25 @@
+{
+ "ignition": {
+ "version": "3.2.0"
+ },
+ "storage": {
+ "files": [
+ {
+ "path": "/etc/master.ign",
+ "mode": 420,
+ "contents": {
+ "source": "data:text/plain;charset=utf-8;base64,{{ sno_master_ign_b64 }}"
+ }
+ }
+ ]
+ },
+ "systemd": {
+ "units": [
+ {
+ "name": "auto-install.service",
+ "enabled": true,
+ "contents": "[Unit]\nDescription=Auto-install RHCOS to disk\nAfter=network-online.target\nWants=network-online.target\n\n[Service]\nType=oneshot\nRemainAfterExit=yes\nExecStart=/usr/bin/coreos-installer install /dev/vda --ignition-file /etc/master.ign --insecure\nExecStartPost=/usr/bin/systemctl reboot\n\n[Install]\nWantedBy=multi-user.target\n"
+ }
+ ]
+ }
+}
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/templates/master.ign.j2 b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/templates/master.ign.j2
new file mode 100644
index 00000000..d2c5b010
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/templates/master.ign.j2
@@ -0,0 +1,21 @@
+{
+ "ignition": {
+ "version": "3.2.0",
+ "security": {
+ "tls": {
+ "certificateAuthorities": [
+ {
+ "source": "data:text/plain;charset=utf-8;base64,{{ sno_mcs_ca_b64 }}"
+ }
+ ]
+ }
+ },
+ "config": {
+ "merge": [
+ {
+ "source": "https://api-int.{{ sno_cluster_domain }}:22623/config/master"
+ }
+ ]
+ }
+ }
+}
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/vars/main.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/vars/main.yml
new file mode 100644
index 00000000..87e52bb4
--- /dev/null
+++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/vars/main.yml
@@ -0,0 +1,22 @@
+---
+# Derived role variables. dev_scripts_path may be supplied by the caller; the
+# kubeconfig defaults to <dev-scripts>/ocp/<cluster name>/auth/kubeconfig unless
+# sno_kubeconfig is set to a truthy value.
+sno_dev_scripts_path: "{{ dev_scripts_path | default('openshift-metal3/dev-scripts') }}"
+sno_kubeconfig_resolved: >-
+ {{ sno_kubeconfig if sno_kubeconfig
+ else sno_dev_scripts_path ~ '/ocp/' ~ sno_cluster_name ~ '/auth/kubeconfig' }}
+sno_addnhosts_path: "/var/lib/libvirt/dnsmasq/{{ sno_libvirt_bridge }}.addnhosts"
+sno_lease_file: "/var/lib/libvirt/dnsmasq/{{ sno_libvirt_bridge }}.status"
+sno_nm_dnsmasq_conf: "/etc/NetworkManager/dnsmasq.d/openshift-{{ sno_cluster_name }}.conf"
+
+# Environment handed to every oc-invoking task: the resolved kubeconfig plus a
+# PATH that puts the standard bin/sbin directories ahead of ansible_env.PATH
+# (or a fixed fallback when ansible_env.PATH is not available).
+sno_oc_env:
+ KUBECONFIG: "{{ sno_kubeconfig_resolved }}"
+ PATH: "/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:{{ ansible_env.PATH | default('/usr/local/bin:/usr/bin') }}"
+
+# The two nodes added by the transition; "name" is built from the cluster name
+# and the IPs come from sno_master1_ip / sno_master2_ip.
+sno_new_nodes:
+ - index: 1
+ name: "{{ sno_cluster_name }}_master_1"
+ hostname: "master-1"
+ ip: "{{ sno_master1_ip }}"
+ - index: 2
+ name: "{{ sno_cluster_name }}_master_2"
+ hostname: "master-2"
+ ip: "{{ sno_master2_ip }}"
diff --git a/deploy/openshift-clusters/scripts/clean-mutable-topology.sh b/deploy/openshift-clusters/scripts/clean-mutable-topology.sh
new file mode 100755
index 00000000..adedd358
--- /dev/null
+++ b/deploy/openshift-clusters/scripts/clean-mutable-topology.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_DIR=$(dirname "$0")
+DEPLOY_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+
+# shellcheck source=/dev/null
+source "${DEPLOY_DIR}/aws-hypervisor/scripts/common.sh"
+
+if [[ ! -f "$(get_node_dir)/aws-instance-id" ]]; then
+ echo "Error: No instance found. Run 'make deploy' first."
+ exit 1
+fi
+
+if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/inventory.ini" ]]; then
+ echo "Error: inventory.ini not found. Run 'make inventory' first."
+ exit 1
+fi
+
+cd "${DEPLOY_DIR}/openshift-clusters"
+ansible-playbook clean-mutable-topology.yml -e "interactive_mode=false" -i inventory.ini "$@"
diff --git a/deploy/openshift-clusters/scripts/deploy-cluster.sh b/deploy/openshift-clusters/scripts/deploy-cluster.sh
index 746a425c..6fc1761a 100755
--- a/deploy/openshift-clusters/scripts/deploy-cluster.sh
+++ b/deploy/openshift-clusters/scripts/deploy-cluster.sh
@@ -49,7 +49,7 @@ done
# Validate required arguments
if [[ -z "${TOPOLOGY}" ]]; then
- echo "Error: --topology is required (arbiter or fencing)"
+ echo "Error: --topology is required (arbiter, fencing, or sno)"
exit 1
fi
@@ -59,8 +59,8 @@ if [[ -z "${METHOD}" ]]; then
fi
# Validate topology value
-if [[ "${TOPOLOGY}" != "arbiter" && "${TOPOLOGY}" != "fencing" ]]; then
- echo "Error: Invalid topology '${TOPOLOGY}'. Must be 'arbiter' or 'fencing'."
+if [[ "${TOPOLOGY}" != "arbiter" && "${TOPOLOGY}" != "fencing" && "${TOPOLOGY}" != "sno" ]]; then
+ echo "Error: Invalid topology '${TOPOLOGY}'. Must be 'arbiter', 'fencing', or 'sno'."
exit 1
fi
diff --git a/deploy/openshift-clusters/scripts/sno-to-3node.sh b/deploy/openshift-clusters/scripts/sno-to-3node.sh
new file mode 100755
index 00000000..fab58d19
--- /dev/null
+++ b/deploy/openshift-clusters/scripts/sno-to-3node.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_DIR=$(dirname "$0")
+DEPLOY_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+
+# shellcheck source=/dev/null
+source "${DEPLOY_DIR}/aws-hypervisor/scripts/common.sh"
+
+if [[ ! -f "$(get_node_dir)/aws-instance-id" ]]; then
+ echo "Error: No instance found. Run 'make deploy' first."
+ exit 1
+fi
+
+if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/inventory.ini" ]]; then
+ echo "Error: inventory.ini not found. Run 'make inventory' first."
+ exit 1
+fi
+
+cd "${DEPLOY_DIR}/openshift-clusters"
+ansible-playbook sno-to-3node.yml -e "interactive_mode=false" -i inventory.ini "$@"
diff --git a/deploy/openshift-clusters/sno-to-3node.yml b/deploy/openshift-clusters/sno-to-3node.yml
new file mode 100644
index 00000000..1b549ce7
--- /dev/null
+++ b/deploy/openshift-clusters/sno-to-3node.yml
@@ -0,0 +1,23 @@
+---
+# Playbook entry point for the SNO to 3-node transition: optional confirmation
+# prompt, then the mutable-topology/sno-to-3node role, then a completion message.
+# Fact gathering is disabled for this play.
+- hosts: metal_machine
+ gather_facts: no
+ force_handlers: yes
+
+ pre_tasks:
+ # Shown only when interactive_mode is true (the default). The wrapper script
+ # sno-to-3node.sh passes interactive_mode=false, which skips the prompt.
+ - name: Confirm SNO to 3-node transition
+ ansible.builtin.pause:
+ prompt: >-
+ This will transition the SNO cluster to a 3-node HA cluster.
+ 2 new VMs will be created. CVO will be scaled to 0.
+ Press Enter to proceed or Ctrl+C to abort.
+ delegate_to: localhost
+ run_once: true
+ when: interactive_mode | default(true) | bool
+
+ roles:
+ - mutable-topology/sno-to-3node
+
+ tasks:
+ - name: Transition complete
+ ansible.builtin.debug:
+ msg: "SNO to 3-node HA transition completed successfully."