From ba89bedf6a464946348f8d44df19c7308e442bdf Mon Sep 17 00:00:00 2001
From: E Hila
Date: Thu, 25 Jun 2026 07:01:02 +0000
Subject: [PATCH 1/3] Add SNO to 3-node HA migration Ansible role

Add a new Ansible role and make target to perform a one-shot migration
of a platform:None SNO cluster to a 3-node HA cluster. The role creates
two new control-plane VMs, boots them with RHCOS via coreos-installer
ISO ignition embed, waits for nodes to join and etcd to scale, then
transitions the topology to HighlyAvailable.

Usage: make sno-to-3node

Co-Authored-By: Claude
---
 deploy/Makefile | 4 +
 .../sno-expand/sno-to-3node/defaults/main.yml | 38 +++++
 .../sno-to-3node/tasks/boot-nodes.yml | 94 ++++++++++
 .../sno-to-3node/tasks/create-vms.yml | 160 ++++++++++++++++++
 .../sno-to-3node/tasks/ignition.yml | 32 ++++
 .../sno-expand/sno-to-3node/tasks/main.yml | 30 ++++
 .../sno-to-3node/tasks/mco-rollout.yml | 77 +++++++++
 .../sno-to-3node/tasks/preflight.yml | 146 ++++++++++++++++
 .../sno-to-3node/tasks/topology.yml | 38 +++++
 .../sno-to-3node/tasks/update-dns.yml | 75 ++++++++
 .../sno-expand/sno-to-3node/tasks/verify.yml | 111 ++++++++++++
 .../sno-to-3node/tasks/wait-etcd.yml | 52 ++++++
 .../sno-to-3node/tasks/wait-nodes.yml | 48 ++++++
 .../templates/auto-install.ign.j2 | 25 +++
 .../sno-to-3node/templates/master.ign.j2 | 21 +++
 .../sno-expand/sno-to-3node/vars/main.yml | 22 +++
 .../scripts/sno-to-3node.sh | 21 +++
 deploy/openshift-clusters/sno-to-3node.yml | 23 +++
 18 files changed, 1017 insertions(+)
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/defaults/main.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/boot-nodes.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/create-vms.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/ignition.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/main.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/mco-rollout.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/preflight.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/topology.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/update-dns.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/verify.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-etcd.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-nodes.yml
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/auto-install.ign.j2
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/master.ign.j2
 create mode 100644 deploy/openshift-clusters/roles/sno-expand/sno-to-3node/vars/main.yml
 create mode 100755 deploy/openshift-clusters/scripts/sno-to-3node.sh
 create mode 100644 deploy/openshift-clusters/sno-to-3node.yml

diff --git a/deploy/Makefile b/deploy/Makefile
index 18959e9..4429523 100644
--- a/deploy/Makefile
+++ b/deploy/Makefile
@@ -83,6 +83,9 @@ fencing-assisted:
 	@$(MAKE) fencing-ipi
 	@./openshift-clusters/scripts/deploy-fencing-assisted.sh
 
+sno-to-3node:
+	@./openshift-clusters/scripts/sno-to-3node.sh
+
 patch-nodes:
 	@./openshift-clusters/scripts/patch-nodes.sh
 get-tnf-logs:
@@ -115,6 +118,7 @@ help:
 	@echo " arbiter-kcli - Deploy arbiter cluster using kcli (non-interactive)"
 	@echo " fencing-kcli - Deploy fencing cluster using kcli (non-interactive)"
 	@echo " fencing-assisted - Deploy hub + spoke TNF cluster via assisted installer"
+	@echo " sno-to-3node - Transition existing SNO cluster to 3-node HA (platform:none)"
 	@echo ""
 	@echo "OpenShift Cluster Management:"
 	@echo " redeploy-cluster - Redeploy OpenShift cluster using dev-scripts make redeploy"
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/defaults/main.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/defaults/main.yml
new file mode 100644
index 0000000..202f3a7
--- /dev/null
+++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/defaults/main.yml
@@ -0,0 +1,38 @@
+---
+# Cluster identity (auto-detected from cluster if not set)
+sno_cluster_name: ostest
+sno_cluster_domain: ""
+sno_infra_id: ""
+
+# Existing master-0 (auto-detected from cluster)
+sno_master0_ip: ""
+
+# New node IPs (static assignments within the dev-scripts DHCP range)
+sno_master1_ip: "192.168.111.21"
+sno_master2_ip: "192.168.111.22"
+
+# VM specs
+sno_vm_vcpus: 6
+sno_vm_ram_mb: 16384
+sno_vm_disk_gb: 50
+
+# Libvirt network (dev-scripts baremetal network)
+sno_libvirt_network: ostestbm
+sno_libvirt_bridge: ostestbm
+
+# RHCOS live ISO path on hypervisor (if empty, boot-nodes.yml searches the libvirt image dir and the dev-scripts tree)
+sno_rhcos_live_iso: ""
+
+# Timeouts
+sno_mco_timeout_minutes: 45
+sno_node_join_timeout_minutes: 20
+sno_etcd_timeout_minutes: 15
+
+# Auto-fix MCO drain deadlock during topology transition
+sno_auto_fix_drain: true
+
+# Paths (override if dev-scripts is in a non-standard location)
+sno_kubeconfig: ""
+
+# VM image directory
+sno_vm_image_dir: "/var/lib/libvirt/images"
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/boot-nodes.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/boot-nodes.yml
new file mode 100644
index 0000000..4ad9a2d
--- /dev/null
+++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/boot-nodes.yml
@@ -0,0 +1,94 @@
+---  # Phase 3: embed auto-install.ign into a per-node copy of the RHCOS live ISO and boot each new VM from it
+- name: "[boot] Check if RHCOS live ISO exists on hypervisor"
+  stat:
+    path: "/var/lib/libvirt/images/rhcos-live.iso"
+  register: iso_stat
+
+- name: "[boot] Find RHCOS live ISO from dev-scripts cache"
+  shell: |  # symlinks the first ISO found to rhcos-live.iso; exits 1 when none is found
+    DEVSCRIPTS_ISO=$(find /var/lib/libvirt/images -name 'rhcos-*-live*.iso' 2>/dev/null | head -1)
+    if [ -n "$DEVSCRIPTS_ISO" ]; then
+      echo "Using existing ISO: $DEVSCRIPTS_ISO"
+      sudo ln -sf "$DEVSCRIPTS_ISO" /var/lib/libvirt/images/rhcos-live.iso
+      exit 0
+    fi
+
+    CACHE_ISO=$(find {{ sno_dev_scripts_path }}/ -name 'rhcos-*-live*.iso' 2>/dev/null | head -1)
+    if [ -n "$CACHE_ISO" ]; then
+      echo "Using dev-scripts cached ISO: $CACHE_ISO"
+      sudo ln -sf "$CACHE_ISO" /var/lib/libvirt/images/rhcos-live.iso
+      exit 0
+    fi
+
+    echo "ERROR: No RHCOS live ISO found. Set sno_rhcos_live_iso variable."
+    exit 1
+  when: not iso_stat.stat.exists and sno_rhcos_live_iso == ""
+
+- name: "[boot] Set ISO path"
+  set_fact:
+    sno_iso_path: "{{ sno_rhcos_live_iso if sno_rhcos_live_iso else '/var/lib/libvirt/images/rhcos-live.iso' }}"
+
+- name: "[boot] Read master.ign content"
+  slurp:
+    src: /tmp/master.ign
+  register: master_ign_content
+
+- name: "[boot] Set base64-encoded master.ign"
+  set_fact:
+    sno_master_ign_b64: "{{ master_ign_content.content }}"  # slurp already returns base64
+
+- name: "[boot] Generate auto-install ignition"
+  template:
+    src: auto-install.ign.j2
+    dest: /tmp/auto-install.ign
+    mode: '0644'
+
+- name: "[boot] Create per-node ISO with embedded ignition"
+  shell: |
+    sudo cp {{ sno_iso_path }} /var/lib/libvirt/images/rhcos-{{ item.hostname }}.iso
+    sudo coreos-installer iso ignition embed -i /tmp/auto-install.ign /var/lib/libvirt/images/rhcos-{{ item.hostname }}.iso -f
+  loop: "{{ sno_new_nodes }}"
+
+- name: "[boot] Boot each VM with ignition-embedded ISO"
+  shell: |  # drops the domain defined in create-vms.yml and re-creates it with the ISO attached
+    VM_NAME="{{ item.name }}"
+    MAC="{{ sno_node_macs[item.hostname] }}"
+
+    sudo virsh destroy "$VM_NAME" 2>/dev/null || true
+    sudo virsh undefine "$VM_NAME" 2>/dev/null || true
+
+    sudo virt-install \
+      --name "$VM_NAME" \
+      --ram {{ sno_vm_ram_mb }} \
+      --vcpus {{ sno_vm_vcpus }} \
+      --disk {{ sno_vm_image_dir }}/${VM_NAME}.qcow2,bus=virtio \
+      --network network={{ sno_libvirt_network }},model=virtio,mac=${MAC} \
+      --cdrom /var/lib/libvirt/images/rhcos-{{ item.hostname }}.iso \
+      --os-variant rhel9.0 \
+      --graphics none \
+      --noautoconsole \
+      --boot hd,cdrom
+  loop: "{{ sno_new_nodes }}"
+
+- name: "[boot] Verify VMs are running"
+  shell: |
+    sudo virsh domstate {{ item.name }}
+  register: vm_state
+  loop: "{{ sno_new_nodes }}"
+  changed_when: false
+  failed_when: "'running' not in vm_state.stdout"
+
+- name: "[boot] Set on_reboot to restart (not destroy)"
+  shell: |
+    sudo virsh dumpxml {{ item.name }} | grep -q 'destroy' && \
+      sudo virt-xml {{ item.name }} --edit --events on_reboot=restart 2>/dev/null || true
+  loop: "{{ sno_new_nodes }}"
+  failed_when: false
+  changed_when: true
+
+- name: "[boot] VMs booted with RHCOS"
+  debug:
+    msg: >-
+      {{ sno_new_nodes | length }} VMs booted with RHCOS ISO (ignition embedded).
+      coreos-installer will write to disk and reboot automatically.
+      Waiting for nodes to join the cluster...
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/create-vms.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/create-vms.yml
new file mode 100644
index 0000000..d5ee1c9
--- /dev/null
+++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/create-vms.yml
@@ -0,0 +1,160 @@
+---  # Phase 2: clear stale VM/DHCP/DNS state, define the new VMs, then add their DHCP reservations and DNS names
+- name: "[create-vms] Cleanup stale VMs"
+  shell: |
+    sudo virsh destroy {{ item.name }} 2>/dev/null || true
+    sudo virsh undefine {{ item.name }} --remove-all-storage 2>/dev/null || true
+    sudo rm -f {{ sno_vm_image_dir }}/{{ item.name }}.qcow2
+  loop: "{{ sno_new_nodes }}"
+  failed_when: false
+  changed_when: true
+
+- name: "[create-vms] Remove stale DHCP entries from libvirt network"
+  shell: |  # net-update needs an XML <host> element; NOTE(review): payloads rebuilt from libvirt's documented format - confirm
+    NET_XML=$(sudo virsh net-dumpxml {{ sno_libvirt_network }})
+    EXISTING_MAC=$(echo "$NET_XML" | python3 -c "
+    import xml.etree.ElementTree as ET, sys
+    root = ET.parse(sys.stdin).getroot()
+    for host in root.findall('.//dhcp/host'):
+        if host.get('ip') == '{{ item.ip }}':
+            print(host.get('mac',''))
+    " 2>/dev/null || true)
+
+    if [ -n "$EXISTING_MAC" ]; then
+      sudo virsh net-update {{ sno_libvirt_network }} delete ip-dhcp-host \
+        "<host mac='$EXISTING_MAC' ip='{{ item.ip }}'/>" --live --config 2>/dev/null || true
+    fi
+
+    sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \
+      "<host ip='{{ item.ip }}'><hostname>{{ item.hostname }}.{{ sno_cluster_domain }}</hostname></host>" \
+      --live --config 2>/dev/null || true
+    sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \
+      "<host ip='{{ item.ip }}'><hostname>{{ item.hostname }}</hostname></host>" \
+      --live --config 2>/dev/null || true
+  loop: "{{ sno_new_nodes }}"
+  failed_when: false
+  changed_when: true
+
+- name: "[create-vms] Clean stale entries from addnhosts"
+  lineinfile:
+    path: "{{ sno_addnhosts_path }}"
+    regexp: "^{{ item.ip }}\\s"
+    state: absent
+  loop: "{{ sno_new_nodes }}"
+  become: true
+  failed_when: false
+
+- name: "[create-vms] Also clean stale hostname entries from addnhosts"
+  lineinfile:
+    path: "{{ sno_addnhosts_path }}"
+    regexp: "\\s{{ item.hostname }}\\.{{ sno_cluster_domain }}(\\s|$)"
+    state: absent
+  loop: "{{ sno_new_nodes }}"
+  become: true
+  failed_when: false
+
+- name: "[create-vms] Flush stale DHCP leases"
+  shell: |  # the script sits inside a double-quoted shell string, so it must not contain double quotes itself
+    python3 -c "
+    import json, os
+    lf = '{{ sno_lease_file }}'
+    if not os.path.exists(lf):
+        print('Lease file does not exist, skipping')
+        exit(0)
+    if os.path.getsize(lf) == 0:
+        print('Lease file is empty, skipping')
+        exit(0)
+    with open(lf) as f:
+        leases = json.load(f)
+    reserved = ['{{ sno_master1_ip }}', '{{ sno_master2_ip }}']
+    before = len(leases)
+    leases = [l for l in leases if l.get('ip-address') not in reserved]
+    after = len(leases)
+    with open(lf, 'w') as f:
+        json.dump(leases, f, indent=2)
+    print(f'Flushed {before - after} stale leases')
+    "
+  become: true
+  changed_when: true
+  failed_when: false
+
+- name: "[create-vms] Generate random MAC addresses"
+  shell: |
+    printf '52:54:00:%02x:%02x:%02x\n' $((RANDOM%256)) $((RANDOM%256)) $((RANDOM%256))
+  register: mac_gen
+  loop: "{{ sno_new_nodes }}"
+  changed_when: false
+
+- name: "[create-vms] Store MAC addresses"
+  set_fact:
+    sno_node_macs: "{{ sno_node_macs | default({}) | combine({item.item.hostname: item.stdout | trim}) }}"
+  loop: "{{ mac_gen.results }}"
+
+- name: "[create-vms] Display VM details"
+  debug:
+    msg: "{{ item.hostname }}: IP={{ item.ip }}, MAC={{ sno_node_macs[item.hostname] }}"
+  loop: "{{ sno_new_nodes }}"
+
+- name: "[create-vms] Create VM disk images"
+  shell: |
+    sudo qemu-img create -f qcow2 {{ sno_vm_image_dir }}/{{ item.name }}.qcow2 {{ sno_vm_disk_gb }}G
+  args:
+    creates: "{{ sno_vm_image_dir }}/{{ item.name }}.qcow2"
+  loop: "{{ sno_new_nodes }}"
+
+- name: "[create-vms] Define VMs"
+  shell: |
+    sudo virt-install \
+      --name {{ item.name }} \
+      --ram {{ sno_vm_ram_mb }} \
+      --vcpus {{ sno_vm_vcpus }} \
+      --disk {{ sno_vm_image_dir }}/{{ item.name }}.qcow2,bus=virtio \
+      --network network={{ sno_libvirt_network }},model=virtio,mac={{ sno_node_macs[item.hostname] }} \
+      --os-variant rhel9.0 \
+      --graphics none \
+      --noautoconsole \
+      --boot hd \
+      --noreboot \
+      --import \
+      --print-xml | sudo virsh define /dev/stdin
+  loop: "{{ sno_new_nodes }}"
+
+- name: "[create-vms] Add DHCP reservations to libvirt network"
+  shell: |  # binds the generated MAC to the node's static IP; NOTE(review): the name attribute is an assumption - confirm
+    sudo virsh net-update {{ sno_libvirt_network }} add ip-dhcp-host \
+      "<host mac='{{ sno_node_macs[item.hostname] }}' name='{{ item.hostname }}' ip='{{ item.ip }}'/>" \
+      --live --config
+  loop: "{{ sno_new_nodes }}"
+
+- name: "[create-vms] Add DNS entries to libvirt network"
+  shell: |  # best-effort (failed_when: false); the addnhosts entry added below carries the same name
+    sudo virsh net-update {{ sno_libvirt_network }} add dns-host \
+      "<host ip='{{ item.ip }}'><hostname>{{ item.hostname }}.{{ sno_cluster_domain }}</hostname></host>" \
+      --live --config
+  loop: "{{ sno_new_nodes }}"
+  failed_when: false
+
+- name: "[create-vms] Add entries to addnhosts"
+  lineinfile:
+    path: "{{ sno_addnhosts_path }}"
+    line: "{{ item.ip }} {{ item.hostname }}.{{ sno_cluster_domain }} {{ item.hostname }}"
+    regexp: "^{{ item.ip }}\\s"
+    state: present
+  loop: "{{ sno_new_nodes }}"
+  become: true
+
+- name: "[create-vms] Reload libvirt dnsmasq"
+  shell: |
+    sudo kill -HUP $(cat /run/libvirt/network/{{ sno_libvirt_bridge }}.pid) 2>/dev/null || true
+  changed_when: true
+
+- name: "[create-vms] Verify DNS for new nodes"
+  shell: |
+    dig +short {{ item.hostname }}.{{ sno_cluster_domain }} @192.168.111.1
+  register: dns_verify
+  changed_when: false
+  failed_when: dns_verify.stdout | trim == ""
+  loop: "{{ sno_new_nodes }}"
+
+- name: "[create-vms] VMs created and DNS configured"
+  debug:
+    msg: "{{ sno_new_nodes | length }} VMs created with DNS entries."
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/ignition.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/ignition.yml
new file mode 100644
index 0000000..5ed5206
--- /dev/null
+++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/ignition.yml
@@ -0,0 +1,32 @@
+---  # Phase 1: fetch the MCS CA bundle, render /tmp/master.ign, and probe the MCS healthz endpoint
+- name: "[ignition] Extract MCS CA certificate"
+  shell: |
+    oc get configmap -n openshift-machine-config-operator machine-config-server-ca -o jsonpath='{.data.ca-bundle\.crt}'
+  environment: "{{ sno_oc_env }}"
+  register: mcs_ca_raw
+  changed_when: false
+
+- name: "[ignition] Base64-encode MCS CA"
+  set_fact:
+    sno_mcs_ca_b64: "{{ mcs_ca_raw.stdout | b64encode }}"
+
+- name: "[ignition] Generate master.ign from template"
+  template:
+    src: master.ign.j2
+    dest: /tmp/master.ign
+    mode: '0644'
+
+- name: "[ignition] Verify MCS reachability"
+  shell: |
+    curl -sk https://api-int.{{ sno_cluster_domain }}:22623/healthz
+  register: mcs_health
+  changed_when: false
+  failed_when: false  # informational probe only; an unreachable MCS does not stop the play here
+
+- name: "[ignition] Display MCS status"
+  debug:
+    msg: "MCS healthz: {{ mcs_health.stdout | default('unreachable') }}"
+
+- name: "[ignition] Ignition config generated"
+  debug:
+    msg: "master.ign written to /tmp/master.ign for domain {{ sno_cluster_domain }}"
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/main.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/main.yml
new file mode 100644
index 0000000..2fb7740
--- /dev/null
+++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/main.yml
@@ -0,0 +1,30 @@
+---  # Role entry point: runs the migration phases in order, each a statically imported task file
+- name: "Phase 0: Preflight checks"
+  import_tasks: preflight.yml
+
+- name: "Phase 1: Generate ignition config"
+  import_tasks: ignition.yml
+
+- name: "Phase 2: Create new VMs"
+  import_tasks: create-vms.yml
+
+- name: "Phase 3: Boot nodes with RHCOS"
+  import_tasks: boot-nodes.yml
+
+- name: "Phase 4: Wait for nodes to join"
+  import_tasks: wait-nodes.yml
+
+- name: "Phase 5: Wait for etcd scaling"
+  import_tasks: wait-etcd.yml
+
+- name: "Phase 6: Topology transition"
+  import_tasks: topology.yml
+
+- name: "Phase 7: MCO rollout"
+  import_tasks: mco-rollout.yml
+
+- name: "Phase 8: Update DNS for all nodes"
+  import_tasks: update-dns.yml
+
+- name: "Phase 9: Final verification"
+  import_tasks: verify.yml
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/mco-rollout.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/mco-rollout.yml
new file mode 100644
index 0000000..4701edd
--- /dev/null
+++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/mco-rollout.yml
@@ -0,0 +1,77 @@
+---  # Phase 7: poll the master MachineConfigPool every 30s until Updated=True, Updating=False, Degraded=False
+- name: "[mco] Display MCO rollout monitoring start"
+  debug:
+    msg: "Beginning MCO rollout monitoring (timeout: {{ sno_mco_timeout_minutes }} min)"
+
+- name: "[mco] Poll MCP master and auto-fix drain deadlock"
+  shell: |  # prints RESULT=done or RESULT=pending; only the first node listed by oc is checked for a cordon
+    MCP_JSON=$(oc get mcp master -o json 2>/dev/null) || {
+      echo "STATUS=api_unreachable"
+      echo "RESULT=pending"
+      exit 0
+    }
+
+    UPDATED=$(echo "$MCP_JSON" | python3 -c "
+    import json,sys
+    data=json.load(sys.stdin)
+    conds={c['type']:c['status'] for c in data.get('status',{}).get('conditions',[])}
+    print(conds.get('Updated','Unknown'))
+    ")
+    UPDATING=$(echo "$MCP_JSON" | python3 -c "
+    import json,sys
+    data=json.load(sys.stdin)
+    conds={c['type']:c['status'] for c in data.get('status',{}).get('conditions',[])}
+    print(conds.get('Updating','Unknown'))
+    ")
+    DEGRADED=$(echo "$MCP_JSON" | python3 -c "
+    import json,sys
+    data=json.load(sys.stdin)
+    conds={c['type']:c['status'] for c in data.get('status',{}).get('conditions',[])}
+    print(conds.get('Degraded','Unknown'))
+    ")
+
+    NODE_SCHED=$(oc get nodes --no-headers -o custom-columns=SCHED:.spec.unschedulable 2>/dev/null | head -1)
+
+    echo "STATUS=updated:${UPDATED},updating:${UPDATING},degraded:${DEGRADED},cordoned:${NODE_SCHED}"
+
+    if [ "$UPDATED" = "True" ] && [ "$UPDATING" = "False" ] && [ "$DEGRADED" = "False" ]; then
+      echo "RESULT=done"
+      exit 0
+    fi
+
+    # Auto-fix drain deadlock if node is cordoned and auto-fix is enabled
+    if [ "$NODE_SCHED" = "true" ] && [ "{{ sno_auto_fix_drain | default(true) | bool }}" = "True" ]; then
+      NODE_NAME=$(oc get nodes --no-headers -o custom-columns=NAME:.metadata.name | head -1)
+      echo "DRAIN_FIX: Node $NODE_NAME is cordoned, applying drain deadlock fix..."
+
+      # Uncordon
+      oc adm uncordon "$NODE_NAME" 2>&1 || true
+
+      # Set lastAppliedDrain annotation
+      DESIRED=$(oc get node "$NODE_NAME" -o jsonpath='{.metadata.annotations.machineconfiguration\.openshift\.io/desiredDrain}' 2>/dev/null || echo "")
+      if [ -n "$DESIRED" ]; then
+        oc annotate node "$NODE_NAME" "machineconfiguration.openshift.io/lastAppliedDrain=$DESIRED" --overwrite 2>&1 || true
+        echo "DRAIN_FIX: Set lastAppliedDrain=$DESIRED"
+      fi
+
+      # Restart MCC pod
+      oc delete pod -n openshift-machine-config-operator -l k8s-app=machine-config-controller 2>&1 || true
+      echo "DRAIN_FIX: Restarted MCC pod"
+    fi
+
+    echo "RESULT=pending"
+  environment: "{{ sno_oc_env }}"
+  register: mcp_poll
+  until: "'RESULT=done' in mcp_poll.stdout"
+  retries: "{{ (sno_mco_timeout_minutes | int * 60 / 30) | int }}"
+  delay: 30
+  changed_when: false
+  failed_when: false  # a timeout is reported by the next task but never fails the play
+
+- name: "[mco] MCO rollout result"
+  debug:
+    msg: >-
+      {{ 'MCO rollout completed successfully.'
+         if 'RESULT=done' in mcp_poll.stdout
+         else 'MCO rollout did not complete within timeout. Continuing anyway.' }}
+
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/preflight.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/preflight.yml
new file mode 100644
index 0000000..7af662e
--- /dev/null
+++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/preflight.yml
@@ -0,0 +1,146 @@
+---  # Phase 0: read cluster identity from the Infrastructure CR and check for one Ready node on platform None
+- name: "[preflight] Query Infrastructure CR"
+  shell: |
+    oc get infrastructure cluster -o json
+  environment: "{{ sno_oc_env }}"
+  register: infra_cr_raw
+  changed_when: false
+
+- name: "[preflight] Parse infrastructure data"
+  set_fact:
+    sno_infra_data: "{{ infra_cr_raw.stdout | from_json }}"
+
+- name: "[preflight] Extract cluster identity"
+  set_fact:
+    sno_platform: "{{ sno_infra_data.spec.platformSpec.type }}"
+    sno_cp_topology: "{{ sno_infra_data.status.controlPlaneTopology }}"
+    sno_infra_topology: "{{ sno_infra_data.status.infrastructureTopology }}"
+    sno_infra_id: "{{ sno_infra_id if sno_infra_id else sno_infra_data.status.infrastructureName }}"
+    sno_cluster_domain: >-
+      {{ sno_cluster_domain if sno_cluster_domain
+         else sno_infra_data.status.apiServerInternalURI
+         | regex_replace('^https://api-int\.', '')
+         | regex_replace(':6443$', '') }}
+
+- name: "[preflight] Assert platform is None"
+  assert:
+    that: sno_platform == "None"
+    fail_msg: "Expected platform None, got {{ sno_platform }}"
+
+- name: "[preflight] Assert topology is SingleReplica"
+  assert:
+    that: sno_cp_topology == "SingleReplica"
+    fail_msg: "Expected SingleReplica topology, got {{ sno_cp_topology }}. Cluster may already be HA."
+  when: not (sno_skip_topology_check | default(false) | bool)
+
+
+- name: "[preflight] Display detected cluster info"
+  debug:
+    msg: >-
+      Cluster: {{ sno_cluster_name }} | Domain: {{ sno_cluster_domain }} |
+      InfraID: {{ sno_infra_id }} | Platform: {{ sno_platform }} |
+      Topology: {{ sno_cp_topology }}
+
+- name: "[preflight] Get cluster nodes"
+  shell: |
+    oc get nodes -o json
+  environment: "{{ sno_oc_env }}"
+  register: nodes_raw
+  changed_when: false
+
+- name: "[preflight] Parse node data"
+  set_fact:
+    sno_nodes: "{{ (nodes_raw.stdout | from_json)['items'] }}"
+
+- name: "[preflight] Assert exactly 1 node exists"
+  assert:
+    that: sno_nodes | length == 1
+    fail_msg: "Expected 1 node for SNO, found {{ sno_nodes | length }}"
+
+- name: "[preflight] Auto-detect master-0 IP"
+  set_fact:
+    sno_master0_ip: >-
+      {{ sno_master0_ip if sno_master0_ip
+         else (sno_nodes[0].status.addresses
+               | selectattr('type', 'equalto', 'InternalIP')
+               | map(attribute='address') | first) }}
+
+- name: "[preflight] Display master-0 IP"
+  debug:
+    msg: "master-0 IP: {{ sno_master0_ip }}"
+
+- name: "[preflight] Check node is Ready"
+  assert:
+    that: >-
+      sno_nodes[0].status.conditions
+      | selectattr('type', 'equalto', 'Ready')
+      | map(attribute='status') | first == 'True'
+    fail_msg: "master-0 is not Ready"
+
+- name: "[preflight] Check etcd pods"
+  shell: |
+    oc get pods -n openshift-etcd -l app=etcd --no-headers \
+      -o custom-columns=NAME:.metadata.name,READY:.status.containerStatuses[*].ready 2>/dev/null || true
+  environment: "{{ sno_oc_env }}"
+  register: etcd_pods_raw
+  changed_when: false
+
+- name: "[preflight] Display etcd status"
+  debug:
+    msg: "{{ etcd_pods_raw.stdout }}"
+
+- name: "[preflight] Check for degraded cluster operators"
+  shell: |  # exits 1 when any operator reports Degraded=True; a later task turns that into a warning only
+    oc get co -o json | python3 -c "
+    import json, sys
+    data = json.load(sys.stdin)
+    degraded = []
+    for co in data['items']:
+        name = co['metadata']['name']
+        for cond in co.get('status', {}).get('conditions', []):
+            if cond['type'] == 'Degraded' and cond['status'] == 'True':
+                degraded.append(name)
+    if degraded:
+        print('Degraded COs: ' + ', '.join(degraded))
+        sys.exit(1)
+    print('All cluster operators healthy')
+    "
+  environment: "{{ sno_oc_env }}"
+  register: co_check
+  changed_when: false
+  failed_when: false
+
+- name: "[preflight] Display CO status"
+  debug:
+    msg: "{{ co_check.stdout }}"
+
+- name: "[preflight] Warn if COs degraded"
+  debug:
+    msg: "WARNING: Some COs are degraded. Proceeding anyway."
+  when: co_check.rc != 0
+
+- name: "[preflight] Verify api-int DNS on hypervisor"
+  shell: |
+    dig +short api-int.{{ sno_cluster_domain }} @192.168.111.1
+  register: dns_apiint
+  changed_when: false
+  failed_when: false
+
+- name: "[preflight] Add api-int to addnhosts if missing"
+  shell: |  # appends the record, then sends SIGHUP to the libvirt dnsmasq
+    echo "{{ sno_master0_ip }} api-int.{{ sno_cluster_domain }} api-int" | \
+      sudo tee -a {{ sno_addnhosts_path }}
+    sudo kill -HUP $(cat /run/libvirt/network/{{ sno_libvirt_bridge }}.pid) 2>/dev/null || true
+  when: dns_apiint.stdout | trim == "" or sno_master0_ip not in dns_apiint.stdout
+
+- name: "[preflight] Verify api-int DNS after fix"
+  shell: |
+    dig +short api-int.{{ sno_cluster_domain }} @192.168.111.1
+  register: dns_apiint_verify
+  changed_when: false
+  failed_when: dns_apiint_verify.stdout | trim == ""
+  when: dns_apiint.stdout | trim == "" or sno_master0_ip not in dns_apiint.stdout
+
+- name: "[preflight] Preflight checks passed"
+  debug:
+    msg: "All preflight checks passed. Ready for topology transition."
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/topology.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/topology.yml
new file mode 100644
index 0000000..4b99f1d
--- /dev/null
+++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/topology.yml
@@ -0,0 +1,38 @@
+---  # Phase 6: scale the cluster-version-operator to 0, then patch the Infrastructure status to HighlyAvailable
+- name: "[topology] Scale CVO to 0 replicas"
+  shell: |
+    oc scale deployment cluster-version-operator -n openshift-cluster-version --replicas=0
+  environment: "{{ sno_oc_env }}"
+
+- name: "[topology] Wait for CVO to scale down"
+  shell: |  # no fallback output: a failed query must not look like "0 replicas"
+    oc get deployment cluster-version-operator -n openshift-cluster-version \
+      -o jsonpath='{.status.replicas}'
+  environment: "{{ sno_oc_env }}"
+  register: cvo_replicas
+  until: cvo_replicas.rc == 0 and (cvo_replicas.stdout | trim | int == 0)  # empty output converts to 0
+  retries: 30
+  delay: 5
+  changed_when: false
+
+- name: "[topology] CVO scaled down"
+  debug:
+    msg: "CVO scaled to 0 replicas"
+
+- name: "[topology] Patch Infrastructure CR - topology to HighlyAvailable"
+  shell: |  # sets controlPlaneTopology and infrastructureTopology through the status subresource
+    oc patch infrastructure cluster --type merge --subresource status \
+      -p '{"status":{"controlPlaneTopology":"HighlyAvailable","infrastructureTopology":"HighlyAvailable"}}'
+  environment: "{{ sno_oc_env }}"
+
+- name: "[topology] Verify topology patch"
+  shell: |
+    oc get infrastructure cluster -o jsonpath='{.status.controlPlaneTopology}'
+  environment: "{{ sno_oc_env }}"
+  register: topo_verify
+  changed_when: false
+  failed_when: topo_verify.stdout != "HighlyAvailable"  # only controlPlaneTopology is read back
+
+- name: "[topology] Topology transition complete"
+  debug:
+    msg: "Topology changed to HighlyAvailable. MCO rollout will begin."
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/update-dns.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/update-dns.yml
new file mode 100644
index 0000000..bcfba13
--- /dev/null
+++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/update-dns.yml
@@ -0,0 +1,75 @@
+---  # Phase 8: rewrite the dnsmasq file at sno_nm_dnsmasq_conf so api and api-int resolve to every node InternalIP
+- name: "[dns] Get all node InternalIPs"
+  shell: |  # prints one InternalIP per line, in the order oc returns the nodes
+    oc get nodes -o json | python3 -c "
+    import json, sys
+    data = json.load(sys.stdin)
+    for node in data['items']:
+        for addr in node['status']['addresses']:
+            if addr['type'] == 'InternalIP':
+                print(addr['address'])
+    "
+  environment: "{{ sno_oc_env }}"
+  register: all_node_ips_raw
+  changed_when: false
+
+- name: "[dns] Store node IPs"
+  set_fact:
+    sno_all_node_ips: "{{ all_node_ips_raw.stdout_lines }}"
+
+- name: "[dns] Display node IPs for DNS"
+  debug:
+    msg: "Node IPs: {{ sno_all_node_ips | join(', ') }}"
+
+- name: "[dns] Build NM dnsmasq config"
+  set_fact:
+    sno_dnsmasq_lines: |  # one api and one api-int record per node IP; the apps record uses only the first IP
+      {% for ip in sno_all_node_ips %}
+      address=/api.{{ sno_cluster_domain }}/{{ ip }}
+      address=/api-int.{{ sno_cluster_domain }}/{{ ip }}
+      {% endfor %}
+      address=/.apps.{{ sno_cluster_domain }}/{{ sno_all_node_ips[0] }}
+
+- name: "[dns] Write NM dnsmasq config"
+  copy:
+    content: "{{ sno_dnsmasq_lines }}"
+    dest: "{{ sno_nm_dnsmasq_conf }}"
+    mode: '0644'
+  become: true
+
+- name: "[dns] Clean stale apps entries from /etc/hosts"
+  lineinfile:
+    path: /etc/hosts
+    regexp: "apps\\.{{ sno_cluster_domain | regex_escape() }}"
+    state: absent
+  become: true
+
+- name: "[dns] Restart NetworkManager to pick up dnsmasq changes"
+  systemd:
+    name: NetworkManager
+    state: restarted
+  become: true
+
+- name: "[dns] Wait for DNS to stabilize"
+  pause:
+    seconds: 5
+
+- name: "[dns] Verify API DNS resolution"
+  shell: |
+    dig +short api.{{ sno_cluster_domain }}
+  register: dns_api_verify
+  changed_when: false
+  failed_when: dns_api_verify.stdout | trim == ""
+
+- name: "[dns] Verify apps DNS resolution"
+  shell: |
+    dig +short test.apps.{{ sno_cluster_domain }}
+  register: dns_apps_verify
+  changed_when: false
+  failed_when: false  # best-effort: a failed apps lookup is only reported, never fatal
+
+- name: "[dns] DNS update complete"
+  debug:
+    msg: >-
+      API resolves to: {{ dns_api_verify.stdout_lines | join(', ') }}.
+      Apps resolves to: {{ dns_apps_verify.stdout_lines | default(['not yet']) | join(', ') }}.
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/verify.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/verify.yml
new file mode 100644
index 0000000..1e9d9c9
--- /dev/null
+++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/verify.yml
@@ -0,0 +1,111 @@
+---  # Phase 9: collect node, topology, etcd, CVO, operator and MCP status, then assert the three success criteria
+- name: "[verify] Get node count and status"
+  shell: |
+    oc get nodes --no-headers
+  environment: "{{ sno_oc_env }}"
+  register: verify_nodes
+  changed_when: false
+
+- name: "[verify] Nodes"
+  debug:
+    msg: "{{ verify_nodes.stdout }}"
+
+- name: "[verify] Get topology"
+  shell: |
+    oc get infrastructure cluster -o jsonpath='{.status.controlPlaneTopology}'
+  environment: "{{ sno_oc_env }}"
+  register: verify_topo
+  changed_when: false
+
+- name: "[verify] Topology"
+  debug:
+    msg: "Control plane topology: {{ verify_topo.stdout }}"
+
+- name: "[verify] Get etcd member count"
+  shell: |  # counts pods labelled app=etcd whatever their phase; it does not query etcd membership
+    oc get pods -n openshift-etcd -l app=etcd --no-headers | wc -l
+  environment: "{{ sno_oc_env }}"
+  register: verify_etcd
+  changed_when: false
+
+- name: "[verify] etcd members"
+  debug:
+    msg: "etcd pods: {{ verify_etcd.stdout | trim }}"
+
+- name: "[verify] Get CVO replica count"
+  shell: |  # prints the literal "unknown" when the deployment cannot be read
+    oc get deployment cluster-version-operator -n openshift-cluster-version \
+      -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "unknown"
+  environment: "{{ sno_oc_env }}"
+  register: verify_cvo
+  changed_when: false
+
+- name: "[verify] CVO status"
+  debug:
+    msg: "CVO replicas: {{ verify_cvo.stdout }} (expected: 0 - CVO intentionally scaled down)"
+
+- name: "[verify] Get cluster operator summary"
+  shell: |  # informational summary only; chr(44) is a comma, used as the join separator
+    oc get co -o json | python3 -c "
+    import json, sys
+    data = json.load(sys.stdin)
+    total = len(data['items'])
+    degraded = []
+    unavailable = []
+    for co in data['items']:
+        name = co['metadata']['name']
+        conds = {c['type']: c['status'] for c in co.get('status', {}).get('conditions', [])}
+        if conds.get('Degraded') == 'True':
+            degraded.append(name)
+        if conds.get('Available') == 'False':
+            unavailable.append(name)
+    print(f'Total COs: {total}')
+    if degraded:
+        print(f'Degraded: {chr(44).join(degraded)}')
+    else:
+        print('Degraded: none')
+    if unavailable:
+        print(f'Unavailable: {chr(44).join(unavailable)}')
+    else:
+        print('Unavailable: none')
+    "
+  environment: "{{ sno_oc_env }}"
+  register: verify_co
+  changed_when: false
+
+- name: "[verify] Cluster operators"
+  debug:
+    msg: "{{ verify_co.stdout }}"
+
+- name: "[verify] Get MCP master status"
+  shell: |
+    oc get mcp master --no-headers
+  environment: "{{ sno_oc_env }}"
+  register: verify_mcp
+  changed_when: false
+
+- name: "[verify] MCP status"
+  debug:
+    msg: "{{ verify_mcp.stdout }}"
+
+- name: "[verify] Assert 3 nodes"
+  assert:
+    that: (verify_nodes.stdout_lines | length) >= 3  # counts every listed node, Ready or not
+    fail_msg: "Expected 3+ nodes, found {{ verify_nodes.stdout_lines | length }}"
+
+- name: "[verify] Assert HA topology"
+  assert:
+    that: verify_topo.stdout == "HighlyAvailable"
+    fail_msg: "Expected HighlyAvailable, got {{ verify_topo.stdout }}"
+
+- name: "[verify] Assert 3 etcd members"
+  assert:
+    that: verify_etcd.stdout | trim | int >= 3
+    fail_msg: "Expected 3+ etcd pods, found {{ verify_etcd.stdout | trim }}"
+
+- name: "[verify] SNO to 3-node transition verified successfully"
+  debug:
+    msg: >-
+      MIGRATION COMPLETE: {{ verify_nodes.stdout_lines | length }} nodes,
+      {{ verify_etcd.stdout | trim }} etcd members,
+      topology={{ verify_topo.stdout }}, CVO={{ verify_cvo.stdout }} replicas.
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-etcd.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-etcd.yml new file mode 100644 index 0000000..e92737a --- /dev/null +++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-etcd.yml @@ -0,0 +1,52 @@ +--- +- name: "[wait-etcd] Poll for etcd scaling to 3 members" + shell: | + FULLY_READY=$(oc get pods -n openshift-etcd -l app=etcd -o json 2>/dev/null | python3 -c " + import json, sys + data = json.load(sys.stdin) + count = 0 + for pod in data.get('items', []): + containers = pod.get('status', {}).get('containerStatuses', []) + if containers and all(c.get('ready', False) for c in containers): + count += 1 + print(count) + " 2>/dev/null || echo "0") + echo "ETCD_FULLY_READY=$FULLY_READY" + + if [ "$FULLY_READY" -ge 3 ]; then + echo "RESULT=done" + else + echo "RESULT=pending" + fi + environment: "{{ sno_oc_env }}" + register: etcd_poll + until: "'RESULT=done' in etcd_poll.stdout" + retries: "{{ (sno_etcd_timeout_minutes | int * 60 / 20) | int }}" + delay: 20 + changed_when: false + +- name: "[etcd] Display etcd member list" + shell: | + ETCD_POD=$(oc get pods -n openshift-etcd -l app=etcd -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl member list -w table 2>/dev/null || echo "Could not query etcd members" + environment: "{{ sno_oc_env }}" + register: etcd_members + changed_when: false + failed_when: false + +- name: "[etcd] Members" + debug: + msg: "{{ etcd_members.stdout }}" + +- name: "[etcd] Display etcd endpoint health" + shell: | + ETCD_POD=$(oc get pods -n openshift-etcd -l app=etcd -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint health --cluster -w table 2>/dev/null || echo "Could not query endpoint health" + environment: "{{ sno_oc_env }}" + register: etcd_health + changed_when: false + 
failed_when: false + +- name: "[etcd] Health" + debug: + msg: "{{ etcd_health.stdout }}" diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-nodes.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-nodes.yml new file mode 100644 index 0000000..f02335f --- /dev/null +++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-nodes.yml @@ -0,0 +1,48 @@ +--- +- name: "[wait-nodes] Poll for CSR approval and node join" + shell: | + PENDING=$(oc get csr -o json 2>/dev/null | python3 -c " + import json, sys + data = json.load(sys.stdin) + pending = [] + for csr in data.get('items', []): + status = csr.get('status', {}) + if not status.get('conditions'): + name = csr['metadata']['name'] + pending.append(name) + for name in pending: + print(name) + " 2>/dev/null || true) + + if [ -n "$PENDING" ]; then + for CSR in $PENDING; do + oc adm certificate approve "$CSR" 2>/dev/null || true + echo "Approved CSR: $CSR" + done + fi + + READY_COUNT=$(oc get nodes --no-headers 2>/dev/null | grep -c ' Ready' || echo "0") + echo "READY_NODES=$READY_COUNT" + + if [ "$READY_COUNT" -ge 3 ]; then + echo "RESULT=done" + else + echo "RESULT=pending" + fi + environment: "{{ sno_oc_env }}" + register: node_poll + until: "'RESULT=done' in node_poll.stdout" + retries: "{{ (sno_node_join_timeout_minutes | int * 60 / 20) | int }}" + delay: 20 + changed_when: false + +- name: "[wait-nodes] Display final node list" + shell: | + oc get nodes -o wide + environment: "{{ sno_oc_env }}" + register: node_list + changed_when: false + +- name: "[wait-nodes] Nodes joined" + debug: + msg: "{{ node_list.stdout }}" diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/auto-install.ign.j2 b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/auto-install.ign.j2 new file mode 100644 index 0000000..5bdd58e --- /dev/null +++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/auto-install.ign.j2 @@ 
-0,0 +1,25 @@ +{ + "ignition": { + "version": "3.2.0" + }, + "storage": { + "files": [ + { + "path": "/etc/master.ign", + "mode": 420, + "contents": { + "source": "data:text/plain;charset=utf-8;base64,{{ sno_master_ign_b64 }}" + } + } + ] + }, + "systemd": { + "units": [ + { + "name": "auto-install.service", + "enabled": true, + "contents": "[Unit]\nDescription=Auto-install RHCOS to disk\nAfter=network-online.target\nWants=network-online.target\n\n[Service]\nType=oneshot\nRemainAfterExit=yes\nExecStart=/usr/bin/coreos-installer install /dev/vda --ignition-file /etc/master.ign --insecure\nExecStartPost=/usr/bin/systemctl reboot\n\n[Install]\nWantedBy=multi-user.target\n" + } + ] + } +} diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/master.ign.j2 b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/master.ign.j2 new file mode 100644 index 0000000..d2c5b01 --- /dev/null +++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/master.ign.j2 @@ -0,0 +1,21 @@ +{ + "ignition": { + "version": "3.2.0", + "security": { + "tls": { + "certificateAuthorities": [ + { + "source": "data:text/plain;charset=utf-8;base64,{{ sno_mcs_ca_b64 }}" + } + ] + } + }, + "config": { + "merge": [ + { + "source": "https://api-int.{{ sno_cluster_domain }}:22623/config/master" + } + ] + } + } +} diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/vars/main.yml b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/vars/main.yml new file mode 100644 index 0000000..87e52bb --- /dev/null +++ b/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/vars/main.yml @@ -0,0 +1,22 @@ +--- +sno_dev_scripts_path: "{{ dev_scripts_path | default('openshift-metal3/dev-scripts') }}" +sno_kubeconfig_resolved: >- + {{ sno_kubeconfig if sno_kubeconfig + else sno_dev_scripts_path ~ '/ocp/' ~ sno_cluster_name ~ '/auth/kubeconfig' }} +sno_addnhosts_path: "/var/lib/libvirt/dnsmasq/{{ sno_libvirt_bridge }}.addnhosts" +sno_lease_file: 
"/var/lib/libvirt/dnsmasq/{{ sno_libvirt_bridge }}.status" +sno_nm_dnsmasq_conf: "/etc/NetworkManager/dnsmasq.d/openshift-{{ sno_cluster_name }}.conf" + +sno_oc_env: + KUBECONFIG: "{{ sno_kubeconfig_resolved }}" + PATH: "/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:{{ ansible_env.PATH | default('/usr/local/bin:/usr/bin') }}" + +sno_new_nodes: + - index: 1 + name: "{{ sno_cluster_name }}_master_1" + hostname: "master-1" + ip: "{{ sno_master1_ip }}" + - index: 2 + name: "{{ sno_cluster_name }}_master_2" + hostname: "master-2" + ip: "{{ sno_master2_ip }}" diff --git a/deploy/openshift-clusters/scripts/sno-to-3node.sh b/deploy/openshift-clusters/scripts/sno-to-3node.sh new file mode 100755 index 0000000..095ccd0 --- /dev/null +++ b/deploy/openshift-clusters/scripts/sno-to-3node.sh @@ -0,0 +1,21 @@ +#!/usr/bin/bash +set -euo pipefail + +SCRIPT_DIR=$(dirname "$0") +DEPLOY_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# shellcheck source=/dev/null +source "${DEPLOY_DIR}/aws-hypervisor/scripts/common.sh" + +if [[ ! -f "$(get_node_dir)/aws-instance-id" ]]; then + echo "Error: No instance found. Run 'make deploy' first." + exit 1 +fi + +if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/inventory.ini" ]]; then + echo "Error: inventory.ini not found. Run 'make inventory' first." + exit 1 +fi + +cd "${DEPLOY_DIR}/openshift-clusters" +ansible-playbook sno-to-3node.yml -e "interactive_mode=false" -i inventory.ini "$@" diff --git a/deploy/openshift-clusters/sno-to-3node.yml b/deploy/openshift-clusters/sno-to-3node.yml new file mode 100644 index 0000000..8fd93bf --- /dev/null +++ b/deploy/openshift-clusters/sno-to-3node.yml @@ -0,0 +1,23 @@ +--- +- hosts: metal_machine + gather_facts: no + force_handlers: yes + + pre_tasks: + - name: Confirm SNO to 3-node transition + ansible.builtin.pause: + prompt: >- + This will transition the SNO cluster to a 3-node HA cluster. + 2 new VMs will be created. CVO will be scaled to 0. + Press Enter to proceed or Ctrl+C to abort. 
+ delegate_to: localhost + run_once: true + when: interactive_mode | default(true) | bool + + roles: + - sno-expand/sno-to-3node + + tasks: + - name: Transition complete + ansible.builtin.debug: + msg: "SNO to 3-node HA transition completed successfully." From 33a331b5926dd2f718b638116a9c41cc4240d832 Mon Sep 17 00:00:00 2001 From: ehila Date: Thu, 25 Jun 2026 14:32:49 -0400 Subject: [PATCH 2/3] feat: add sno deployment config Signed-off-by: ehila --- deploy/Makefile | 6 +++ .../dev-scripts/install-dev/files/.gitignore | 1 + .../install-dev/files/config_sno_example.sh | 40 +++++++++++++++++++ .../dev-scripts/install-dev/handlers/main.yml | 5 +++ .../dev-scripts/install-dev/tasks/config.yml | 1 + .../dev-scripts/install-dev/vars/main.yml | 1 + .../scripts/deploy-cluster.sh | 6 +-- 7 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 deploy/openshift-clusters/roles/dev-scripts/install-dev/files/config_sno_example.sh diff --git a/deploy/Makefile b/deploy/Makefile index 4429523..e6c65f2 100644 --- a/deploy/Makefile +++ b/deploy/Makefile @@ -73,6 +73,12 @@ arbiter-ipi: arbiter-agent: @./openshift-clusters/scripts/deploy-cluster.sh --topology arbiter --method agent +sno-ipi: + @./openshift-clusters/scripts/deploy-cluster.sh --topology sno --method ipi + +sno-agent: + @./openshift-clusters/scripts/deploy-cluster.sh --topology sno --method agent + arbiter-kcli: @./openshift-clusters/scripts/deploy-cluster.sh --topology arbiter --method kcli diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/.gitignore b/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/.gitignore index 6ad1577..0818832 100644 --- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/.gitignore +++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/.gitignore @@ -3,3 +3,4 @@ ci_token clusterbot-ci_token config_arbiter.sh config_fencing.sh +config_sno.sh \ No newline at end of file diff --git 
a/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/config_sno_example.sh b/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/config_sno_example.sh new file mode 100644 index 0000000..f690e6f --- /dev/null +++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/files/config_sno_example.sh @@ -0,0 +1,40 @@ +#!/bin/bash + + +# Please copy one of the config values below for IPI or Agent based installs into your +# config. +# BEGIN IPI Specific Install Config Variables +export IP_STACK="v4" +export NUM_WORKERS=0 +export MASTER_MEMORY=32768 +export MASTER_DISK=100 +export MASTER_VCPU=4 +export NUM_MASTERS=1 +## END IPI Specific Install Config Variables + +## BEGIN Agent Specific Install Config Variables +export AGENT_E2E_TEST_SCENARIO="SNO_IPV4" +# Sets the install-config.yaml's platform type. +# The default is 'baremetal'. +# See https://github.com/openshift-metal3/dev-scripts/blob/master/config_example.sh for more details on this variable and its effects. +#export AGENT_PLATFORM_TYPE=none +## END Agent Specific Install Config Variables +#### + +# TechPreview FeatureSet not needed for 4.20 and above OCP +# export FEATURE_SET="TechPreviewNoUpgrade" +export OPENSHIFT_CI="true" + +# If you want to avoid using the CI_TOKEN, uncomment this variable, but it has side effects. +# You can read more on this here: https://github.com/openshift-metal3/dev-scripts/blob/3f070cfd36977381a186cadfb44887856d652bed/config_example.sh#L21 +# export OPENSHIFT_CI="true" + +# You can find the latest public images in https://quay.io/repository/openshift-release-dev/ocp-release?tab=tags +# and select your preferred version. 
Public sources can be found at https://mirror.openshift.com/pub/openshift-v4/ + +export OPENSHIFT_RELEASE_IMAGE=quay.io/openshift-release-dev/ocp-release:4.21.0-x86_64 +# Unless you need to override the installer image, this is not needed +# export OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE="" + +# Disable sigstore image verification during installation +export OPENSHIFT_INSTALL_EXPERIMENTAL_DISABLE_IMAGE_POLICY=true diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml b/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml index 4f3761e..d0a6ea9 100644 --- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml +++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml @@ -12,6 +12,11 @@ - kubeconfig_path is defined - kubeconfig_stat.stat.exists | default(false) changed_when: false + register: oc_project_result + retries: 5 + delay: 15 + until: oc_project_result.rc == 0 + failed_when: false listen: Set OCP project - name: Warn about missing kubeconfig diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml b/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml index 8411440..c973118 100644 --- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml +++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml @@ -40,6 +40,7 @@ expected_prefix: arbiter: "TNA" fencing: "TNF" + sno: "SNO" fail: msg: >- Config file {{ config_file[method] }} has AGENT_E2E_TEST_SCENARIO="{{ config_scenario }}" diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/vars/main.yml b/deploy/openshift-clusters/roles/dev-scripts/install-dev/vars/main.yml index 35a6bad..9d2fb7c 100644 --- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/vars/main.yml +++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/vars/main.yml @@ -8,6 +8,7 @@ supported_methods: supported_topologies: - arbiter - 
fencing + - sno config_file: ipi: config_{{topology}}.sh agent: config_{{topology}}.sh diff --git a/deploy/openshift-clusters/scripts/deploy-cluster.sh b/deploy/openshift-clusters/scripts/deploy-cluster.sh index 746a425..6fc1761 100755 --- a/deploy/openshift-clusters/scripts/deploy-cluster.sh +++ b/deploy/openshift-clusters/scripts/deploy-cluster.sh @@ -49,7 +49,7 @@ done # Validate required arguments if [[ -z "${TOPOLOGY}" ]]; then - echo "Error: --topology is required (arbiter or fencing)" + echo "Error: --topology is required (arbiter, fencing, or sno)" exit 1 fi @@ -59,8 +59,8 @@ if [[ -z "${METHOD}" ]]; then fi # Validate topology value -if [[ "${TOPOLOGY}" != "arbiter" && "${TOPOLOGY}" != "fencing" ]]; then - echo "Error: Invalid topology '${TOPOLOGY}'. Must be 'arbiter' or 'fencing'." +if [[ "${TOPOLOGY}" != "arbiter" && "${TOPOLOGY}" != "fencing" && "${TOPOLOGY}" != "sno" ]]; then + echo "Error: Invalid topology '${TOPOLOGY}'. Must be 'arbiter', 'fencing', or 'sno'." exit 1 fi From 497dfcc9f5386356581ce6cffe83f5bab9fd602e Mon Sep 17 00:00:00 2001 From: ehila Date: Fri, 26 Jun 2026 02:17:21 -0400 Subject: [PATCH 3/3] upkeep: rename and validate flow for sno to 3node Signed-off-by: ehila --- deploy/Makefile | 4 + .../clean-mutable-topology.yml | 43 ++++++ .../sno-to-3node/defaults/main.yml | 0 .../sno-to-3node/tasks/boot-nodes.yml | 53 ++++++-- .../sno-to-3node/tasks/clean.yml | 124 ++++++++++++++++++ .../sno-to-3node/tasks/create-vms.yml | 2 + .../sno-to-3node/tasks/ignition.yml | 0 .../sno-to-3node/tasks/main.yml | 0 .../sno-to-3node/tasks/mco-rollout.yml | 37 +++--- .../sno-to-3node/tasks/preflight.yml | 20 ++- .../sno-to-3node/tasks/topology.yml | 0 .../sno-to-3node/tasks/update-dns.yml | 0 .../sno-to-3node/tasks/verify.yml | 0 .../sno-to-3node/tasks/wait-etcd.yml | 0 .../sno-to-3node/tasks/wait-nodes.yml | 0 .../templates/auto-install.ign.j2 | 0 .../sno-to-3node/templates/master.ign.j2 | 0 .../sno-to-3node/vars/main.yml | 0 
.../scripts/clean-mutable-topology.sh | 21 +++ .../scripts/sno-to-3node.sh | 2 +- deploy/openshift-clusters/sno-to-3node.yml | 2 +- 21 files changed, 278 insertions(+), 30 deletions(-) create mode 100644 deploy/openshift-clusters/clean-mutable-topology.yml rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/defaults/main.yml (100%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/boot-nodes.yml (60%) create mode 100644 deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/clean.yml rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/create-vms.yml (96%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/ignition.yml (100%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/main.yml (100%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/mco-rollout.yml (57%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/preflight.yml (83%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/topology.yml (100%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/update-dns.yml (100%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/verify.yml (100%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/wait-etcd.yml (100%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/tasks/wait-nodes.yml (100%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/templates/auto-install.ign.j2 (100%) rename deploy/openshift-clusters/roles/{sno-expand => mutable-topology}/sno-to-3node/templates/master.ign.j2 (100%) rename deploy/openshift-clusters/roles/{sno-expand => 
mutable-topology}/sno-to-3node/vars/main.yml (100%) create mode 100755 deploy/openshift-clusters/scripts/clean-mutable-topology.sh diff --git a/deploy/Makefile b/deploy/Makefile index e6c65f2..68efb32 100644 --- a/deploy/Makefile +++ b/deploy/Makefile @@ -52,6 +52,9 @@ full-clean: clean-spoke: @./openshift-clusters/scripts/clean-spoke.sh +clean-mutable-topology: + @./openshift-clusters/scripts/clean-mutable-topology.sh + ssh: @./aws-hypervisor/scripts/ssh.sh @@ -133,6 +136,7 @@ help: @echo " clean - Clean OpenShift cluster using dev-scripts clean target" @echo " full-clean - Fully clean instance cache and OpenShift cluster using dev-scripts realclean target" @echo " clean-spoke - Clean spoke cluster resources (VMs, network, auth) from assisted installer" + @echo " clean-mutable-topology - Remove master-1/2 VMs, disks, DHCP/DNS entries (sno-to-3node cleanup)" @echo " patch-nodes - Build resource-agents RPM and patch cluster nodes (default version: 4.11)" @echo "" @echo "Cluster Utilities:" diff --git a/deploy/openshift-clusters/clean-mutable-topology.yml b/deploy/openshift-clusters/clean-mutable-topology.yml new file mode 100644 index 0000000..c5113c3 --- /dev/null +++ b/deploy/openshift-clusters/clean-mutable-topology.yml @@ -0,0 +1,43 @@ +--- +- hosts: metal_machine + gather_facts: yes + + pre_tasks: + - name: Confirm mutable topology cleanup + ansible.builtin.pause: + prompt: >- + This will destroy the master-1 and master-2 VMs, their disks, DHCP reservations, + and DNS entries. master-0 (if still present) will be unaffected. + Press Enter to proceed or Ctrl+C to abort. 
+ delegate_to: localhost + run_once: true + when: interactive_mode | default(true) | bool + + - name: Detect cluster domain from kubeconfig (best-effort) + shell: | + KUBECONFIG={{ dev_scripts_path | default('openshift-metal3/dev-scripts') }}/ocp/{{ sno_cluster_name | default('ostest') }}/auth/kubeconfig \ + oc get infrastructure cluster -o jsonpath='{.status.apiServerInternalURI}' 2>/dev/null \ + | sed 's|^https://api-int\.||; s|:6443$||' + register: detected_domain + changed_when: false + failed_when: false + ignore_errors: true + + - name: Set cluster domain (use detected, variable override, or default) + set_fact: + sno_cluster_domain: >- + {{ sno_cluster_domain + if (sno_cluster_domain is defined and sno_cluster_domain) + else (detected_domain.stdout | trim + if (detected_domain.stdout is defined and detected_domain.stdout | trim) + else 'ostest.test.metalkube.org') }} + + tasks: + - name: Run mutable topology cleanup + import_role: + name: mutable-topology/sno-to-3node + tasks_from: clean.yml + + - name: Cleanup complete + ansible.builtin.debug: + msg: "Mutable topology cleanup complete. master-1 and master-2 VMs have been removed." 
diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/defaults/main.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/defaults/main.yml similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/defaults/main.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/defaults/main.yml diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/boot-nodes.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/boot-nodes.yml similarity index 60% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/boot-nodes.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/boot-nodes.yml index 4ad9a2d..963943d 100644 --- a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/boot-nodes.yml +++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/boot-nodes.yml @@ -67,10 +67,12 @@ --os-variant rhel9.0 \ --graphics none \ --noautoconsole \ - --boot hd,cdrom + --boot loader=/usr/share/edk2/ovmf/OVMF_CODE.fd,loader_ro=yes,loader_type=pflash,nvram_template=/usr/share/edk2/ovmf/OVMF_VARS.fd,loader_secure=no \ + --boot hd,cdrom \ + --tpm none loop: "{{ sno_new_nodes }}" -- name: "[boot] Verify VMs are running" +- name: "[boot] Verify VMs are running (initial install boot)" shell: | sudo virsh domstate {{ item.name }} register: vm_state @@ -78,17 +80,52 @@ changed_when: false failed_when: "'running' not in vm_state.stdout" -- name: "[boot] Set on_reboot to restart (not destroy)" +- name: "[boot] Wait for coreos-installer to complete (VM will power off)" + # coreos-installer runs ExecStartPost=systemctl reboot, but RHCOS live issues + # an ACPI poweroff rather than a reset. libvirt fires on_poweroff=destroy so + # the VM shuts off. We poll until shut off, then boot from disk below. 
shell: | - sudo virsh dumpxml {{ item.name }} | grep -q 'destroy' && \ - sudo virt-xml {{ item.name }} --edit --events on_reboot=restart 2>/dev/null || true + for i in $(seq 50); do + STATE=$(sudo virsh domstate {{ item.name }} 2>/dev/null || echo "unknown") + if echo "$STATE" | grep -q "shut off"; then + echo "{{ item.name }} shut off after $((i * 20))s - install complete" + exit 0 + fi + sleep 20 + done + echo "Timeout: {{ item.name }} did not shut off within 1000s" + exit 1 + loop: "{{ sno_new_nodes }}" + changed_when: false + +- name: "[boot] Remove CDROM from boot order after install" + # Prevent coreos-installer loop: strip the cdrom boot entry so UEFI only + # tries the hard disk on subsequent boots. + shell: | + TMPXML=$(mktemp /tmp/vm-XXXXXX.xml) + sudo virsh dumpxml {{ item.name }} > "$TMPXML" + sudo sed -i "//d" "$TMPXML" + sudo virsh define "$TMPXML" + sudo rm -f "$TMPXML" + loop: "{{ sno_new_nodes }}" + changed_when: true + +- name: "[boot] Start VMs to boot from installed RHCOS" + shell: | + sudo virsh start {{ item.name }} loop: "{{ sno_new_nodes }}" - failed_when: false changed_when: true +- name: "[boot] Verify VMs are running from installed disk" + shell: | + sudo virsh domstate {{ item.name }} + register: vm_state_disk + loop: "{{ sno_new_nodes }}" + changed_when: false + failed_when: "'running' not in vm_state_disk.stdout" + - name: "[boot] VMs booted with RHCOS" debug: msg: >- - {{ sno_new_nodes | length }} VMs booted with RHCOS ISO (ignition embedded). - coreos-installer will write to disk and reboot automatically. + {{ sno_new_nodes | length }} VMs installed and started from disk. Waiting for nodes to join the cluster... 
diff --git a/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/clean.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/clean.yml new file mode 100644 index 0000000..5c7d411 --- /dev/null +++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/clean.yml @@ -0,0 +1,124 @@ +--- +- name: "[clean] Stop and undefine new VMs" + shell: | + sudo virsh destroy {{ item.name }} 2>/dev/null || true + sudo virsh undefine {{ item.name }} --remove-all-storage 2>/dev/null || true + loop: "{{ sno_new_nodes }}" + failed_when: false + changed_when: true + +- name: "[clean] Remove disk images" + file: + path: "{{ sno_vm_image_dir }}/{{ item.name }}.qcow2" + state: absent + loop: "{{ sno_new_nodes }}" + become: true + failed_when: false + +- name: "[clean] Remove NVRAM files" + file: + path: "/var/lib/libvirt/qemu/nvram/{{ item.name }}_VARS.fd" + state: absent + loop: "{{ sno_new_nodes }}" + become: true + failed_when: false + +- name: "[clean] Remove per-node RHCOS ISOs" + file: + path: "/var/lib/libvirt/images/rhcos-{{ item.hostname }}.iso" + state: absent + loop: "{{ sno_new_nodes }}" + become: true + failed_when: false + +- name: "[clean] Remove DHCP reservations from libvirt network" + shell: | + NET_XML=$(sudo virsh net-dumpxml {{ sno_libvirt_network }} 2>/dev/null || true) + EXISTING_MAC=$(echo "$NET_XML" | python3 -c " + import xml.etree.ElementTree as ET, sys + root = ET.parse(sys.stdin).getroot() + for host in root.findall('.//dhcp/host'): + if host.get('ip') == '{{ item.ip }}': + print(host.get('mac','')) + " 2>/dev/null || true) + + if [ -n "$EXISTING_MAC" ]; then + sudo virsh net-update {{ sno_libvirt_network }} delete ip-dhcp-host \ + "" --live --config 2>/dev/null || true + echo "Removed DHCP reservation for {{ item.ip }} (mac=${EXISTING_MAC})" + else + echo "No DHCP reservation found for {{ item.ip }}" + fi + loop: "{{ sno_new_nodes }}" + failed_when: false + changed_when: true + +- name: "[clean] Remove DNS 
host entries from libvirt network" + shell: | + sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \ + "{{ item.hostname }}.{{ sno_cluster_domain }}" \ + --live --config 2>/dev/null || true + sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \ + "{{ item.hostname }}" \ + --live --config 2>/dev/null || true + loop: "{{ sno_new_nodes }}" + failed_when: false + changed_when: true + +- name: "[clean] Remove api-int from libvirt network DNS" + shell: | + sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \ + "api-int.{{ sno_cluster_domain }}" \ + --live --config 2>/dev/null || true + failed_when: false + changed_when: true + +- name: "[clean] Remove entries from addnhosts" + lineinfile: + path: "{{ sno_addnhosts_path }}" + regexp: "^{{ item.ip }}\\s" + state: absent + loop: "{{ sno_new_nodes }}" + become: true + failed_when: false + +- name: "[clean] Remove hostname entries from addnhosts" + lineinfile: + path: "{{ sno_addnhosts_path }}" + regexp: "\\s{{ item.hostname }}\\.{{ sno_cluster_domain }}(\\s|$)" + state: absent + loop: "{{ sno_new_nodes }}" + become: true + failed_when: false + +- name: "[clean] Flush DHCP leases for new node IPs" + shell: | + python3 -c " + import json, os + lf = '{{ sno_lease_file }}' + if not os.path.exists(lf) or os.path.getsize(lf) == 0: + exit(0) + with open(lf) as f: + leases = json.load(f) + reserved = {{ [sno_master1_ip, sno_master2_ip] | to_json }} + before = len(leases) + leases = [l for l in leases if l.get('ip-address') not in reserved] + with open(lf, 'w') as f: + json.dump(leases, f, indent=2) + print(f'Flushed {before - len(leases)} leases') + " + become: true + failed_when: false + changed_when: true + +- name: "[clean] Reload libvirt dnsmasq" + shell: | + sudo kill -HUP $(cat /run/libvirt/network/{{ sno_libvirt_bridge }}.pid) 2>/dev/null || true + changed_when: true + failed_when: false + +- name: "[clean] Cleanup complete" + debug: + msg: >- + Removed VMs: {{ sno_new_nodes | 
map(attribute='name') | list | join(', ') }}. + Disk images, NVRAM, ISOs, DHCP reservations, and DNS entries removed. diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/create-vms.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/create-vms.yml similarity index 96% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/create-vms.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/create-vms.yml index d5ee1c9..1779afd 100644 --- a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/create-vms.yml +++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/create-vms.yml @@ -112,7 +112,9 @@ --os-variant rhel9.0 \ --graphics none \ --noautoconsole \ + --boot loader=/usr/share/edk2/ovmf/OVMF_CODE.fd,loader_ro=yes,loader_type=pflash,nvram_template=/usr/share/edk2/ovmf/OVMF_VARS.fd,loader_secure=no \ --boot hd \ + --tpm none \ --noreboot \ --import \ --print-xml | sudo virsh define /dev/stdin diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/ignition.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/ignition.yml similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/ignition.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/ignition.yml diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/main.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/main.yml similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/main.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/main.yml diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/mco-rollout.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/mco-rollout.yml similarity index 57% rename from 
deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/mco-rollout.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/mco-rollout.yml index 4701edd..7777274 100644 --- a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/mco-rollout.yml +++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/mco-rollout.yml @@ -30,32 +30,37 @@ print(conds.get('Degraded','Unknown')) ") - NODE_SCHED=$(oc get nodes --no-headers -o custom-columns=SCHED:.spec.unschedulable 2>/dev/null | head -1) + CORDONED_NODES=$(oc get nodes --no-headers 2>/dev/null | grep SchedulingDisabled | awk '{print $1}' || true) - echo "STATUS=updated:${UPDATED},updating:${UPDATING},degraded:${DEGRADED},cordoned:${NODE_SCHED}" + echo "STATUS=updated:${UPDATED},updating:${UPDATING},degraded:${DEGRADED},cordoned:${CORDONED_NODES:-none}" if [ "$UPDATED" = "True" ] && [ "$UPDATING" = "False" ] && [ "$DEGRADED" = "False" ]; then echo "RESULT=done" exit 0 fi - # Auto-fix drain deadlock if node is cordoned and auto-fix is enabled - if [ "$NODE_SCHED" = "true" ] && [ "{{ sno_auto_fix_drain | default(true) | bool }}" = "True" ]; then - NODE_NAME=$(oc get nodes --no-headers -o custom-columns=NAME:.metadata.name | head -1) - echo "DRAIN_FIX: Node $NODE_NAME is cordoned, applying drain deadlock fix..." + # Auto-fix drain deadlock for any cordoned node. + # On SNO→3node, MCD cordons a node before draining. If MCC cannot reach + # the node (api-int flap, RBAC timing) the drain annotation never advances + # and the node stays cordoned forever. Fix: uncordon, set lastAppliedDrain + # to the drain- value MCC is waiting for, restart MCC. + if [ -n "$CORDONED_NODES" ] && [ "{{ sno_auto_fix_drain | default(true) | bool }}" = "True" ]; then + DESIRED=$(oc get mcp master -o jsonpath='{.spec.configuration.name}' 2>/dev/null || echo "") + for NODE_NAME in $CORDONED_NODES; do + echo "DRAIN_FIX: Fixing drain deadlock on $NODE_NAME (desired=${DESIRED})..." 
- # Uncordon - oc adm uncordon "$NODE_NAME" 2>&1 || true + oc adm uncordon "$NODE_NAME" 2>&1 || true - # Set lastAppliedDrain annotation - DESIRED=$(oc get node "$NODE_NAME" -o jsonpath='{.metadata.annotations.machine\.openshift\.io/desired-drain}' 2>/dev/null || echo "") - if [ -n "$DESIRED" ]; then - oc annotate node "$NODE_NAME" "machine.openshift.io/last-applied-drain=$DESIRED" --overwrite 2>&1 || true - echo "DRAIN_FIX: Set lastAppliedDrain=$DESIRED" - fi + if [ -n "$DESIRED" ]; then + oc annotate node "$NODE_NAME" \ + "machineconfiguration.openshift.io/lastAppliedDrain=drain-${DESIRED}" \ + --overwrite 2>&1 || true + echo "DRAIN_FIX: Set lastAppliedDrain=drain-${DESIRED} on $NODE_NAME" + fi + done - # Restart MCC pod - oc delete pod -n openshift-machine-config-operator -l k8s-app=machine-config-controller 2>&1 || true + oc delete pod -n openshift-machine-config-operator \ + -l k8s-app=machine-config-controller 2>&1 || true echo "DRAIN_FIX: Restarted MCC pod" fi diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/preflight.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/preflight.yml similarity index 83% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/preflight.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/preflight.yml index 7af662e..cf59970 100644 --- a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/preflight.yml +++ b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/preflight.yml @@ -126,12 +126,21 @@ changed_when: false failed_when: false -- name: "[preflight] Add api-int to addnhosts if missing" +- name: "[preflight] Add api-int to libvirt network DNS" + # Using virsh net-update (not lineinfile) so that api-int survives addnhosts + # regeneration when create-vms.yml calls virsh net-update for master-1/2. + # libvirt regenerates ostestbm.addnhosts from the network XML on every net-update. 
shell: | - echo "{{ sno_master0_ip }} api-int.{{ sno_cluster_domain }} api-int" | \ - sudo tee -a {{ sno_addnhosts_path }} - sudo kill -HUP $(cat /run/libvirt/network/{{ sno_libvirt_bridge }}.pid) 2>/dev/null || true + # Remove any stale api-int entry, matched by hostname (idempotent, ignore errors) + sudo virsh net-update {{ sno_libvirt_network }} delete dns-host \ + "<host><hostname>api-int.{{ sno_cluster_domain }}</hostname></host>" \ + --live --config 2>/dev/null || true + # Add the correct entry - triggers addnhosts regeneration with api-int included + sudo virsh net-update {{ sno_libvirt_network }} add dns-host \ + "<host ip='{{ sno_master0_ip }}'><hostname>api-int.{{ sno_cluster_domain }}</hostname></host>" \ + --live --config when: dns_apiint.stdout | trim == "" or sno_master0_ip not in dns_apiint.stdout + changed_when: true - name: "[preflight] Verify api-int DNS after fix" shell: | @@ -139,6 +148,9 @@ register: dns_apiint_verify changed_when: false failed_when: dns_apiint_verify.stdout | trim == "" + retries: 3 + delay: 2 + until: dns_apiint_verify.stdout | trim != "" when: dns_apiint.stdout | trim == "" or sno_master0_ip not in dns_apiint.stdout - name: "[preflight] Preflight checks passed" diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/topology.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/topology.yml similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/topology.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/topology.yml diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/update-dns.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/update-dns.yml similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/update-dns.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/update-dns.yml diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/verify.yml 
b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/verify.yml similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/verify.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/verify.yml diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-etcd.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/wait-etcd.yml similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-etcd.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/wait-etcd.yml diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-nodes.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/wait-nodes.yml similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/tasks/wait-nodes.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/tasks/wait-nodes.yml diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/auto-install.ign.j2 b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/templates/auto-install.ign.j2 similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/auto-install.ign.j2 rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/templates/auto-install.ign.j2 diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/master.ign.j2 b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/templates/master.ign.j2 similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/templates/master.ign.j2 rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/templates/master.ign.j2 diff --git a/deploy/openshift-clusters/roles/sno-expand/sno-to-3node/vars/main.yml b/deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/vars/main.yml 
similarity index 100% rename from deploy/openshift-clusters/roles/sno-expand/sno-to-3node/vars/main.yml rename to deploy/openshift-clusters/roles/mutable-topology/sno-to-3node/vars/main.yml diff --git a/deploy/openshift-clusters/scripts/clean-mutable-topology.sh b/deploy/openshift-clusters/scripts/clean-mutable-topology.sh new file mode 100755 index 0000000..adedd35 --- /dev/null +++ b/deploy/openshift-clusters/scripts/clean-mutable-topology.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR=$(dirname "$0") +DEPLOY_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# shellcheck source=/dev/null +source "${DEPLOY_DIR}/aws-hypervisor/scripts/common.sh" + +if [[ ! -f "$(get_node_dir)/aws-instance-id" ]]; then + echo "Error: No instance found. Run 'make deploy' first." + exit 1 +fi + +if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/inventory.ini" ]]; then + echo "Error: inventory.ini not found. Run 'make inventory' first." + exit 1 +fi + +cd "${DEPLOY_DIR}/openshift-clusters" +ansible-playbook clean-mutable-topology.yml -e "interactive_mode=false" -i inventory.ini "$@" diff --git a/deploy/openshift-clusters/scripts/sno-to-3node.sh b/deploy/openshift-clusters/scripts/sno-to-3node.sh index 095ccd0..fab58d1 100755 --- a/deploy/openshift-clusters/scripts/sno-to-3node.sh +++ b/deploy/openshift-clusters/scripts/sno-to-3node.sh @@ -1,4 +1,4 @@ -#!/usr/bin/bash +#!/bin/bash set -euo pipefail SCRIPT_DIR=$(dirname "$0") diff --git a/deploy/openshift-clusters/sno-to-3node.yml b/deploy/openshift-clusters/sno-to-3node.yml index 8fd93bf..1b549ce 100644 --- a/deploy/openshift-clusters/sno-to-3node.yml +++ b/deploy/openshift-clusters/sno-to-3node.yml @@ -15,7 +15,7 @@ when: interactive_mode | default(true) | bool roles: - - sno-expand/sno-to-3node + - mutable-topology/sno-to-3node tasks: - name: Transition complete