diff --git a/roles/scale_ocp_workers/README.md b/roles/scale_ocp_workers/README.md
new file mode 100644
index 0000000..8907d31
--- /dev/null
+++ b/roles/scale_ocp_workers/README.md
@@ -0,0 +1,37 @@
+# scale_ocp_workers
+
+Scales OpenShift worker nodes on AWS by managing MachineSets.
+
+## Requirements
+
+The role talks to the cluster API, so `openshift_api_url` and
+`openshift_cluster_admin_token` must be set for every invocation.
+`openshift_validate_certs` is optional and defaults to `false`.
+
+## Usage
+
+Scale to 3 workers using the existing instance type:
+
+```yaml
+- name: Scale OCP workers
+  ansible.builtin.include_role:
+    name: agnosticd.cloud_provider_aws.scale_ocp_workers
+  vars:
+    worker_instance_count: 3
+```
+
+Scale to 5 workers with a different instance type:
+
+```yaml
+- name: Scale OCP workers with custom instance type
+  ansible.builtin.include_role:
+    name: agnosticd.cloud_provider_aws.scale_ocp_workers
+  vars:
+    worker_instance_count: 5
+    worker_instance_type: m5.2xlarge
+    worker_machineset_suffix: compute
+```
+
+## Variables
+
+See [defaults/main.yml](defaults/main.yml) for all variables and their defaults.
diff --git a/roles/scale_ocp_workers/defaults/main.yml b/roles/scale_ocp_workers/defaults/main.yml
new file mode 100644
index 0000000..ad56828
--- /dev/null
+++ b/roles/scale_ocp_workers/defaults/main.yml
@@ -0,0 +1,35 @@
+---
+# OpenShift API authentication (URL and token are required, no defaults here)
+# openshift_api_url: https://api.cluster.example.com:6443
+# openshift_cluster_admin_token:
+# openshift_validate_certs: false
+
+# Total number of worker nodes to scale to.
+# Workers are distributed round-robin across the available availability zones.
+# NOTE: with the default of 0 the worker MachineSets are scaled to 0 replicas.
+worker_instance_count: 0
+
+# EC2 instance type for the workers.
+# If empty or equal to the type of the existing MachineSets, those are scaled.
+# If different, new MachineSets with this instance type are created.
+worker_instance_type: ""
+
+# Custom suffix for new MachineSets; required when the instance type differs.
+# Examples: "gpu", "highmem", "compute"
+# Results in names like: ocp-xjgsc-worker-us-east-2a-gpu
+worker_machineset_suffix: ""
+
+# Polling settings used while waiting for nodes to become Ready.
+# NOTE(review): worker_scale_timeout does not appear to be referenced by the
+# task files added in this change - confirm before relying on it.
+worker_scale_timeout: 600
+worker_scale_retries: 60
+worker_scale_delay: 10
+
+# CSR approval settings
+# If nodes don't become Ready within worker_scale_csr_wait_seconds,
+# the role will start approving pending CSRs
+worker_scale_approve_csrs: true
+worker_scale_csr_wait_seconds: 180
+worker_scale_csr_retries: 30
+worker_scale_csr_delay: 10
diff --git a/roles/scale_ocp_workers/tasks/approve_csrs.yml b/roles/scale_ocp_workers/tasks/approve_csrs.yml
new file mode 100644
index 0000000..fc99670
--- /dev/null
+++ b/roles/scale_ocp_workers/tasks/approve_csrs.yml
@@ -0,0 +1,107 @@
+---
+# Approve pending kubelet CSRs so that new worker nodes can join the cluster.
+# The file includes itself until enough workers are Ready or the retry budget
+# (csr_approval_retries, falling back to worker_scale_csr_retries) is used up.
+# Expects expected_worker_count to be set by the including task file.
+
+- name: Get all CSRs
+  kubernetes.core.k8s_info:
+    host: "{{ openshift_api_url }}"
+    api_key: "{{ openshift_cluster_admin_token }}"
+    validate_certs: "{{ openshift_validate_certs | default(false) }}"
+    api_version: certificates.k8s.io/v1
+    kind: CertificateSigningRequest
+  register: r_all_csrs
+
+# Pending = no Approved/Denied condition recorded yet. Only the two kubelet
+# signers are considered so that unrelated requests are never approved.
+- name: Filter for pending CSRs
+  vars:
+    kubelet_csr_signers:
+      - kubernetes.io/kube-apiserver-client-kubelet
+      - kubernetes.io/kubelet-serving
+  ansible.builtin.set_fact:
+    pending_csrs: >-
+      {{
+        r_all_csrs.resources
+        | rejectattr('status.conditions', 'defined')
+        | selectattr('spec.signerName', 'defined')
+        | selectattr('spec.signerName', 'in', kubelet_csr_signers)
+        | list
+      }}
+
+- name: Display pending CSRs
+  ansible.builtin.debug:
+    msg: "Found {{ pending_csrs | length }} pending CSR(s)"
+
+# Approval is only honoured on the CSR "approval" subresource; condition
+# changes sent to the CSR object itself are not persisted by the API server.
+# 404/409 are tolerated: the CSR was removed or updated by another approver
+# in the meantime and the next pass re-reads the current state anyway.
+- name: Approve pending CSRs
+  vars:
+    csr_api_path: /apis/certificates.k8s.io/v1/certificatesigningrequests
+    csr_approval_status:
+      status:
+        conditions:
+          - type: Approved
+            status: "True"
+            reason: AnsibleApproval
+            message: "CSR was approved by Ansible scale_ocp_workers role."
+  ansible.builtin.uri:
+    url: "{{ openshift_api_url }}{{ csr_api_path }}/{{ item.metadata.name }}/approval"
+    method: PUT
+    headers:
+      Authorization: "Bearer {{ openshift_cluster_admin_token }}"
+    validate_certs: "{{ openshift_validate_certs | default(false) }}"
+    body_format: json
+    body: "{{ item | combine(csr_approval_status, recursive=true) }}"
+    status_code: [200, 404, 409]
+  # The request headers carry the bearer token; keep them out of the output.
+  no_log: true
+  loop: "{{ pending_csrs }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+
+- name: Get current worker nodes
+  vars:
+    default_worker_node_selectors:
+      - "node-role.kubernetes.io/worker"
+      - "!node-role.kubernetes.io/master"
+  kubernetes.core.k8s_info:
+    host: "{{ openshift_api_url }}"
+    api_key: "{{ openshift_cluster_admin_token }}"
+    validate_certs: "{{ openshift_validate_certs | default(false) }}"
+    api_version: v1
+    kind: Node
+    label_selectors: "{{ worker_node_label_selectors | default(default_worker_node_selectors) }}"
+  register: r_worker_nodes
+
+- name: Count Ready worker nodes
+  ansible.builtin.set_fact:
+    ready_worker_count: >-
+      {{
+        r_worker_nodes.resources
+        | json_query("[?status.conditions[?type=='Ready' && status=='True']]")
+        | length
+      }}
+
+- name: Display current Ready count
+  ansible.builtin.debug:
+    msg: "Ready workers: {{ ready_worker_count }}/{{ expected_worker_count }}"
+
+- name: Recursively approve CSRs if nodes are not Ready yet
+  when:
+    - ready_worker_count | int < expected_worker_count | int
+    - csr_approval_retries | default(worker_scale_csr_retries) | int > 0
+  block:
+    - name: Wait before checking again
+      ansible.builtin.pause:
+        seconds: "{{ worker_scale_csr_delay }}"
+
+    - name: Decrement retry counter
+      ansible.builtin.set_fact:
+        csr_approval_retries: "{{ csr_approval_retries | default(worker_scale_csr_retries) | int - 1 }}"
+
+    - name: Recursively include CSR approval
+      ansible.builtin.include_tasks: approve_csrs.yml
diff --git a/roles/scale_ocp_workers/tasks/create_new_machinesets.yml b/roles/scale_ocp_workers/tasks/create_new_machinesets.yml
new file mode 100644
index 0000000..306b213
--- /dev/null
+++ b/roles/scale_ocp_workers/tasks/create_new_machinesets.yml
@@ -0,0 +1,78 @@
+---
+# Create new MachineSets with a different instance type.
+# Every MachineSet in existing_machinesets is cloned under the name
+# <source name>-<worker_machineset_suffix> with worker_instance_type.
+
+- name: Validate worker_machineset_suffix is provided
+  ansible.builtin.fail:
+    msg: "worker_machineset_suffix is required when creating MachineSets with a different instance type"
+  when: worker_machineset_suffix | default('') | length == 0
+
+- name: Calculate round-robin distribution
+  ansible.builtin.set_fact:
+    machineset_count: "{{ existing_machinesets | length }}"
+
+# The first (total % count) MachineSets receive one replica more than the rest.
+- name: Build replica distribution list
+  ansible.builtin.set_fact:
+    replica_distribution: >-
+      {%- set total = worker_instance_count | int -%}
+      {%- set count = machineset_count | int -%}
+      {%- set base = total // count -%}
+      {%- set remainder = total % count -%}
+      {%- set result = [] -%}
+      {%- for i in range(count) -%}
+      {%- set _ = result.append(base + (1 if i < remainder else 0)) -%}
+      {%- endfor -%}
+      {{ result }}
+
+- name: Display new MachineSet plan
+  ansible.builtin.debug:
+    msg: >-
+      Creating {{ item.0.metadata.name }}-{{ worker_machineset_suffix }}
+      with {{ item.1 }} replicas ({{ worker_instance_type }})
+  loop: "{{ existing_machinesets | zip(replica_distribution) | list }}"
+  loop_control:
+    label: "{{ item.0.metadata.name }}"
+
+# The provider settings are copied wholesale from the source MachineSet and
+# only instanceType is overridden, so fields such as tags, block devices and
+# subnet are carried over unchanged.
+- name: Create new MachineSets
+  vars:
+    new_machineset_name: "{{ item.0.metadata.name }}-{{ worker_machineset_suffix }}"
+    source_provider_spec: "{{ item.0.spec.template.spec.providerSpec.value }}"
+  kubernetes.core.k8s:
+    host: "{{ openshift_api_url }}"
+    api_key: "{{ openshift_cluster_admin_token }}"
+    validate_certs: "{{ openshift_validate_certs | default(false) }}"
+    api_version: machine.openshift.io/v1beta1
+    kind: MachineSet
+    namespace: openshift-machine-api
+    name: "{{ new_machineset_name }}"
+    state: present
+    definition:
+      metadata:
+        labels:
+          machine.openshift.io/cluster-api-cluster: "{{ cluster_infrastructure_name }}"
+      spec:
+        replicas: "{{ item.1 | int }}"
+        selector:
+          matchLabels:
+            machine.openshift.io/cluster-api-cluster: "{{ cluster_infrastructure_name }}"
+            machine.openshift.io/cluster-api-machineset: "{{ new_machineset_name }}"
+        template:
+          metadata:
+            labels:
+              machine.openshift.io/cluster-api-cluster: "{{ cluster_infrastructure_name }}"
+              machine.openshift.io/cluster-api-machine-role: worker
+              machine.openshift.io/cluster-api-machine-type: worker
+              machine.openshift.io/cluster-api-machineset: "{{ new_machineset_name }}"
+          spec:
+            lifecycleHooks: {}
+            metadata: {}
+            providerSpec:
+              value: "{{ source_provider_spec | combine({'instanceType': worker_instance_type}) }}"
+  loop: "{{ existing_machinesets | zip(replica_distribution) | list }}"
+  loop_control:
+    label: "{{ new_machineset_name }}"
diff --git a/roles/scale_ocp_workers/tasks/main.yml b/roles/scale_ocp_workers/tasks/main.yml
new file mode 100644
index 0000000..ebcd955
--- /dev/null
+++ b/roles/scale_ocp_workers/tasks/main.yml
@@ -0,0 +1,105 @@
+---
+# Entry point: picks the worker MachineSets to act on, then either scales
+# them or clones them for a new instance type, and finally waits for nodes.
+
+# A non-numeric value would silently become 0 through the int filter and
+# scale the workers down to zero, so reject it up front.
+- name: Validate worker_instance_count
+  ansible.builtin.assert:
+    that:
+      - "worker_instance_count | string is match('^[0-9]+$')"
+    fail_msg: "worker_instance_count must be a non-negative integer, got '{{ worker_instance_count }}'"
+
+- name: Scale OCP workers on AWS
+  module_defaults:
+    group/k8s:
+      host: "{{ openshift_api_url }}"
+      api_key: "{{ openshift_cluster_admin_token }}"
+      validate_certs: "{{ openshift_validate_certs | default(false) }}"
+  block:
+
+    - name: Get all MachineSets
+      kubernetes.core.k8s_info:
+        api_version: machine.openshift.io/v1beta1
+        kind: MachineSet
+        namespace: openshift-machine-api
+      register: r_all_machinesets
+
+    - name: Filter worker MachineSets
+      ansible.builtin.set_fact:
+        r_machinesets: >-
+          {%- set worker_machinesets = [] -%}
+          {%- for ms in r_all_machinesets.resources -%}
+          {%- set labels = ms.spec.template.metadata.labels | default({}) -%}
+          {%- if labels.get('machine.openshift.io/cluster-api-machine-role') == 'worker' -%}
+          {%- set _ = worker_machinesets.append(ms) -%}
+          {%- endif -%}
+          {%- endfor -%}
+          {{ worker_machinesets }}
+
+    - name: Fail if no worker MachineSets found
+      ansible.builtin.fail:
+        msg: "No worker MachineSets found in openshift-machine-api namespace"
+      when: r_machinesets | length == 0
+
+    - name: Set MachineSet facts
+      vars:
+        instance_type_path: spec.template.spec.providerSpec.value.instanceType
+      ansible.builtin.set_fact:
+        existing_instance_type: "{{ r_machinesets[0].spec.template.spec.providerSpec.value.instanceType }}"
+        cluster_infrastructure_name: "{{ r_machinesets[0].metadata.labels['machine.openshift.io/cluster-api-cluster'] }}"
+        matching_machinesets: >-
+          {{
+            r_machinesets
+            | selectattr(instance_type_path, 'defined')
+            | selectattr(instance_type_path, 'equalto', worker_instance_type | default(''))
+            | list
+          }}
+
+    # Worker MachineSets that already use the requested type are scaled in
+    # place; cloning happens only when none exists yet. This keeps repeated
+    # runs from cloning the clones created by an earlier run.
+    - name: Determine if we need new MachineSets
+      ansible.builtin.set_fact:
+        needs_new_machinesets: >-
+          {{
+            worker_instance_type | default('') | length > 0
+            and matching_machinesets | length == 0
+          }}
+        existing_machinesets: >-
+          {{ matching_machinesets if matching_machinesets | length > 0 else r_machinesets }}
+
+    - name: Get current worker node count
+      kubernetes.core.k8s_info:
+        api_version: v1
+        kind: Node
+        label_selectors:
+          - "node-role.kubernetes.io/worker"
+          - "!node-role.kubernetes.io/master"
+      register: r_worker_nodes
+
+    - name: Set current worker count
+      ansible.builtin.set_fact:
+        current_worker_count: "{{ r_worker_nodes.resources | length }}"
+
+    - name: Display current state
+      ansible.builtin.debug:
+        msg:
+          - "Cluster: {{ cluster_infrastructure_name }}"
+          - "Current workers: {{ current_worker_count }}"
+          - "Desired workers: {{ worker_instance_count }}"
+          - "Existing instance type: {{ existing_instance_type }}"
+          - "Requested instance type: {{ worker_instance_type | default('(use existing)', true) }}"
+          - "Available MachineSets: {{ existing_machinesets | map(attribute='metadata.name') | list }}"
+
+    - name: Scale existing MachineSets
+      when: not (needs_new_machinesets | bool)
+      ansible.builtin.include_tasks: scale_existing_machinesets.yml
+
+    - name: Create new MachineSets with different instance type
+      when: needs_new_machinesets | bool
+      ansible.builtin.include_tasks: create_new_machinesets.yml
+
+    - name: Wait for worker nodes to be Ready
+      when: worker_instance_count | int > 0
+      ansible.builtin.include_tasks: wait_for_nodes.yml
diff --git a/roles/scale_ocp_workers/tasks/scale_existing_machinesets.yml b/roles/scale_ocp_workers/tasks/scale_existing_machinesets.yml
new file mode 100644
index 0000000..b222070
--- /dev/null
+++ b/roles/scale_ocp_workers/tasks/scale_existing_machinesets.yml
@@ -0,0 +1,45 @@
+---
+# Scale existing MachineSets using round-robin distribution across AZs
+# Example: 5 workers across 3 AZs = [2, 2, 1]
+
+- name: Calculate round-robin distribution
+  ansible.builtin.set_fact:
+    machineset_count: "{{ existing_machinesets | length }}"
+
+# The first (total % count) MachineSets receive one replica more than the rest.
+- name: Build replica distribution list
+  ansible.builtin.set_fact:
+    replica_distribution: >-
+      {%- set total = worker_instance_count | int -%}
+      {%- set count = machineset_count | int -%}
+      {%- set base = total // count -%}
+      {%- set remainder = total % count -%}
+      {%- set result = [] -%}
+      {%- for i in range(count) -%}
+      {%- set _ = result.append(base + (1 if i < remainder else 0)) -%}
+      {%- endfor -%}
+      {{ result }}
+
+- name: Display scaling plan
+  ansible.builtin.debug:
+    msg: "Scaling {{ item.0.metadata.name }} to {{ item.1 }} replicas"
+  loop: "{{ existing_machinesets | zip(replica_distribution) | list }}"
+  loop_control:
+    label: "{{ item.0.metadata.name }}"
+
+- name: Scale MachineSets
+  kubernetes.core.k8s:
+    host: "{{ openshift_api_url }}"
+    api_key: "{{ openshift_cluster_admin_token }}"
+    validate_certs: "{{ openshift_validate_certs | default(false) }}"
+    api_version: machine.openshift.io/v1beta1
+    kind: MachineSet
+    namespace: openshift-machine-api
+    name: "{{ item.0.metadata.name }}"
+    state: present
+    definition:
+      spec:
+        replicas: "{{ item.1 | int }}"
+  loop: "{{ existing_machinesets | zip(replica_distribution) | list }}"
+  loop_control:
+    label: "{{ item.0.metadata.name }} -> {{ item.1 }} replicas"
diff --git a/roles/scale_ocp_workers/tasks/wait_for_nodes.yml b/roles/scale_ocp_workers/tasks/wait_for_nodes.yml
new file mode 100644
index 0000000..04a980c
--- /dev/null
+++ b/roles/scale_ocp_workers/tasks/wait_for_nodes.yml
@@ -0,0 +1,131 @@
+---
+# Wait for worker nodes to reach Ready status.
+# Polls for worker_scale_csr_wait_seconds first; if that is not enough the
+# role either approves pending CSRs (approve_csrs.yml) or keeps polling,
+# depending on worker_scale_approve_csrs. Fails when the expected number of
+# Ready workers is still not reached at the end.
+
+# When a specific instance type was requested only nodes of that type are
+# counted; otherwise workers of another type that are already running would
+# satisfy the wait before the new nodes exist.
+- name: Calculate expected worker count
+  vars:
+    requested_type: "{{ worker_instance_type | default('') }}"
+    instance_type_selector: "node.kubernetes.io/instance-type={{ requested_type }}"
+  ansible.builtin.set_fact:
+    expected_worker_count: "{{ worker_instance_count | int }}"
+    worker_node_label_selectors: >-
+      {{
+        ['node-role.kubernetes.io/worker', '!node-role.kubernetes.io/master']
+        + ([instance_type_selector] if requested_type | length > 0 else [])
+      }}
+
+- name: Display wait parameters
+  ansible.builtin.debug:
+    msg:
+      - "Waiting for {{ expected_worker_count }} worker nodes to be Ready"
+      - "Initial wait: {{ worker_scale_csr_wait_seconds }}s before CSR approval"
+      - "CSR approval enabled: {{ worker_scale_approve_csrs }}"
+
+# First, try waiting for the initial period without CSR approval.
+# A timeout here is expected and handled below, hence ignore_errors.
+- name: Initial wait for worker nodes (before CSR approval)
+  kubernetes.core.k8s_info:
+    host: "{{ openshift_api_url }}"
+    api_key: "{{ openshift_cluster_admin_token }}"
+    validate_certs: "{{ openshift_validate_certs | default(false) }}"
+    api_version: v1
+    kind: Node
+    label_selectors: "{{ worker_node_label_selectors }}"
+  register: r_worker_nodes
+  until: >-
+    r_worker_nodes.resources | default([])
+    | json_query("[?status.conditions[?type=='Ready' && status=='True']]")
+    | length >= expected_worker_count | int
+  retries: "{{ (worker_scale_csr_wait_seconds | int / worker_scale_delay | int) | int }}"
+  delay: "{{ worker_scale_delay }}"
+  ignore_errors: true
+
+- name: Check if initial wait succeeded
+  ansible.builtin.set_fact:
+    initial_wait_succeeded: >-
+      {{
+        r_worker_nodes.resources | default([])
+        | json_query("[?status.conditions[?type=='Ready' && status=='True']]")
+        | length >= expected_worker_count | int
+      }}
+
+# If initial wait failed and CSR approval is enabled, start approving CSRs
+- name: Approve CSRs and wait for nodes
+  when:
+    - not (initial_wait_succeeded | bool)
+    - worker_scale_approve_csrs | default(true) | bool
+  block:
+    - name: Display CSR approval message
+      ansible.builtin.debug:
+        msg: "Nodes not Ready after {{ worker_scale_csr_wait_seconds }}s - starting CSR approval"
+
+    # Facts set by an earlier run of this role persist in the play, so start
+    # every run with a full retry budget.
+    - name: Reset CSR approval retry counter
+      ansible.builtin.set_fact:
+        csr_approval_retries: "{{ worker_scale_csr_retries | int }}"
+
+    - name: Include CSR approval tasks
+      ansible.builtin.include_tasks: approve_csrs.yml
+
+# Final wait if CSR approval is disabled but initial wait failed
+- name: Final wait for worker nodes (CSR approval disabled)
+  when:
+    - not (initial_wait_succeeded | bool)
+    - not (worker_scale_approve_csrs | default(true) | bool)
+  kubernetes.core.k8s_info:
+    host: "{{ openshift_api_url }}"
+    api_key: "{{ openshift_cluster_admin_token }}"
+    validate_certs: "{{ openshift_validate_certs | default(false) }}"
+    api_version: v1
+    kind: Node
+    label_selectors: "{{ worker_node_label_selectors }}"
+  register: r_worker_nodes
+  until: >-
+    r_worker_nodes.resources | default([])
+    | json_query("[?status.conditions[?type=='Ready' && status=='True']]")
+    | length >= expected_worker_count | int
+  retries: "{{ worker_scale_retries }}"
+  delay: "{{ worker_scale_delay }}"
+
+# Get final node status
+- name: Get final worker node status
+  kubernetes.core.k8s_info:
+    host: "{{ openshift_api_url }}"
+    api_key: "{{ openshift_cluster_admin_token }}"
+    validate_certs: "{{ openshift_validate_certs | default(false) }}"
+    api_version: v1
+    kind: Node
+    label_selectors: "{{ worker_node_label_selectors }}"
+  register: r_worker_nodes_final
+
+- name: Count Ready worker nodes
+  ansible.builtin.set_fact:
+    final_ready_worker_count: >-
+      {{
+        r_worker_nodes_final.resources
+        | json_query("[?status.conditions[?type=='Ready' && status=='True']]")
+        | length
+      }}
+
+- name: Display final worker node status
+  ansible.builtin.debug:
+    msg:
+      - "Worker nodes Ready: {{ final_ready_worker_count }}/{{ expected_worker_count }}"
+      - "Node names: {{ r_worker_nodes_final.resources | map(attribute='metadata.name') | list }}"
+
+# Without this check an exhausted CSR retry budget would end the role
+# successfully even though nodes are still missing.
+- name: Fail if not enough worker nodes are Ready
+  ansible.builtin.assert:
+    that:
+      - final_ready_worker_count | int >= expected_worker_count | int
+    fail_msg: >-
+      Only {{ final_ready_worker_count }} of {{ expected_worker_count }}
+      worker nodes are Ready