Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions roles/scale_ocp_workers/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# scale_ocp_workers

Scales OpenShift worker nodes on AWS by managing MachineSets.

## Usage

Scale to 3 workers using the existing instance type:

```yaml
- name: Scale OCP workers
  ansible.builtin.include_role:
    name: agnosticd.cloud_provider_aws.scale_ocp_workers
  vars:
    worker_instance_count: 3
```

Scale to 5 workers with a different instance type:

```yaml
- name: Scale OCP workers with custom instance type
  ansible.builtin.include_role:
    name: agnosticd.cloud_provider_aws.scale_ocp_workers
  vars:
    worker_instance_count: 5
    worker_instance_type: m5.2xlarge
    worker_machineset_suffix: compute
```

## Variables

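See [defaults/main.yml](defaults/main.yml) for all variables and their defaults. The OpenShift API connection variables (`openshift_api_url` and `openshift_cluster_admin_token`) have no defaults and must be supplied by the caller.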
32 changes: 32 additions & 0 deletions roles/scale_ocp_workers/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
# OpenShift API connection settings (required; no defaults are defined here).
# The tasks fall back to `false` for openshift_validate_certs when it is unset.
# openshift_api_url: https://api.cluster.example.com:6443
# openshift_cluster_admin_token: <token>
# openshift_validate_certs: false

# Total number of worker replicas to scale to.
# The total is split as evenly as possible across the worker MachineSets,
# e.g. 5 replicas over 3 MachineSets gives [2, 2, 1].
# WARNING: with the default of 0 every targeted MachineSet is given 0 replicas.
worker_instance_count: 0

# EC2 instance type for workers.
# Empty, or equal to the instance type of the first worker MachineSet found:
# the existing MachineSets are scaled.
# Anything else: new MachineSets are created with this instance type.
worker_instance_type: ""

# Suffix appended to the source MachineSet name when new MachineSets are
# created; the role fails if it is empty in that case.
# Example: "gpu", "highmem", "compute"
# Results in names like: ocp-xjgsc-worker-us-east-2a-gpu
worker_machineset_suffix: ""

# Timeout settings for waiting for nodes to become Ready.
# NOTE(review): presumably consumed by wait_for_nodes.yml - confirm there.
worker_scale_timeout: 600
worker_scale_retries: 60
worker_scale_delay: 10

# CSR approval settings.
# Intended behaviour: if nodes don't become Ready within
# worker_scale_csr_wait_seconds, the role starts approving pending CSRs.
# worker_scale_csr_retries bounds the number of approval passes and
# worker_scale_csr_delay is the pause (in seconds) between two passes.
worker_scale_approve_csrs: true
worker_scale_csr_wait_seconds: 180
worker_scale_csr_retries: 30
worker_scale_csr_delay: 10
84 changes: 84 additions & 0 deletions roles/scale_ocp_workers/tasks/approve_csrs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
---
# Approve pending CSRs for worker nodes
# This file includes itself recursively until all expected nodes are Ready
# or the retry budget (csr_approval_retries) is exhausted

# Fetch every CertificateSigningRequest object; the result is registered as
# r_all_csrs for the tasks that follow.
- name: Get all CSRs
  kubernetes.core.k8s_info:
    api_version: certificates.k8s.io/v1
    kind: CertificateSigningRequest
    host: "{{ openshift_api_url }}"
    api_key: "{{ openshift_cluster_admin_token }}"
    validate_certs: "{{ openshift_validate_certs | default(false) }}"
  register: r_all_csrs

# Build pending_csrs: CSRs whose status is still empty (no Approved/Denied
# condition recorded) AND that were requested from one of the kubelet signers.
# Without the signer restriction every pending CSR in the cluster would be
# approved, including requests unrelated to node bootstrapping.
# The accepted signers can be overridden with the optional list variable
# worker_scale_csr_signer_names.
- name: Filter for pending CSRs
  vars:
    _node_csr_signers: >-
      {{
        worker_scale_csr_signer_names
        | default([
            'kubernetes.io/kube-apiserver-client-kubelet',
            'kubernetes.io/kubelet-serving'
          ])
      }}
  ansible.builtin.set_fact:
    pending_csrs: >-
      {{
        r_all_csrs.resources
        | selectattr('status', 'equalto', {})
        | selectattr('spec.signerName', 'in', _node_csr_signers)
        | list
      }}

# Report how many CSRs were selected for approval in this pass.
- name: Display pending CSRs
  ansible.builtin.debug:
    msg: >-
      Found {{ pending_csrs | length }} pending CSR(s)

# Submit an "Approved" condition for every CSR in pending_csrs.
# NOTE(review): Kubernetes normally expects CSR approval to go through the
# `/approval` subresource; confirm that this update of the main resource is
# actually honoured by the API server on the targeted cluster version.
- name: Approve pending CSRs
  kubernetes.core.k8s:
    state: present
    api_version: certificates.k8s.io/v1
    kind: CertificateSigningRequest
    name: "{{ item.metadata.name }}"
    definition:
      status:
        conditions:
          - type: Approved
            status: "True"
            reason: AnsibleApproval
            message: "CSR was approved by Ansible scale_ocp_workers role."
    host: "{{ openshift_api_url }}"
    api_key: "{{ openshift_cluster_admin_token }}"
    validate_certs: "{{ openshift_validate_certs | default(false) }}"
  loop: "{{ pending_csrs }}"
  loop_control:
    label: "{{ item.metadata.name }}"
  when: pending_csrs | length > 0

# List Node objects that carry the worker role label and do not carry the
# master role label; registered as r_worker_nodes.
- name: Get current worker nodes
  kubernetes.core.k8s_info:
    api_version: v1
    kind: Node
    label_selectors:
      - "node-role.kubernetes.io/worker"
      - "!node-role.kubernetes.io/master"
    host: "{{ openshift_api_url }}"
    api_key: "{{ openshift_cluster_admin_token }}"
    validate_certs: "{{ openshift_validate_certs | default(false) }}"
  register: r_worker_nodes

# Count the nodes in r_worker_nodes that have a condition with
# type == 'Ready' and status == 'True'.
# Implemented with built-in Jinja filters only, so the task does not require
# the json_query filter (community.general) or the jmespath Python library.
# How it works: each node's condition list is narrowed to its matching Ready
# conditions; nodes whose narrowed list is non-empty are kept by `select`.
- name: Count Ready worker nodes
  ansible.builtin.set_fact:
    ready_worker_count: >-
      {{
        r_worker_nodes.resources
        | selectattr('status.conditions', 'defined')
        | map(attribute='status.conditions')
        | map('selectattr', 'type', 'equalto', 'Ready')
        | map('selectattr', 'status', 'equalto', 'True')
        | map('list')
        | select
        | list
        | length
      }}

# Progress report: Ready workers versus expected_worker_count, which is
# expected to be set by the caller of this file.
- name: Display current Ready count
  ansible.builtin.debug:
    msg: >-
      Ready workers: {{ ready_worker_count }}/{{ expected_worker_count }}

# Run another approval pass while fewer workers are Ready than expected and
# the retry budget is not used up. The budget starts at
# worker_scale_csr_retries and is decremented once per pass.
# NOTE(review): csr_approval_retries is stored with set_fact and is never
# reset in this file; confirm the caller resets it if this file can be run
# more than once for the same host.
- name: Recursively approve CSRs if nodes are not Ready yet
  block:
    - name: Wait before checking again
      ansible.builtin.pause:
        seconds: "{{ worker_scale_csr_delay }}"

    - name: Decrement retry counter
      ansible.builtin.set_fact:
        csr_approval_retries: "{{ (csr_approval_retries | default(worker_scale_csr_retries) | int) - 1 }}"

    - name: Recursively include CSR approval
      ansible.builtin.include_tasks: approve_csrs.yml
  when:
    - (ready_worker_count | int) < (expected_worker_count | int)
    - (csr_approval_retries | default(worker_scale_csr_retries) | int) > 0
96 changes: 96 additions & 0 deletions roles/scale_ocp_workers/tasks/create_new_machinesets.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
---
# Create new MachineSets with a different instance type
# Clones existing MachineSet specs and updates instance type and naming

# Abort early when no suffix was given: the new MachineSet names are built as
# "<source name>-<suffix>", so an empty suffix is not usable.
- name: Validate worker_machineset_suffix is provided
  when: worker_machineset_suffix | default('') | length < 1
  ansible.builtin.fail:
    msg: "worker_machineset_suffix is required when creating MachineSets with a different instance type"

# Record how many source MachineSets the replicas will be spread over.
- name: Calculate round-robin distribution
  ansible.builtin.set_fact:
    machineset_count: "{{ existing_machinesets | count }}"

# Split worker_instance_count over machineset_count slots as evenly as
# possible: the first (total % count) slots get one replica more than the
# rest. The result is a list with one entry per source MachineSet.
- name: Build replica distribution list
  ansible.builtin.set_fact:
    replica_distribution: >-
      {%- set wanted = worker_instance_count | int -%}
      {%- set slots = machineset_count | int -%}
      {%- set floor_share = wanted // slots -%}
      {%- set extras = wanted % slots -%}
      {{ [floor_share + 1] * extras + [floor_share] * (slots - extras) }}

# Print, for each source MachineSet, the name, replica count and instance
# type of the MachineSet that is about to be created from it.
- name: Display new MachineSet plan
  loop: "{{ existing_machinesets | zip(replica_distribution) | list }}"
  loop_control:
    label: "{{ item.0.metadata.name }}"
  ansible.builtin.debug:
    msg: >-
      Creating
      {{ item.0.metadata.name }}-{{ worker_machineset_suffix }}
      with {{ item.1 }} replicas
      ({{ worker_instance_type }})

# Create one MachineSet per source MachineSet, named
# "<source name>-<worker_machineset_suffix>", with the replica count taken
# from replica_distribution and instanceType set to worker_instance_type.
# AMI, block devices, IAM profile, placement, security groups, subnet and
# tags are copied from the source MachineSet (item.0).
# The credentials and user-data secret names are also copied from the source
# so clusters that do not use the stock secret names are cloned faithfully;
# the previously hard-coded names remain as fallbacks when the source does
# not define them.
- name: Create new MachineSets
  kubernetes.core.k8s:
    host: "{{ openshift_api_url }}"
    api_key: "{{ openshift_cluster_admin_token }}"
    validate_certs: "{{ openshift_validate_certs | default(false) }}"
    api_version: machine.openshift.io/v1beta1
    kind: MachineSet
    namespace: openshift-machine-api
    name: "{{ item.0.metadata.name }}-{{ worker_machineset_suffix }}"
    state: present
    definition:
      metadata:
        labels:
          machine.openshift.io/cluster-api-cluster: "{{ cluster_infrastructure_name }}"
      spec:
        replicas: "{{ item.1 | int }}"
        selector:
          matchLabels:
            machine.openshift.io/cluster-api-cluster: "{{ cluster_infrastructure_name }}"
            machine.openshift.io/cluster-api-machineset: "{{ item.0.metadata.name }}-{{ worker_machineset_suffix }}"
        template:
          metadata:
            labels:
              machine.openshift.io/cluster-api-cluster: "{{ cluster_infrastructure_name }}"
              machine.openshift.io/cluster-api-machine-role: worker
              machine.openshift.io/cluster-api-machine-type: worker
              machine.openshift.io/cluster-api-machineset: "{{ item.0.metadata.name }}-{{ worker_machineset_suffix }}"
          spec:
            lifecycleHooks: {}
            metadata: {}
            providerSpec:
              value:
                apiVersion: machine.openshift.io/v1beta1
                kind: AWSMachineProviderConfig
                ami:
                  id: "{{ item.0.spec.template.spec.providerSpec.value.ami.id }}"
                blockDevices: "{{ item.0.spec.template.spec.providerSpec.value.blockDevices }}"
                credentialsSecret:
                  # Cloned from the source; falls back to the stock name.
                  name: "{{ item.0.spec.template.spec.providerSpec.value.credentialsSecret.name | default('aws-cloud-credentials') }}"
                deviceIndex: 0
                iamInstanceProfile:
                  id: "{{ item.0.spec.template.spec.providerSpec.value.iamInstanceProfile.id }}"
                instanceType: "{{ worker_instance_type }}"
                metadata:
                  creationTimestamp: null
                metadataServiceOptions: {}
                placement:
                  availabilityZone: "{{ item.0.spec.template.spec.providerSpec.value.placement.availabilityZone }}"
                  region: "{{ item.0.spec.template.spec.providerSpec.value.placement.region }}"
                securityGroups: "{{ item.0.spec.template.spec.providerSpec.value.securityGroups }}"
                subnet: "{{ item.0.spec.template.spec.providerSpec.value.subnet }}"
                tags: "{{ item.0.spec.template.spec.providerSpec.value.tags }}"
                userDataSecret:
                  # Cloned from the source; falls back to the stock name.
                  name: "{{ item.0.spec.template.spec.providerSpec.value.userDataSecret.name | default('worker-user-data') }}"
  loop: "{{ existing_machinesets | zip(replica_distribution) | list }}"
  loop_control:
    label: "{{ item.0.metadata.name }}-{{ worker_machineset_suffix }}"
80 changes: 80 additions & 0 deletions roles/scale_ocp_workers/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
---
# Role entry point.
# 1. Reads all MachineSets and keeps those whose machine template carries the
#    worker machine-role label.
# 2. Derives the current instance type and cluster infrastructure name from
#    the first worker MachineSet.
# 3. Either scales the existing MachineSets or creates new ones, depending on
#    whether worker_instance_type is set and differs from the current type.
# 4. Waits for the nodes to become Ready unless worker_instance_count is 0.
# Connection parameters for the k8s modules in this block come from
# module_defaults.
# NOTE(review): confirm that `group/k8s` resolves on the targeted ansible-core
# version; the fully qualified form is `group/kubernetes.core.k8s`.
- name: Scale OCP workers on AWS
  module_defaults:
    group/k8s:
      host: "{{ openshift_api_url }}"
      api_key: "{{ openshift_cluster_admin_token }}"
      validate_certs: "{{ openshift_validate_certs | default(false) }}"
  block:

    - name: Get all MachineSets
      kubernetes.core.k8s_info:
        api_version: machine.openshift.io/v1beta1
        kind: MachineSet
        namespace: openshift-machine-api
      register: r_all_machinesets

    # Keep MachineSets whose template labels mark the machines as workers.
    - name: Filter worker MachineSets
      ansible.builtin.set_fact:
        r_machinesets: >-
          {%- set worker_machinesets = [] -%}
          {%- for ms in r_all_machinesets.resources -%}
          {%- set labels = ms.spec.template.metadata.labels | default({}) -%}
          {%- if labels.get('machine.openshift.io/cluster-api-machine-role') == 'worker' -%}
          {%- set _ = worker_machinesets.append(ms) -%}
          {%- endif -%}
          {%- endfor -%}
          {{ worker_machinesets }}

    - name: Fail if no worker MachineSets found
      ansible.builtin.fail:
        msg: "No worker MachineSets found in openshift-machine-api namespace"
      when: r_machinesets | length == 0

    # The first worker MachineSet is used as the reference for the current
    # instance type and for the cluster infrastructure name.
    - name: Set MachineSet facts
      ansible.builtin.set_fact:
        existing_machinesets: "{{ r_machinesets }}"
        existing_instance_type: "{{ r_machinesets[0].spec.template.spec.providerSpec.value.instanceType }}"
        cluster_infrastructure_name: "{{ r_machinesets[0].metadata.labels['machine.openshift.io/cluster-api-cluster'] }}"

    - name: Get current worker node count
      kubernetes.core.k8s_info:
        api_version: v1
        kind: Node
        label_selectors:
          - "node-role.kubernetes.io/worker"
      register: r_worker_nodes

    - name: Set current worker count
      ansible.builtin.set_fact:
        current_worker_count: "{{ r_worker_nodes.resources | length }}"

    # default(..., true) is required here: the role default for
    # worker_instance_type is an empty string, which is "defined", so a plain
    # default() would never substitute the placeholder text.
    - name: Display current state
      ansible.builtin.debug:
        msg:
          - "Cluster: {{ cluster_infrastructure_name }}"
          - "Current workers: {{ current_worker_count }}"
          - "Desired workers: {{ worker_instance_count }}"
          - "Existing instance type: {{ existing_instance_type }}"
          - "Requested instance type: {{ worker_instance_type | default('(use existing)', true) }}"
          - "Available MachineSets: {{ existing_machinesets | map(attribute='metadata.name') | list }}"

    # New MachineSets are needed only when an instance type was requested and
    # it differs from the current one. default('', true) also treats a null
    # value as "not requested" instead of failing on `length`.
    - name: Determine if we need new MachineSets
      ansible.builtin.set_fact:
        needs_new_machinesets: >-
          {{
            (worker_instance_type | default('', true) | length > 0)
            and (worker_instance_type != existing_instance_type)
          }}

    - name: Scale existing MachineSets
      when: not (needs_new_machinesets | bool)
      ansible.builtin.include_tasks: scale_existing_machinesets.yml

    - name: Create new MachineSets with different instance type
      when: needs_new_machinesets | bool
      ansible.builtin.include_tasks: create_new_machinesets.yml

    # Nothing to wait for when the workers were scaled down to zero.
    - name: Wait for worker nodes to be Ready
      when: worker_instance_count | int > 0
      ansible.builtin.include_tasks: wait_for_nodes.yml
48 changes: 48 additions & 0 deletions roles/scale_ocp_workers/tasks/scale_existing_machinesets.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
---
# Scale existing MachineSets using round-robin distribution across AZs
# Example: 5 workers across 3 AZs = [2, 2, 1]

# Record how many MachineSets the replicas will be spread over.
- name: Calculate round-robin distribution
  ansible.builtin.set_fact:
    machineset_count: "{{ existing_machinesets | count }}"

# Split worker_instance_count over machineset_count slots as evenly as
# possible: the first (total % count) slots get one replica more than the
# rest, e.g. 5 over 3 gives [2, 2, 1].
- name: Build replica distribution list
  ansible.builtin.set_fact:
    replica_distribution: >-
      {%- set wanted = worker_instance_count | int -%}
      {%- set slots = machineset_count | int -%}
      {%- set floor_share = wanted // slots -%}
      {%- set extras = wanted % slots -%}
      {{ [floor_share + 1] * extras + [floor_share] * (slots - extras) }}

# Print the target replica count chosen for each existing MachineSet.
- name: Display scaling plan
  loop: "{{ existing_machinesets | zip(replica_distribution) | list }}"
  loop_control:
    label: "{{ item.0.metadata.name }}"
  ansible.builtin.debug:
    msg: >-
      Scaling {{ item.0.metadata.name }} to {{ item.1 }} replicas

# Apply the planned replica count to each existing MachineSet by submitting a
# definition that only contains spec.replicas.
- name: Scale MachineSets
  kubernetes.core.k8s:
    state: present
    api_version: machine.openshift.io/v1beta1
    kind: MachineSet
    namespace: openshift-machine-api
    name: "{{ item.0.metadata.name }}"
    definition:
      spec:
        replicas: "{{ item.1 | int }}"
    host: "{{ openshift_api_url }}"
    api_key: "{{ openshift_cluster_admin_token }}"
    validate_certs: "{{ openshift_validate_certs | default(false) }}"
  loop: "{{ existing_machinesets | zip(replica_distribution) | list }}"
  loop_control:
    label: "{{ item.0.metadata.name }} -> {{ item.1 }} replicas"
Loading