From 826f9c51c895748d6682606686bb07fc505699b8 Mon Sep 17 00:00:00 2001 From: treddy08 Date: Tue, 19 May 2026 12:52:05 +1000 Subject: [PATCH 1/5] Add configurable dashboard replica scaling for OpenShift AI Add support for scaling the rhods-dashboard deployment with automatic management state transition to prevent operator reconciliation. New variables: - ocp4_workload_openshift_ai_scale_dashboard: Enable/disable scaling (default: false) - ocp4_workload_openshift_ai_dashboard_replicas: Replica count (default: 2) When scaling is enabled: 1. Dashboard deployment is scaled to specified replica count 2. Dashboard component in DataScienceCluster CR is set to Unmanaged This is particularly useful for SNO clusters where 1 replica is preferred to avoid CPU resource constraints. --- .../defaults/main.yml | 9 ++++++++ .../tasks/workload.yml | 23 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/roles/ocp4_workload_openshift_ai/defaults/main.yml b/roles/ocp4_workload_openshift_ai/defaults/main.yml index 4b0af14..4d4530f 100644 --- a/roles/ocp4_workload_openshift_ai/defaults/main.yml +++ b/roles/ocp4_workload_openshift_ai/defaults/main.yml @@ -61,3 +61,12 @@ ocp4_workload_openshift_ai_catalog_snapshot_image: quay.io/rhpds/olm_snapshot_re # Catalog snapshot image tag ocp4_workload_openshift_ai_catalog_snapshot_image_tag: v4.19_2025_07_21 + +# -------------------------------- +# Dashboard Replica Scaling +# -------------------------------- +# Set ocp4_workload_openshift_ai_scale_dashboard to true to override operator default +# Useful for SNO clusters where 1 replica is preferred +# When enabled, dashboard component will be set to Unmanaged to prevent operator reconciliation +ocp4_workload_openshift_ai_scale_dashboard: false +ocp4_workload_openshift_ai_dashboard_replicas: 2 diff --git a/roles/ocp4_workload_openshift_ai/tasks/workload.yml b/roles/ocp4_workload_openshift_ai/tasks/workload.yml index 50baeae..3c2f401 100644 --- 
a/roles/ocp4_workload_openshift_ai/tasks/workload.yml +++ b/roles/ocp4_workload_openshift_ai/tasks/workload.yml @@ -64,6 +64,29 @@ retries: 30 delay: 10 +- name: Scale dashboard deployment + when: ocp4_workload_openshift_ai_scale_dashboard | default(false) | bool + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: rhods-dashboard + namespace: redhat-ods-applications + definition: + spec: + replicas: "{{ ocp4_workload_openshift_ai_dashboard_replicas }}" + +- name: Set dashboard component to Unmanaged after scaling + when: ocp4_workload_openshift_ai_scale_dashboard | default(false) | bool + kubernetes.core.k8s: + api_version: datasciencecluster.opendatahub.io/{{ 'v2' if _ocp4_workload_openshift_ai_version_3 else 'v1' }} + kind: DataScienceCluster + name: default-dsc + definition: + spec: + components: + dashboard: + managementState: Unmanaged + - name: Create OpenShift AI Dashboard Route when: _ocp4_workload_openshift_ai_version_3 | bool kubernetes.core.k8s: From 3275971911afbbd747f76302a43b08375ad8ddc3 Mon Sep 17 00:00:00 2001 From: treddy08 Date: Tue, 19 May 2026 12:54:39 +1000 Subject: [PATCH 2/5] Fix task ordering for dashboard scaling Move dashboard scaling tasks to execute BEFORE waiting for DataScienceCluster ready state. This prevents the playbook from hanging when dashboard pods are stuck pending due to resource constraints. Execution order: 1. Create DataScienceCluster (operator creates dashboard with 2 replicas) 2. Wait for dashboard deployment to exist (if scaling enabled) 3. Scale dashboard deployment to desired replica count 4. Set dashboard component to Unmanaged 5. Wait for DataScienceCluster to reach Ready state This ensures SNO clusters with limited resources don't get stuck waiting for a dashboard deployment that can never reach 2/2 ready. 
--- .../tasks/workload.yml | 34 +++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/roles/ocp4_workload_openshift_ai/tasks/workload.yml b/roles/ocp4_workload_openshift_ai/tasks/workload.yml index 3c2f401..85240f8 100644 --- a/roles/ocp4_workload_openshift_ai/tasks/workload.yml +++ b/roles/ocp4_workload_openshift_ai/tasks/workload.yml @@ -49,18 +49,17 @@ retries: 10 delay: 30 -- name: Check if Data Science Cluster is ready +- name: Wait for dashboard deployment to be created + when: ocp4_workload_openshift_ai_scale_dashboard | default(false) | bool kubernetes.core.k8s_info: - api_version: datasciencecluster.opendatahub.io/{{ 'v2' if _ocp4_workload_openshift_ai_version_3 else 'v1' }} - kind: DataScienceCluster - name: default-dsc - register: r_ds_cluster + api_version: apps/v1 + kind: Deployment + name: rhods-dashboard + namespace: redhat-ods-applications + register: r_dashboard_deployment until: - - r_ds_cluster.resources is defined - - r_ds_cluster.resources | length > 0 - - r_ds_cluster.resources[0].status is defined - - r_ds_cluster.resources[0].status.phase is defined - - r_ds_cluster.resources[0].status.phase == 'Ready' + - r_dashboard_deployment.resources is defined + - r_dashboard_deployment.resources | length > 0 retries: 30 delay: 10 @@ -87,6 +86,21 @@ dashboard: managementState: Unmanaged +- name: Check if Data Science Cluster is ready + kubernetes.core.k8s_info: + api_version: datasciencecluster.opendatahub.io/{{ 'v2' if _ocp4_workload_openshift_ai_version_3 else 'v1' }} + kind: DataScienceCluster + name: default-dsc + register: r_ds_cluster + until: + - r_ds_cluster.resources is defined + - r_ds_cluster.resources | length > 0 + - r_ds_cluster.resources[0].status is defined + - r_ds_cluster.resources[0].status.phase is defined + - r_ds_cluster.resources[0].status.phase == 'Ready' + retries: 30 + delay: 10 + - name: Create OpenShift AI Dashboard Route when: _ocp4_workload_openshift_ai_version_3 | bool kubernetes.core.k8s: 
From fe00ac9b0e67d2c3d09c57f4172bce246597b4fc Mon Sep 17 00:00:00 2001 From: treddy08 Date: Tue, 19 May 2026 12:58:09 +1000 Subject: [PATCH 3/5] Refactor dashboard scaling tasks into a block Consolidate the three dashboard scaling tasks into a single block with one when condition for cleaner code organization. Changes: - Wrap tasks in a block with descriptive name - Move when condition from individual tasks to block level - No functional changes, purely organizational improvement --- .../tasks/workload.yml | 66 +++++++++---------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/roles/ocp4_workload_openshift_ai/tasks/workload.yml b/roles/ocp4_workload_openshift_ai/tasks/workload.yml index 85240f8..cc01c31 100644 --- a/roles/ocp4_workload_openshift_ai/tasks/workload.yml +++ b/roles/ocp4_workload_openshift_ai/tasks/workload.yml @@ -49,42 +49,42 @@ retries: 10 delay: 30 -- name: Wait for dashboard deployment to be created +- name: Scale dashboard deployment for resource-constrained clusters when: ocp4_workload_openshift_ai_scale_dashboard | default(false) | bool - kubernetes.core.k8s_info: - api_version: apps/v1 - kind: Deployment - name: rhods-dashboard - namespace: redhat-ods-applications - register: r_dashboard_deployment - until: - - r_dashboard_deployment.resources is defined - - r_dashboard_deployment.resources | length > 0 - retries: 30 - delay: 10 + block: + - name: Wait for dashboard deployment to be created + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: Deployment + name: rhods-dashboard + namespace: redhat-ods-applications + register: r_dashboard_deployment + until: + - r_dashboard_deployment.resources is defined + - r_dashboard_deployment.resources | length > 0 + retries: 30 + delay: 10 -- name: Scale dashboard deployment - when: ocp4_workload_openshift_ai_scale_dashboard | default(false) | bool - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: rhods-dashboard - namespace: redhat-ods-applications - 
definition: - spec: - replicas: "{{ ocp4_workload_openshift_ai_dashboard_replicas }}" + - name: Scale dashboard deployment + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: rhods-dashboard + namespace: redhat-ods-applications + definition: + spec: + replicas: "{{ ocp4_workload_openshift_ai_dashboard_replicas }}" -- name: Set dashboard component to Unmanaged after scaling - when: ocp4_workload_openshift_ai_scale_dashboard | default(false) | bool - kubernetes.core.k8s: - api_version: datasciencecluster.opendatahub.io/{{ 'v2' if _ocp4_workload_openshift_ai_version_3 else 'v1' }} - kind: DataScienceCluster - name: default-dsc - definition: - spec: - components: - dashboard: - managementState: Unmanaged + - name: Set dashboard component to Unmanaged after scaling + kubernetes.core.k8s: + api_version: datasciencecluster.opendatahub.io/{{ 'v2' if _ocp4_workload_openshift_ai_version_3 else 'v1' }} + kind: DataScienceCluster + name: default-dsc + definition: + spec: + components: + dashboard: + managementState: Unmanaged - name: Check if Data Science Cluster is ready kubernetes.core.k8s_info: From 4aab087cabdba50a6242716055d176ed6463626b Mon Sep 17 00:00:00 2001 From: treddy08 Date: Tue, 19 May 2026 13:03:30 +1000 Subject: [PATCH 4/5] Add idempotency check for dashboard replica scaling Only scale the dashboard deployment and set it to Unmanaged if the current replica count differs from the desired replica count. Changes: - Capture current replica count from dashboard deployment - Compare current vs desired replica count - Only execute scaling and management state change if different This prevents unnecessary changes when the deployment already has the desired replica count, improving playbook idempotency. 
--- roles/ocp4_workload_openshift_ai/tasks/workload.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/roles/ocp4_workload_openshift_ai/tasks/workload.yml b/roles/ocp4_workload_openshift_ai/tasks/workload.yml index cc01c31..d0979fc 100644 --- a/roles/ocp4_workload_openshift_ai/tasks/workload.yml +++ b/roles/ocp4_workload_openshift_ai/tasks/workload.yml @@ -65,7 +65,12 @@ retries: 30 delay: 10 + - name: Get current dashboard replica count + ansible.builtin.set_fact: + _ocp4_workload_openshift_ai_current_dashboard_replicas: "{{ r_dashboard_deployment.resources[0].spec.replicas }}" + - name: Scale dashboard deployment + when: _ocp4_workload_openshift_ai_current_dashboard_replicas | int != ocp4_workload_openshift_ai_dashboard_replicas | int kubernetes.core.k8s: api_version: apps/v1 kind: Deployment @@ -76,6 +81,7 @@ replicas: "{{ ocp4_workload_openshift_ai_dashboard_replicas }}" - name: Set dashboard component to Unmanaged after scaling + when: _ocp4_workload_openshift_ai_current_dashboard_replicas | int != ocp4_workload_openshift_ai_dashboard_replicas | int kubernetes.core.k8s: api_version: datasciencecluster.opendatahub.io/{{ 'v2' if _ocp4_workload_openshift_ai_version_3 else 'v1' }} kind: DataScienceCluster From 4d21f5491b66417193f14a52818d698295361d5a Mon Sep 17 00:00:00 2001 From: treddy08 Date: Tue, 19 May 2026 13:06:09 +1000 Subject: [PATCH 5/5] Simplify idempotency check using registered variable Remove unnecessary set_fact task and use the registered deployment variable directly in when conditions with list syntax. Changes: - Remove intermediate variable creation - Use r_dashboard_deployment.resources[0].spec.replicas directly - Convert when conditions to list format Cleaner code with no functional changes. 
--- roles/ocp4_workload_openshift_ai/tasks/workload.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/roles/ocp4_workload_openshift_ai/tasks/workload.yml b/roles/ocp4_workload_openshift_ai/tasks/workload.yml index d0979fc..7468271 100644 --- a/roles/ocp4_workload_openshift_ai/tasks/workload.yml +++ b/roles/ocp4_workload_openshift_ai/tasks/workload.yml @@ -65,12 +65,9 @@ retries: 30 delay: 10 - - name: Get current dashboard replica count - ansible.builtin.set_fact: - _ocp4_workload_openshift_ai_current_dashboard_replicas: "{{ r_dashboard_deployment.resources[0].spec.replicas }}" - - name: Scale dashboard deployment - when: _ocp4_workload_openshift_ai_current_dashboard_replicas | int != ocp4_workload_openshift_ai_dashboard_replicas | int + when: + - r_dashboard_deployment.resources[0].spec.replicas | int != ocp4_workload_openshift_ai_dashboard_replicas | int kubernetes.core.k8s: api_version: apps/v1 kind: Deployment @@ -81,7 +78,8 @@ replicas: "{{ ocp4_workload_openshift_ai_dashboard_replicas }}" - name: Set dashboard component to Unmanaged after scaling - when: _ocp4_workload_openshift_ai_current_dashboard_replicas | int != ocp4_workload_openshift_ai_dashboard_replicas | int + when: + - r_dashboard_deployment.resources[0].spec.replicas | int != ocp4_workload_openshift_ai_dashboard_replicas | int kubernetes.core.k8s: api_version: datasciencecluster.opendatahub.io/{{ 'v2' if _ocp4_workload_openshift_ai_version_3 else 'v1' }} kind: DataScienceCluster