From 9f7954d671b8c166192001bd25134c3edf2fa135 Mon Sep 17 00:00:00 2001 From: cfsnate Date: Mon, 22 Sep 2025 16:48:14 +0000 Subject: [PATCH] IDEA Release 25.09.1 --- CHANGELOG.md | 16 ++++++++++ IDEA_VERSION.txt | 2 +- idea-admin-windows.ps1 | 2 +- idea-admin.sh | 2 +- .../src/ideaadministrator_meta/__init__.py | 2 +- .../src/ideaclustermanager_meta/__init__.py | 2 +- source/idea/idea-cluster-manager/webapp/.env | 2 +- .../idea-cluster-manager/webapp/package.json | 2 +- .../src/ideadatamodel_meta/__init__.py | 2 +- .../node_monitor/node_house_keeper.py | 29 ++++++++++++++++++- .../src/ideascheduler_meta/__init__.py | 2 +- .../idea-sdk/src/ideasdk_meta/__init__.py | 2 +- .../__init__.py | 2 +- 13 files changed, 55 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 304be5d5..23d1ee03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Calendar Versioning](https://calver.org/). +## [25.09.1] - 2025-09-22 + +**Upgrade Instructions:** +* It's recommended to perform a full cluster upgrade as Base AMIs and other settings have been updated +* If you are already on `25.09.0`, only an update to the `scheduler` module is required. 
+```bash +./idea-admin.sh upgrade-cluster --aws-region $IDEA_AWS_REGION --cluster-name $IDEA_CLUSTER_NAME +``` +([Upgrade Documentation](https://docs.idea-hpc.com/first-time-users/cluster-operations/update-idea-cluster/upgrade-cluster)) + +### **🐛 Bug Fixes** +* **Node House Keeper**: Fixed completed jobs not being properly removed and cleaned up + +### **🔧 Improvements** +* **Node House Keeper**: Added additional debug logging + ## [25.09.0] - 2025-09-03 **Upgrade Instructions:** diff --git a/IDEA_VERSION.txt b/IDEA_VERSION.txt index 4056ae65..739af04d 100644 --- a/IDEA_VERSION.txt +++ b/IDEA_VERSION.txt @@ -1 +1 @@ -25.09.0 +25.09.1 diff --git a/idea-admin-windows.ps1 b/idea-admin-windows.ps1 index 5dd70d3c..a953720f 100755 --- a/idea-admin-windows.ps1 +++ b/idea-admin-windows.ps1 @@ -38,7 +38,7 @@ function Verify-Command($type,$message,$command) { $IDEADevMode = if ($Env:IDEA_DEV_MODE) {$Env:IDEA_DEV_MODE} else {""} $VirtualEnv = if ($Env:VIRTUAL_ENV) {$Env:VIRTUAL_ENV} else {""} $ScriptDir = $PSScriptRoot -$IDEARevision = if ($Env:IDEA_REVISION) {$Env:IDEA_REVISION} else {"v25.09.0"} +$IDEARevision = if ($Env:IDEA_REVISION) {$Env:IDEA_REVISION} else {"v25.09.1"} $IDEADockerRepo = "public.ecr.aws/s5o2b4m0" $DocumentationError = "https://docs.idea-hpc.com" $AWSProfile = if ($Env:AWS_PROFILE) {$Env:AWS_PROFILE} else {"default"} diff --git a/idea-admin.sh b/idea-admin.sh index ef491b73..9c8a9847 100755 --- a/idea-admin.sh +++ b/idea-admin.sh @@ -28,7 +28,7 @@ # * IDEA_DEV_MODE - Set to "true" if you are working with IDEA sources SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -IDEA_REVISION=${IDEA_REVISION:-"v25.09.0"} +IDEA_REVISION=${IDEA_REVISION:-"v25.09.1"} IDEA_DOCKER_REPO=${IDEA_DOCKER_REPO:-"public.ecr.aws/s5o2b4m0/idea-administrator"} IDEA_ECR_CREDS_RESET=${IDEA_ECR_CREDS_RESET:-"true"} IDEA_ADMIN_AWS_CREDENTIAL_PROVIDER=${IDEA_ADMIN_AWS_CREDENTIAL_PROVIDER:=""} diff --git 
a/source/idea/idea-administrator/src/ideaadministrator_meta/__init__.py b/source/idea/idea-administrator/src/ideaadministrator_meta/__init__.py index 129d71d8..0beefbf5 100644 --- a/source/idea/idea-administrator/src/ideaadministrator_meta/__init__.py +++ b/source/idea/idea-administrator/src/ideaadministrator_meta/__init__.py @@ -12,4 +12,4 @@ # pkg config for idea-admin. no dependencies. __name__ = 'idea-administrator' -__version__ = '25.09.0' +__version__ = '25.09.1' diff --git a/source/idea/idea-cluster-manager/src/ideaclustermanager_meta/__init__.py b/source/idea/idea-cluster-manager/src/ideaclustermanager_meta/__init__.py index 24601373..f32fd585 100644 --- a/source/idea/idea-cluster-manager/src/ideaclustermanager_meta/__init__.py +++ b/source/idea/idea-cluster-manager/src/ideaclustermanager_meta/__init__.py @@ -10,4 +10,4 @@ # and limitations under the License. __name__ = 'idea-cluster-manager' -__version__ = '25.09.0' +__version__ = '25.09.1' diff --git a/source/idea/idea-cluster-manager/webapp/.env b/source/idea/idea-cluster-manager/webapp/.env index 6feafff8..4a9d755e 100644 --- a/source/idea/idea-cluster-manager/webapp/.env +++ b/source/idea/idea-cluster-manager/webapp/.env @@ -1,4 +1,4 @@ REACT_APP_IDEA_HTTP_ENDPOINT="http://localhost:8080" REACT_APP_IDEA_ALB_ENDPOINT="http://localhost:8080" REACT_APP_IDEA_HTTP_API_SUFFIX="/api/v1" -REACT_APP_IDEA_RELEASE_VERSION="25.09.0" +REACT_APP_IDEA_RELEASE_VERSION="25.09.1" diff --git a/source/idea/idea-cluster-manager/webapp/package.json b/source/idea/idea-cluster-manager/webapp/package.json index 40b26aef..4f303707 100644 --- a/source/idea/idea-cluster-manager/webapp/package.json +++ b/source/idea/idea-cluster-manager/webapp/package.json @@ -1,6 +1,6 @@ { "name": "web-portal", - "version": "25.09.0", + "version": "25.09.1", "private": true, "dependencies": { "@aperturerobotics/chonky": "^0.3.1", diff --git a/source/idea/idea-data-model/src/ideadatamodel_meta/__init__.py 
b/source/idea/idea-data-model/src/ideadatamodel_meta/__init__.py index 204621fc..dbe9cb77 100644 --- a/source/idea/idea-data-model/src/ideadatamodel_meta/__init__.py +++ b/source/idea/idea-data-model/src/ideadatamodel_meta/__init__.py @@ -10,4 +10,4 @@ # and limitations under the License. __name__ = 'idea-data-model' -__version__ = '25.09.0' +__version__ = '25.09.1' diff --git a/source/idea/idea-scheduler/src/ideascheduler/app/provisioning/node_monitor/node_house_keeper.py b/source/idea/idea-scheduler/src/ideascheduler/app/provisioning/node_monitor/node_house_keeper.py index 313aeb3e..94a9a1ba 100644 --- a/source/idea/idea-scheduler/src/ideascheduler/app/provisioning/node_monitor/node_house_keeper.py +++ b/source/idea/idea-scheduler/src/ideascheduler/app/provisioning/node_monitor/node_house_keeper.py @@ -246,8 +246,23 @@ def _can_terminate(self, instance: EC2Instance, node: SocaComputeNode) -> bool: if instance.is_soca_ephemeral_capacity: job = self._context.job_cache.get_job(job_id=instance.soca_job_id) - if job is not None: + if job is not None and job.state in ( + SocaJobState.QUEUED, + SocaJobState.RUNNING, + SocaJobState.HELD, + ): + self._logger.debug( + f'{self.log_tag(instance)} cannot terminate - job {instance.soca_job_id} is {job.state}' + ) return False + elif job is not None: + self._logger.debug( + f'{self.log_tag(instance)} can terminate - job {instance.soca_job_id} is {job.state}' + ) + else: + self._logger.debug( + f'{self.log_tag(instance)} can terminate - job {instance.soca_job_id} not in cache' + ) if instance.soca_keep_forever and instance.soca_terminate_when_idle == 0: return False @@ -393,6 +408,9 @@ def pass1_identify_potential_candidates_for_deletion(self): if not self._can_terminate(instance=instance, node=node): continue + self._logger.debug( + f'{self.log_tag(instance)} adding as candidate for deletion' + ) self._add_candidate_for_deletion(instance=instance) def pass2_compute_present_capacities(self): @@ -585,6 +603,9 @@ def 
pass6_finalize_resources_to_delete(self): ref.soca_queue_type, self.log_info(ref, 'Stack', ref.soca_compute_stack), ) + self._logger.debug( + f'{self.log_tag(ref)} marking stack for deletion: {ref.soca_compute_stack}' + ) self.stacks_to_delete.add(ref.soca_compute_stack) else: self.spot_fleet_info[spot_fleet_request_id] = ( @@ -613,6 +634,9 @@ def pass6_finalize_resources_to_delete(self): ref.soca_queue_type, self.log_info(ref, 'Stack', ref.soca_compute_stack), ) + self._logger.debug( + f'{self.log_tag(ref)} marking stack for deletion: {ref.soca_compute_stack}' + ) self.stacks_to_delete.add(ref.soca_compute_stack) else: self.auto_scaling_group_info[auto_scaling_group_name] = ( @@ -877,6 +901,9 @@ def cleanup(self): for stack_name in self.stacks_to_delete: queue_type, info = self.stack_info[stack_name] self._logger.info(f'{info} deleting stack') + self._logger.debug( + f'initiating CloudFormation stack deletion: {stack_name}' + ) self.aws_util.cloudformation_delete_stack(stack_name=stack_name) time.sleep(0.3) self._context.metrics.stacks_deleted(queue_type) diff --git a/source/idea/idea-scheduler/src/ideascheduler_meta/__init__.py b/source/idea/idea-scheduler/src/ideascheduler_meta/__init__.py index c83494c2..e8d5df6c 100644 --- a/source/idea/idea-scheduler/src/ideascheduler_meta/__init__.py +++ b/source/idea/idea-scheduler/src/ideascheduler_meta/__init__.py @@ -12,4 +12,4 @@ # pkgconfig for ideascheduler. no dependencies # noqa __name__ = 'idea-scheduler' -__version__ = '25.09.0' +__version__ = '25.09.1' diff --git a/source/idea/idea-sdk/src/ideasdk_meta/__init__.py b/source/idea/idea-sdk/src/ideasdk_meta/__init__.py index d5debae9..4d0206e4 100644 --- a/source/idea/idea-sdk/src/ideasdk_meta/__init__.py +++ b/source/idea/idea-sdk/src/ideasdk_meta/__init__.py @@ -12,4 +12,4 @@ # pkgconfig for soca-sdk. 
no dependencies # noqa __name__ = 'idea-sdk' -__version__ = '25.09.0' +__version__ = '25.09.1' diff --git a/source/idea/idea-virtual-desktop-controller/src/ideavirtualdesktopcontroller_meta/__init__.py b/source/idea/idea-virtual-desktop-controller/src/ideavirtualdesktopcontroller_meta/__init__.py index e50ed292..7d94cf7f 100644 --- a/source/idea/idea-virtual-desktop-controller/src/ideavirtualdesktopcontroller_meta/__init__.py +++ b/source/idea/idea-virtual-desktop-controller/src/ideavirtualdesktopcontroller_meta/__init__.py @@ -10,4 +10,4 @@ # and limitations under the License. __name__ = 'idea-virtual-desktop-controller' -__version__ = '25.09.0' +__version__ = '25.09.1'