diff --git a/test/scripts/FIXING-VERSION-MISMATCH.md b/test/scripts/FIXING-VERSION-MISMATCH.md new file mode 100644 index 00000000000..3606b88d0cf --- /dev/null +++ b/test/scripts/FIXING-VERSION-MISMATCH.md @@ -0,0 +1,161 @@ +# Fixing OpenShift Version Mismatch in E2E Tests + +## Problem Summary + +**Error**: `Node pool version '4.20.20' must not be greater than Control Plane version '4.20.19'` + +**Failing Tests**: +- Customer should update node pool labels and taints +- Customer should use workload identity via cluster OIDC +- Customer should not perform invalid operations + +## Root Cause + +The version mismatch occurs due to **timing differences in Cincinnati version resolution**: + +1. **Control Plane Version**: Fetched once at test start (cached via `sync.Once`) → got `4.20.19` +2. **Node Pool Version**: Fetched separately (potentially later) → got `4.20.20` +3. Cincinnati released `4.20.20` between the two fetch operations + +### Why This Happens + +See `test/util/framework/deployment_params.go`: + +- `resolveDefaultControlPlaneVersion()`: Caches first result +- Before the fix, `DefaultOpenshiftNodePoolVersionId()` could resolve separately from the control plane version, so: + - A race condition window existed between fetches +- Even after the fix, explicit env vars can still override defaults and cause mismatches + +## Solution + +The fix is the code change in `deployment_params.go`. Alternatively, you can use the provided scripts as a quick workaround; they are explained below. 
+ +## 📋 Environment Variable Synchronization + +#### For Local Development (Bash) + +```bash +# Use the synchronization script +source test/scripts/set-ocp-versions.sh candidate 4.20 +``` + +#### For Local Development (PowerShell - Windows) + +```powershell +# Run the PowerShell script +.\test\scripts\Set-OcpVersions.ps1 -ChannelGroup "candidate" -VersionMinor "4.20" +``` + +#### For CI/CD (Prow, GitHub Actions) + +Add this to your CI pipeline BEFORE running tests: + +```bash +# In your Prow job or GitHub Actions workflow +source test/scripts/sync-ocp-versions-ci.sh +``` + +**Or manually set**: + +```bash +export ARO_HCP_OPENSHIFT_CHANNEL_GROUP="candidate" +export ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP="candidate" +export ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION="4.20.20" # Use same version! +export ARO_HCP_OPENSHIFT_NODEPOOL_VERSION="4.20.20" # Use same version! +``` + +## Scripts Provided + +### 1. `test/scripts/check-channel-groups.sh` +**Purpose**: Diagnostic tool to check current configuration +**Usage**: `./test/scripts/check-channel-groups.sh` +**When**: Run when debugging version mismatch issues + +### 2. `test/scripts/set-ocp-versions.sh` +**Purpose**: Set synchronized versions for local testing (Bash) +**Usage**: `source ./test/scripts/set-ocp-versions.sh [channel] [version]` +**When**: Before running tests locally on Linux/macOS + +### 3. `test/scripts/Set-OcpVersions.ps1` +**Purpose**: Set synchronized versions for local testing (PowerShell) +**Usage**: `.\test\scripts\Set-OcpVersions.ps1 -ChannelGroup candidate -VersionMinor 4.20` +**When**: Before running tests locally on Windows + +### 4. `test/scripts/sync-ocp-versions-ci.sh` +**Purpose**: CI/CD integration - fetches and synchronizes versions once +**Usage**: Add to CI pipeline: `source test/scripts/sync-ocp-versions-ci.sh` +**When**: Integrate into Prow jobs, GitHub Actions, etc. 
+ +## Important Notes + +### Scripts Are NOT Auto-Executed + +❗ **The scripts DO NOT run automatically during test execution.** + +They are **helper tools** you can use: +- **Manually** before running tests locally +- **In CI/CD pipelines** (must be explicitly added to pipeline config) +- **For diagnostics** when troubleshooting version issues + +### The Code Changes ARE Automatic + +✅ **The code changes in `deployment_params.go` ARE automatic** - they execute as part of the normal test framework initialization. + +## Recommended Approach + +### For Immediate Fix (Quick) +1. Set environment variables explicitly: + ```bash + export ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION="4.20.20" + export ARO_HCP_OPENSHIFT_NODEPOOL_VERSION="4.20.20" + ``` +2. Run your tests + +### For Long-Term Fix (Best Practice) +1. The code changes are already applied ✅ +2. Test to verify: + ```bash + # Build the test binary + make -C test + + # Set required environment variables + export CUSTOMER_SUBSCRIPTION= + export LOCATION=uksouth + + # Run integration test suite + ./test/aro-hcp-tests run-suite "integration/parallel" --junit-path="junit.xml" + ``` +3. For CI/CD: Add `sync-ocp-versions-ci.sh` to pipeline + +## Validation Commands + +### Check Configuration +```bash +./test/scripts/check-channel-groups.sh +``` + +### Verify Version Resolution +```bash +# List available test cases to see what's available +./test/aro-hcp-tests list | jq '.[].name' +``` + +### Run Failing Tests +```bash +# Ensure environment is configured +export CUSTOMER_SUBSCRIPTION= +export LOCATION=uksouth + +# Run specific test cases +./test/aro-hcp-tests run-test "Customer should update node pool labels and taints" +./test/aro-hcp-tests run-test "Customer should use workload identity via cluster OIDC" +./test/aro-hcp-tests run-test "Customer should not perform invalid operations" +``` + +## Related Files Modified + +1. ✅ `test/util/framework/deployment_params.go` - Core fix +2. 
✅ `test/scripts/check-channel-groups.sh` - Diagnostic tool +3. ✅ `test/scripts/set-ocp-versions.sh` - Local dev helper (Bash) +4. ✅ `test/scripts/Set-OcpVersions.ps1` - Local dev helper (PowerShell) +5. ✅ `test/scripts/sync-ocp-versions-ci.sh` - CI/CD helper diff --git a/test/scripts/Set-OcpVersions.ps1 b/test/scripts/Set-OcpVersions.ps1 new file mode 100644 index 00000000000..159bd017c68 --- /dev/null +++ b/test/scripts/Set-OcpVersions.ps1 @@ -0,0 +1,133 @@ +# PowerShell script to set synchronized OpenShift versions for E2E tests +# This ensures control plane and node pool use the same version + +param( + [string]$ChannelGroup = "candidate", + [string]$VersionMinor = "4.20" +) + +Write-Host "=== Setting OpenShift Versions for E2E Tests ===" -ForegroundColor Cyan +Write-Host "Channel Group: $ChannelGroup" +Write-Host "Version Minor: $VersionMinor" +Write-Host "" + +# Function to get latest version from OpenShift graph API +function Get-LatestOpenShiftVersion { + param( + [string]$Channel, + [string]$Minor + ) + + $graphUrl = "https://api.openshift.com/api/upgrades_info/v1/graph?channel=${Channel}-${Minor}" + + try { + Write-Host "Fetching latest version from Cincinnati..." -ForegroundColor Gray + $response = Invoke-RestMethod -Uri $graphUrl -Method Get + + if ($response.nodes.Count -eq 0) { + Write-Error "No versions found for channel ${Channel}-${Minor}" + return $null + } + + # Get the latest version using semantic version sorting + # This handles pre-release versions like "4.20.0-0.nightly-..." 
correctly + $versions = $response.nodes.version + + # Try to use SemanticVersion (PowerShell 6+) + $useSemanticVersion = $true + $parsedVersions = @() + + foreach ($ver in $versions) { + try { + $semVer = [System.Management.Automation.SemanticVersion]::new($ver) + $parsedVersions += [PSCustomObject]@{ + Original = $ver + SemVer = $semVer + } + } + catch { + # SemanticVersion not available or version string incompatible + $useSemanticVersion = $false + break + } + } + + if ($useSemanticVersion -and $parsedVersions.Count -gt 0) { + # Use SemanticVersion sorting (PowerShell 6+) + $latestVersion = $parsedVersions | Sort-Object -Property SemVer | Select-Object -Last 1 -ExpandProperty Original + } + else { + # Fallback: Custom semver-aware sorting for PowerShell 5.1 + # Properly handles major.minor.patch and pre-release identifiers + $latestVersion = $versions | Sort-Object -Property { + $v = $_ + + # Parse version: separate base version from pre-release/build metadata + if ($v -match '^(\d+)\.(\d+)\.(\d+)(?:-(.+?))?(?:\+(.+))?$') { + $major = [int]$matches[1] + $minor = [int]$matches[2] + $patch = [int]$matches[3] + $prerelease = $matches[4] # e.g., "0.nightly-2024-05-26" + + # Sort key: major.minor.patch as padded numbers, then pre-release + # Versions without pre-release come AFTER pre-release (semver rule) + $sortKey = "{0:D10}.{1:D10}.{2:D10}" -f $major, $minor, $patch + + if ($prerelease) { + # Has pre-release: append "0" + prerelease to sort before release + $sortKey += ".0.$prerelease" + } + else { + # No pre-release: append "1" to sort after pre-release versions + $sortKey += ".1" + } + + return $sortKey + } + else { + # Fallback for unexpected formats: use original string + Write-Warning "Version '$v' doesn't match expected semver format" + return "0000000000.0000000000.0000000000.9.$v" + } + } | Select-Object -Last 1 + } + + return $latestVersion + } + catch { + Write-Error "Failed to fetch version: $_" + return $null + } +} + +# Get the version +$version = 
Get-LatestOpenShiftVersion -Channel $ChannelGroup -Minor $VersionMinor + +if (-not $version) { + Write-Error "Failed to resolve version" + exit 1 +} + +Write-Host "Resolved Version: $version" -ForegroundColor Green +Write-Host "" + +# Set environment variables +Write-Host "Setting environment variables..." -ForegroundColor Gray +$env:ARO_HCP_OPENSHIFT_CHANNEL_GROUP = $ChannelGroup +$env:ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP = $ChannelGroup +$env:ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION = $version +$env:ARO_HCP_OPENSHIFT_NODEPOOL_VERSION = $version + +# Print what was set +Write-Host "" +Write-Host "✓ Environment variables set:" -ForegroundColor Green +Write-Host " ARO_HCP_OPENSHIFT_CHANNEL_GROUP = $ChannelGroup" +Write-Host " ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP = $ChannelGroup" +Write-Host " ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION = $version" +Write-Host " ARO_HCP_OPENSHIFT_NODEPOOL_VERSION = $version" +Write-Host "" +Write-Host "These variables are set for this PowerShell session." 
-ForegroundColor Yellow +Write-Host "To persist them, add to your profile or use:" -ForegroundColor Yellow +Write-Host "" +Write-Host ('[System.Environment]::SetEnvironmentVariable("ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION", "' + $version + '", "User")') -ForegroundColor Cyan # parentheses force expression mode so '+' concatenates +Write-Host ('[System.Environment]::SetEnvironmentVariable("ARO_HCP_OPENSHIFT_NODEPOOL_VERSION", "' + $version + '", "User")') -ForegroundColor Cyan diff --git a/test/scripts/check-channel-groups.sh b/test/scripts/check-channel-groups.sh new file mode 100644 index 00000000000..ac197673444 --- /dev/null +++ b/test/scripts/check-channel-groups.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Script to check and validate OpenShift channel group configuration + +set -euo pipefail + +echo "=== OpenShift Version Configuration Check ===" +echo "" + +# Check environment variables +echo "Environment Variables:" +echo " ARO_HCP_OPENSHIFT_CHANNEL_GROUP: ${ARO_HCP_OPENSHIFT_CHANNEL_GROUP:-}" +echo " ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP: ${ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP:-}" +echo " ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION: ${ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION:-}" +echo " ARO_HCP_OPENSHIFT_NODEPOOL_VERSION: ${ARO_HCP_OPENSHIFT_NODEPOOL_VERSION:-}" +echo "" + +# Determine effective channel groups +CP_CHANNEL="${ARO_HCP_OPENSHIFT_CHANNEL_GROUP:-candidate}" +NP_CHANNEL="${ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP:-candidate}" + +echo "Effective Channel Groups:" +echo " Control Plane: $CP_CHANNEL" +echo " Node Pool: $NP_CHANNEL" +echo "" + +# Check if they match +if [ "$CP_CHANNEL" = "$NP_CHANNEL" ]; then + echo "✓ Channel groups MATCH - versions should be consistent" +else + echo "✗ WARNING: Channel groups DIFFER - this may cause version mismatches!" 
+ echo " Recommendation: Set both to the same channel group" +fi +echo "" + +# Check explicit version overrides +if [ -n "${ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION:-}" ] || [ -n "${ARO_HCP_OPENSHIFT_NODEPOOL_VERSION:-}" ]; then + echo "Explicit Version Overrides Detected:" + if [ -n "${ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION:-}" ]; then + echo " Control Plane: $ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION" + fi + if [ -n "${ARO_HCP_OPENSHIFT_NODEPOOL_VERSION:-}" ]; then + echo " Node Pool: $ARO_HCP_OPENSHIFT_NODEPOOL_VERSION" + fi + + if [ "${ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION:-unset}" != "${ARO_HCP_OPENSHIFT_NODEPOOL_VERSION:-unset}" ]; then + echo " ✗ WARNING: Versions are different!" + echo " This WILL cause validation errors:" + echo " 'Node pool version must not be greater than Control Plane version'" + fi +fi + +echo "" +echo "=== Recommendations ===" +echo "1. Ensure ARO_HCP_OPENSHIFT_CHANNEL_GROUP == ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP" +echo "2. If setting versions explicitly, use the SAME version for both" +echo "3. 
For CI/CD, use a single version resolution step and pass it to both" diff --git a/test/scripts/lib/version-sort.sh b/test/scripts/lib/version-sort.sh new file mode 100644 index 00000000000..91696022439 --- /dev/null +++ b/test/scripts/lib/version-sort.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Shared version sorting utility for OpenShift version strings +# Compatible with Linux (GNU sort) and macOS (BSD sort) +# Handles semantic versions including pre-release tags (e.g., "4.20.0-0.nightly-...") + +# Portable version sorting function +# Works on both Linux (GNU sort) and macOS (BSD sort) +sort_versions() { + # Try sort -V (GNU sort, available on Linux and via coreutils on macOS) + if sort -V /dev/null &>/dev/null 2>&1; then + sort -V + # Try gsort -V (GNU sort via Homebrew coreutils on macOS) + elif command -v gsort &>/dev/null && gsort -V /dev/null &>/dev/null 2>&1; then + gsort -V + # Fallback: use Python for semantic version sorting + elif command -v python3 &>/dev/null; then + python3 -c ' +import sys +versions = [line.strip() for line in sys.stdin if line.strip()] +try: + from packaging import version + sorted_versions = sorted(versions, key=lambda v: version.parse(v)) + for v in sorted_versions: + print(v) +except Exception: + # Fallback to basic string sort if packaging is missing (import is inside try) or a version fails to parse + for v in sorted(versions): + print(v) +' + else + # Last resort: basic alphanumeric sort (not semver-aware, but better than nothing) + echo "WARNING: No proper version sorting available. Install GNU coreutils or Python with packaging module for accurate results." 
>&2 + sort + fi +} diff --git a/test/scripts/set-ocp-versions.sh b/test/scripts/set-ocp-versions.sh new file mode 100644 index 00000000000..99e99edbcb7 --- /dev/null +++ b/test/scripts/set-ocp-versions.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# Script to set synchronized OpenShift versions for E2E tests +# This ensures control plane and node pool use the same version +# Compatible with Linux and macOS (automatically detects available sorting methods) + +# Detect if script is being sourced or executed +# When sourced, preserve parent shell options to avoid mutating caller's environment +(return 0 2>/dev/null) && SOURCED=1 || SOURCED=0 + +# Preserve shell options if sourced and set up restoration trap +if [ "$SOURCED" = "1" ]; then + # Save current shell options + OLD_SHELL_OPTS=$(set +o) + # Trap RETURN only (fires when sourced script finishes) + # Avoid ERR/EXIT traps as they would leak into parent shell + trap 'eval "$OLD_SHELL_OPTS"; trap - RETURN' RETURN +fi + +set -euo pipefail + +# Helper function to exit/return appropriately +script_exit() { + local exit_code=$1 + if [ "$SOURCED" = "1" ]; then + # Trap will handle shell option restoration + # Clear trap before returning to avoid it firing again + trap - RETURN + eval "$OLD_SHELL_OPTS" + return "$exit_code" + else + exit "$exit_code" + fi +} + +# Parse command line arguments +CHANNEL_GROUP="${1:-candidate}" +VERSION_MINOR="${2:-4.20}" + +echo "=== Setting OpenShift Versions for E2E Tests ===" +echo "Channel Group: $CHANNEL_GROUP" +echo "Version Minor: $VERSION_MINOR" +echo "" + +# Resolve the latest version for the channel +echo "Fetching latest version from Cincinnati..." 
+ +# Source shared version sorting library +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/lib/version-sort.sh" + +# Function to get latest version (simplified - you may want to use the Go helper) +get_latest_version() { + local channel="$1" + local minor="$2" + + # Query the OpenShift graph API + local graph_url="https://api.openshift.com/api/upgrades_info/v1/graph?channel=${channel}-${minor}" + + # Fetch and parse (requires jq) + if ! command -v jq &> /dev/null; then + echo "ERROR: jq is required but not installed" >&2 + return 1 + fi + + local version=$(curl --silent --show-error --fail --location --retry 3 --retry-delay 2 --retry-connrefused --max-time 30 "$graph_url" | jq -r '.nodes[].version' | sort_versions | tail -1) + + if [ -z "$version" ]; then + echo "ERROR: No version found for channel $channel-$minor" >&2 + return 1 + fi + + echo "$version" +} + +# Get the version +# Note: We capture both the output and the exit status to handle errors properly +if ! VERSION=$(get_latest_version "$CHANNEL_GROUP" "$VERSION_MINOR"); then + echo "ERROR: Failed to fetch version from Cincinnati" >&2 + script_exit 1 +fi + +if [ -z "$VERSION" ]; then + echo "ERROR: Failed to resolve version" >&2 + script_exit 1 +fi + +echo "Resolved Version: $VERSION" +echo "" + +# Export environment variables +echo "Setting environment variables..." 
+export ARO_HCP_OPENSHIFT_CHANNEL_GROUP="$CHANNEL_GROUP" +export ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP="$CHANNEL_GROUP" +export ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION="$VERSION" +export ARO_HCP_OPENSHIFT_NODEPOOL_VERSION="$VERSION" + +# Print what was set +echo "" +echo "✓ Environment variables set:" +echo " export ARO_HCP_OPENSHIFT_CHANNEL_GROUP=\"$CHANNEL_GROUP\"" +echo " export ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP=\"$CHANNEL_GROUP\"" +echo " export ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION=\"$VERSION\"" +echo " export ARO_HCP_OPENSHIFT_NODEPOOL_VERSION=\"$VERSION\"" +echo "" +echo "To apply these in your current shell, run:" +echo " source ${BASH_SOURCE[0]} $CHANNEL_GROUP $VERSION_MINOR" +echo "" +echo "Or copy and paste:" +cat <&2 + echo " This will cause validation errors." >&2 + echo " Either unset both variables or ensure they match." >&2 + return 1 + fi + + echo "✓ Versions are synchronized" + return 0 + fi + + # Fetch latest version from Cincinnati + echo "Fetching latest version from OpenShift graph API..." + + GRAPH_URL="https://api.openshift.com/api/upgrades_info/v1/graph?channel=${CHANNEL_GROUP}-${VERSION_MINOR}" + + # Use curl with retries for robustness in CI + MAX_RETRIES=3 + RETRY_DELAY=5 + + for i in $(seq 1 $MAX_RETRIES); do + if GRAPH_JSON=$(curl -s --fail --max-time 30 "$GRAPH_URL" 2>/dev/null); then + break + fi + + if [ $i -eq $MAX_RETRIES ]; then + echo "ERROR: Failed to fetch version after $MAX_RETRIES attempts" >&2 + return 1 + fi + + echo "Attempt $i failed, retrying in ${RETRY_DELAY}s..." + sleep $RETRY_DELAY + done + + # Parse latest version (requires jq) + if ! 
command -v jq &> /dev/null; then + echo "ERROR: jq is required but not installed" >&2 + echo "Install with: apt-get install jq (or equivalent)" >&2 + return 1 + fi + + VERSION=$(echo "$GRAPH_JSON" | jq -r '.nodes[].version' | sort_versions | tail -1) + + if [ -z "$VERSION" ] || [ "$VERSION" = "null" ]; then + echo "ERROR: No version found for channel ${CHANNEL_GROUP}-${VERSION_MINOR}" >&2 + echo "Response was: $GRAPH_JSON" >&2 + return 1 + fi + + echo "Resolved Version: $VERSION" + echo "" + + # Export synchronized versions + export ARO_HCP_OPENSHIFT_CHANNEL_GROUP="$CHANNEL_GROUP" + export ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP="$CHANNEL_GROUP" + export ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION="$VERSION" + export ARO_HCP_OPENSHIFT_NODEPOOL_VERSION="$VERSION" + + # For GitHub Actions - output to GITHUB_ENV + if [ -n "${GITHUB_ENV:-}" ]; then + echo "Exporting to GitHub Actions environment..." + { + echo "ARO_HCP_OPENSHIFT_CHANNEL_GROUP=$CHANNEL_GROUP" + echo "ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP=$CHANNEL_GROUP" + echo "ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION=$VERSION" + echo "ARO_HCP_OPENSHIFT_NODEPOOL_VERSION=$VERSION" + } >> "$GITHUB_ENV" + fi + + echo "✓ Synchronized versions set:" + echo " Channel Group: $CHANNEL_GROUP" + echo " Control Plane Version: $VERSION" + echo " Node Pool Version: $VERSION" + echo "" + + # Output for other CI systems + echo "export ARO_HCP_OPENSHIFT_CHANNEL_GROUP=\"$CHANNEL_GROUP\"" + echo "export ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP=\"$CHANNEL_GROUP\"" + echo "export ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION=\"$VERSION\"" + echo "export ARO_HCP_OPENSHIFT_NODEPOOL_VERSION=\"$VERSION\"" +} + +if main "$@"; then + status=0 +else + status=$? 
+fi + +# Restore shell options if sourced +if is_sourced; then + # Clear trap before manual restoration to avoid double-firing + trap - RETURN + eval "$OLD_SHELL_OPTS" + return "$status" +else + exit "$status" +fi diff --git a/test/util/framework/deployment_params.go b/test/util/framework/deployment_params.go index b662b652294..02c75d88689 100644 --- a/test/util/framework/deployment_params.go +++ b/test/util/framework/deployment_params.go @@ -24,6 +24,7 @@ import ( "sync" "time" + "github.com/blang/semver/v4" . "github.com/onsi/ginkgo/v2" "k8s.io/apimachinery/pkg/util/rand" @@ -136,20 +137,44 @@ func DefaultOpenshiftNodePoolVersionId() string { version := os.Getenv("ARO_HCP_OPENSHIFT_NODEPOOL_VERSION") if len(version) == 0 { channelGroup := DefaultOpenshiftNodePoolChannelGroup() - if channelGroup == DefaultOpenshiftChannelGroup() { + cpChannelGroup := DefaultOpenshiftChannelGroup() + + // CRITICAL: When channel groups match, ALWAYS use the control plane version + // to prevent version mismatches due to Cincinnati timing differences. + // This ensures node pool version never exceeds control plane version. 
+ if channelGroup == cpChannelGroup { return DefaultOpenshiftControlPlaneVersionId() } - if channelGroup != "stable" { - var err error - version, err = GetLatestInstallVersion(context.Background(), channelGroup, DefaultOCPVersionId) - if err != nil { - if errors.Is(err, ErrNightlyReleaseStreamNotFound) || errors.Is(err, ErrNoAcceptedNightlyTags) || errors.Is(err, ErrVersionNotFound) { - Skip(fmt.Sprintf("No install version found for %s in %s channel (%s)", version, channelGroup, err.Error())) - } else { - Fail(fmt.Sprintf("failed to get latest install version for %s channel: %s", channelGroup, err.Error())) - } + + // Different channel groups: resolve node pool version from its own channel, + // then validate it doesn't exceed control plane version + var err error + version, err = GetLatestInstallVersion(context.Background(), channelGroup, DefaultOCPVersionId) + if err != nil { + if errors.Is(err, ErrNightlyReleaseStreamNotFound) || errors.Is(err, ErrNoAcceptedNightlyTags) || errors.Is(err, ErrVersionNotFound) { + Skip(fmt.Sprintf("No install version found for %s in %s channel (%s)", DefaultOCPVersionId, channelGroup, err.Error())) + } else { + Fail(fmt.Sprintf("failed to get latest install version for %s channel: %s", channelGroup, err.Error())) } } + + // Validate: node pool version must not exceed control plane version + cpVersion := DefaultOpenshiftControlPlaneVersionId() + npSemver, npErr := semver.Parse(version) + cpSemver, cpErr := semver.Parse(cpVersion) + + if npErr == nil && cpErr == nil { + if npSemver.GT(cpSemver) { + // Node pool version exceeds control plane version - clamp it + fmt.Fprintf(os.Stderr, "WARNING: Node pool version %s (from %s channel) exceeds control plane version %s (from %s channel). 
Clamping to control plane version.\n", + version, channelGroup, cpVersion, cpChannelGroup) + version = cpVersion + } + } else { + // Couldn't parse versions for comparison - log warning but continue + fmt.Fprintf(os.Stderr, "WARNING: Could not compare versions (np=%s, cp=%s). Proceeding with node pool version from %s channel.\n", + version, cpVersion, channelGroup) + } } return version } @@ -211,8 +236,61 @@ type NodePoolAutoScalingParams struct { } func NewDefaultNodePoolParams() NodePoolParams { + npVersion := DefaultOpenshiftNodePoolVersionId() + cpVersion := DefaultOpenshiftControlPlaneVersionId() + + // Validate that node pool version doesn't exceed control plane version + // This catches configuration errors early before they reach the API validation + npVer, npErr := semver.ParseTolerant(npVersion) + cpVer, cpErr := semver.ParseTolerant(cpVersion) + if npErr != nil { + Fail(fmt.Sprintf( + "Configuration error: failed to parse node pool version %q: %v. "+ + "Check your channel group settings:\n"+ + " ARO_HCP_OPENSHIFT_CHANNEL_GROUP=%s\n"+ + " ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP=%s\n"+ + " ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION=%s\n"+ + " ARO_HCP_OPENSHIFT_NODEPOOL_VERSION=%s", + npVersion, npErr, + DefaultOpenshiftChannelGroup(), + DefaultOpenshiftNodePoolChannelGroup(), + os.Getenv("ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION"), + os.Getenv("ARO_HCP_OPENSHIFT_NODEPOOL_VERSION"), + )) + } + if cpErr != nil { + Fail(fmt.Sprintf( + "Configuration error: failed to parse control plane version %q: %v. 
"+ + "Check your channel group settings:\n"+ + " ARO_HCP_OPENSHIFT_CHANNEL_GROUP=%s\n"+ + " ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP=%s\n"+ + " ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION=%s\n"+ + " ARO_HCP_OPENSHIFT_NODEPOOL_VERSION=%s", + cpVersion, cpErr, + DefaultOpenshiftChannelGroup(), + DefaultOpenshiftNodePoolChannelGroup(), + os.Getenv("ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION"), + os.Getenv("ARO_HCP_OPENSHIFT_NODEPOOL_VERSION"), + )) + } + if npVer.GT(cpVer) { + Fail(fmt.Sprintf( + "Configuration error: Node pool version (%s) exceeds control plane version (%s). "+ + "This will fail API validation. Check your channel group settings:\n"+ + " ARO_HCP_OPENSHIFT_CHANNEL_GROUP=%s\n"+ + " ARO_HCP_OPENSHIFT_NODEPOOL_CHANNEL_GROUP=%s\n"+ + " ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION=%s\n"+ + " ARO_HCP_OPENSHIFT_NODEPOOL_VERSION=%s", + npVersion, cpVersion, + DefaultOpenshiftChannelGroup(), + DefaultOpenshiftNodePoolChannelGroup(), + os.Getenv("ARO_HCP_OPENSHIFT_CONTROLPLANE_VERSION"), + os.Getenv("ARO_HCP_OPENSHIFT_NODEPOOL_VERSION"), + )) + } + return NodePoolParams{ - OpenshiftVersionId: DefaultOpenshiftNodePoolVersionId(), + OpenshiftVersionId: npVersion, Replicas: int32(2), VMSize: "Standard_D8s_v3", OSDiskSizeGiB: int32(64),