diff --git a/backend/cmd/root.go b/backend/cmd/root.go index 65858e7a09f..cfd3b97741d 100644 --- a/backend/cmd/root.go +++ b/backend/cmd/root.go @@ -377,16 +377,27 @@ func (f *BackendRootCmdFlags) ToBackendOptions(ctx context.Context, cmd *cobra.C smiClientBuilder := app.NewServiceManagedIdentityClientBuilder(fpaMIDataplaneClientBuilder, azureConfig) - resourcesCosmosDBClient, billingDBClient, err := app.NewCosmosDBClients( - ctx, + azCoreClientOptions := *azureConfig.CloudEnvironment.AZCoreClientOptions() + + cosmosDatabaseClient, err := app.NewCosmosDatabaseClient( f.AzureCosmosDBURL, f.AzureCosmosDBName, - *azureConfig.CloudEnvironment.AZCoreClientOptions(), + azCoreClientOptions, ) if err != nil { return nil, utils.TrackError(err) } + resourcesCosmosDBClient, billingDBClient, err := app.NewCosmosDBClients(cosmosDatabaseClient) + if err != nil { + return nil, utils.TrackError(err) + } + + fleetDBClient, err := app.NewFleetDBClient(cosmosDatabaseClient) + if err != nil { + return nil, utils.TrackError(fmt.Errorf("failed to create fleet db client: %w", err)) + } + clustersServiceClient, err := app.NewClustersServiceClient(ctx, f.ClustersServiceURL, f.ClustersServiceTLSInsecure) if err != nil { return nil, utils.TrackError(fmt.Errorf("failed to create clusters service client: %w", err)) @@ -401,6 +412,7 @@ func (f *BackendRootCmdFlags) ToBackendOptions(ctx context.Context, cmd *cobra.C LeaderElectionLock: leaderElectionLock, ResourcesDBClient: resourcesCosmosDBClient, BillingDBClient: billingDBClient, + FleetDBClient: fleetDBClient, ClustersServiceClient: clustersServiceClient, MetricsServerListenAddress: f.MetricsServerListenAddress, HealthzServerListenAddress: f.HealthzServerListenAddress, diff --git a/backend/go.mod b/backend/go.mod index 2a25c9a2d4d..b46a5076219 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -15,6 +15,7 @@ require ( github.com/Azure/msi-dataplane v0.4.3 github.com/blang/semver/v4 v4.0.0 github.com/go-logr/logr v1.4.3 + 
github.com/google/go-cmp v0.7.0 github.com/google/uuid v1.6.0 github.com/openshift-online/maestro v0.0.0-20260213014104-081c1f6df17b github.com/openshift-online/ocm-sdk-go v0.1.499 @@ -75,7 +76,6 @@ require ( github.com/golang/glog v1.2.5 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/google/gnostic-models v0.7.0 // indirect - github.com/google/go-cmp v0.7.0 // indirect github.com/gorilla/css v1.0.1 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect diff --git a/backend/pkg/app/backend.go b/backend/pkg/app/backend.go index f5be8f6f412..0a91d48ff31 100644 --- a/backend/pkg/app/backend.go +++ b/backend/pkg/app/backend.go @@ -39,6 +39,7 @@ import ( "github.com/Azure/ARO-HCP/backend/pkg/controllers/clusterpropertiescontroller" "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" "github.com/Azure/ARO-HCP/backend/pkg/controllers/datadumpcontrollers" + "github.com/Azure/ARO-HCP/backend/pkg/controllers/managementclustercontrollers" "github.com/Azure/ARO-HCP/backend/pkg/controllers/metricscontrollers" "github.com/Azure/ARO-HCP/backend/pkg/controllers/mismatchcontrollers" "github.com/Azure/ARO-HCP/backend/pkg/controllers/nodepoolpropertiescontroller" @@ -50,6 +51,7 @@ import ( "github.com/Azure/ARO-HCP/backend/pkg/maestro" internalazure "github.com/Azure/ARO-HCP/internal/azure" "github.com/Azure/ARO-HCP/internal/database" + dbinformers "github.com/Azure/ARO-HCP/internal/database/informers" "github.com/Azure/ARO-HCP/internal/ocm" "github.com/Azure/ARO-HCP/internal/utils" ) @@ -65,6 +67,7 @@ type BackendOptions struct { LeaderElectionLock resourcelock.Interface ResourcesDBClient database.ResourcesDBClient BillingDBClient database.BillingDBClient + FleetDBClient database.FleetDBClient ClustersServiceClient ocm.ClusterServiceClientSpec MetricsRegisterer prometheus.Registerer MetricsGatherer prometheus.Gatherer @@ -362,6 +365,10 @@ func (b *Backend) 
runBackendControllersUnderLeaderElection(ctx context.Context, operationPhaseMetricsController := metricscontrollers.NewController( "OperationPhaseMetrics", backendInformers.AllOperations(), operationPhaseHandler) + fleetInformers := dbinformers.NewFleetInformers(ctx, b.options.FleetDBClient.GlobalListers()) + _, stampLister := fleetInformers.Stamps() + _, managementClusterLister := fleetInformers.ManagementClusters() + clusterInformer, clusterLister := backendInformers.Clusters() clusterHandler := metricscontrollers.NewClusterMetricsHandler(b.options.MetricsRegisterer) clusterMetricsController := metricscontrollers.NewController( @@ -385,6 +392,7 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, clusterRecursiveDataDumpController := datadumpcontrollers.NewClusterRecursiveDataDumpController(b.options.ResourcesDBClient, activeOperationLister, backendInformers) csStateDumpController := datadumpcontrollers.NewCSStateDumpController(b.options.ResourcesDBClient, activeOperationLister, backendInformers, b.options.ClustersServiceClient) billingDumpController := datadumpcontrollers.NewBillingDumpController(b.options.ResourcesDBClient, b.options.BillingDBClient, activeOperationLister, backendInformers) + managementClusterDumpController := datadumpcontrollers.NewManagementClusterDataDumpController(b.options.FleetDBClient, managementClusterLister, fleetInformers) doNothingController := controllers.NewDoNothingExampleController(b.options.ResourcesDBClient, subscriptionLister) dispatchRequestCredentialController := operationcontrollers.NewDispatchRequestCredentialController( utilsclock.RealClock{}, @@ -558,20 +566,31 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, b.options.ResourcesDBClient, backendInformers, ) - nodePoolVersionController := upgradecontrollers.NewNodePoolVersionController( b.options.ResourcesDBClient, b.options.ClustersServiceClient, activeOperationLister, backendInformers, ) - 
triggerNodePoolUpgradeController := upgradecontrollers.NewTriggerNodePoolUpgradeController( b.options.ResourcesDBClient, b.options.ClustersServiceClient, activeOperationLister, backendInformers, ) + managementClusterMigrationController := managementclustercontrollers.NewManagementClusterMigrationController( + b.options.ClustersServiceClient, + b.options.FleetDBClient, + stampLister, + managementClusterLister, + ) + placementSyncController := managementclustercontrollers.NewManagementClusterPlacementSyncController( + b.options.ResourcesDBClient, + b.options.ClustersServiceClient, + activeOperationLister, + managementClusterLister, + backendInformers, + ) nodePoolPropertiesSyncController := nodepoolpropertiescontroller.NewNodePoolPropertiesSyncController( b.options.ResourcesDBClient, @@ -596,11 +615,13 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, OnStartedLeading: func(ctx context.Context) { // start the SharedInformers go backendInformers.RunWithContext(ctx) + go fleetInformers.RunWithContext(ctx) go subscriptionNonClusterDataDumpController.Run(ctx, 20) go clusterRecursiveDataDumpController.Run(ctx, 20) go csStateDumpController.Run(ctx, 20) go billingDumpController.Run(ctx, 20) + go managementClusterDumpController.Run(ctx, 20) go doNothingController.Run(ctx, 20) go dispatchRequestCredentialController.Run(ctx, 20) go dispatchRevokeCredentialsController.Run(ctx, 20) @@ -645,6 +666,8 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, go clusterMetricsController.Run(ctx, 1) go nodePoolMetricsController.Run(ctx, 1) go externalAuthMetricsController.Run(ctx, 1) + go managementClusterMigrationController.Run(ctx, 1) + go placementSyncController.Run(ctx, 20) }, OnStoppedLeading: func() { // This needs to be defined even though it does nothing. 
diff --git a/backend/pkg/app/cosmos_wiring.go b/backend/pkg/app/cosmos_wiring.go index 4e7f7f00148..e68a36b6a46 100644 --- a/backend/pkg/app/cosmos_wiring.go +++ b/backend/pkg/app/cosmos_wiring.go @@ -15,23 +15,28 @@ package app import ( - "context" "fmt" "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/Azure/azure-sdk-for-go/sdk/data/azcosmos" "github.com/Azure/ARO-HCP/internal/database" "github.com/Azure/ARO-HCP/internal/utils" ) -// NewCosmosDBClients opens the shared async Cosmos database and returns data-plane clients for -// ARM resource documents (Resources container) and billing documents (Billing container). -func NewCosmosDBClients(ctx context.Context, cosmosDBURL string, cosmosDBName string, azCoreClientOptions azcore.ClientOptions) (database.ResourcesDBClient, database.BillingDBClient, error) { - cosmosDatabaseClient, err := database.NewCosmosDatabaseClient(cosmosDBURL, cosmosDBName, azCoreClientOptions) +// NewCosmosDatabaseClient creates the shared Cosmos DatabaseClient that +// is passed into the per-container wiring functions below. +func NewCosmosDatabaseClient(cosmosDBURL string, cosmosDBName string, azCoreClientOptions azcore.ClientOptions) (*azcosmos.DatabaseClient, error) { + client, err := database.NewCosmosDatabaseClient(cosmosDBURL, cosmosDBName, azCoreClientOptions) if err != nil { - return nil, nil, utils.TrackError(fmt.Errorf("failed to create Azure Cosmos database client: %w", err)) + return nil, utils.TrackError(fmt.Errorf("failed to create Azure Cosmos database client: %w", err)) } + return client, nil +} +// NewCosmosDBClients returns data-plane clients for +// ARM resource documents (Resources container) and billing documents (Billing container). 
+func NewCosmosDBClients(cosmosDatabaseClient *azcosmos.DatabaseClient) (database.ResourcesDBClient, database.BillingDBClient, error) { resourcesDBClient, err := database.NewResourcesDBClient(cosmosDatabaseClient) if err != nil { return nil, nil, utils.TrackError(fmt.Errorf("failed to create resources database client: %w", err)) @@ -44,3 +49,12 @@ func NewCosmosDBClients(ctx context.Context, cosmosDBURL string, cosmosDBName st return resourcesDBClient, billingDBClient, nil } + +func NewFleetDBClient(cosmosDatabaseClient *azcosmos.DatabaseClient) (database.FleetDBClient, error) { + fleetClient, err := database.NewFleetDBClient(cosmosDatabaseClient) + if err != nil { + return nil, utils.TrackError(fmt.Errorf("failed to create Fleet DBClient: %w", err)) + } + + return fleetClient, nil +} diff --git a/backend/pkg/controllers/controllerutils/management_cluster_watching_controller.go b/backend/pkg/controllers/controllerutils/management_cluster_watching_controller.go new file mode 100644 index 00000000000..bbf789fb107 --- /dev/null +++ b/backend/pkg/controllers/controllerutils/management_cluster_watching_controller.go @@ -0,0 +1,131 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package controllerutils + +import ( + "context" + "errors" + "time" + + "github.com/go-logr/logr" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + + azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" + + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/api/fleet" + "github.com/Azure/ARO-HCP/internal/database" + dbinformers "github.com/Azure/ARO-HCP/internal/database/informers" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type ManagementClusterKey struct { + StampIdentifier string `json:"stampIdentifier"` +} + +func (k ManagementClusterKey) GetResourceID() *azcorearm.ResourceID { + return api.Must(fleet.ToManagementClusterResourceID(k.StampIdentifier)) +} + +func (k ManagementClusterKey) AddLoggerValues(logger logr.Logger) logr.Logger { + return logger.WithValues( + utils.LogValues{}. + AddLogValuesForResourceID(k.GetResourceID())...) +} + +func (k ManagementClusterKey) InitialController(controllerName string) *api.Controller { + resourceID := api.Must(azcorearm.ParseResourceID(k.GetResourceID().String() + "/" + fleet.ControllerResourceTypeName + "/" + controllerName)) + return &api.Controller{ + CosmosMetadata: api.CosmosMetadata{ + ResourceID: resourceID, + }, + ExternalID: k.GetResourceID(), + Status: api.ControllerStatus{ + Conditions: []metav1.Condition{}, + }, + } +} + +type ManagementClusterSyncer interface { + SyncOnce(ctx context.Context, key ManagementClusterKey) error + CooldownChecker() CooldownChecker +} + +type managementClusterWatchingController struct { + name string + syncer ManagementClusterSyncer + fleetDBClient database.FleetDBClient +} + +// NewManagementClusterWatchingController periodically looks up all management clusters and queues them. 
+func NewManagementClusterWatchingController( + name string, + fleetDBClient database.FleetDBClient, + fleetInformers dbinformers.FleetInformers, + resyncDuration time.Duration, + syncer ManagementClusterSyncer, +) Controller { + mcSyncer := &managementClusterWatchingController{ + name: name, + syncer: syncer, + fleetDBClient: fleetDBClient, + } + mcController := newGenericWatchingController(name, fleet.ManagementClusterResourceType, mcSyncer) + + // fleetInformers is nil when unit tests don't want triggering. This isn't beautiful, but skipping the informer wiring fails safe by doing nothing. + if fleetInformers != nil { + managementClusterInformer, _ := fleetInformers.ManagementClusters() + err := mcController.QueueForInformers(resyncDuration, managementClusterInformer) + if err != nil { + panic(err) // coding error + } + } + + return mcController +} + +func (c *managementClusterWatchingController) SyncOnce(ctx context.Context, key ManagementClusterKey) error { + controllerCRUD := c.fleetDBClient.Stamps().ManagementClusters(key.StampIdentifier).Controllers() + + defer utilruntime.HandleCrash(DegradedControllerPanicHandler( + ctx, + controllerCRUD, + c.name, + key.InitialController)) + + syncErr := c.syncer.SyncOnce(ctx, key) + + controllerWriteErr := WriteController( + ctx, + controllerCRUD, + c.name, + key.InitialController, + ReportSyncError(syncErr), + ) + + return errors.Join(syncErr, controllerWriteErr) +} + +func (c *managementClusterWatchingController) CooldownChecker() CooldownChecker { + return c.syncer.CooldownChecker() +} + +func (c *managementClusterWatchingController) MakeKey(resourceID *azcorearm.ResourceID) ManagementClusterKey { + return ManagementClusterKey{ + StampIdentifier: resourceID.Parent.Name, + } +} diff --git a/backend/pkg/controllers/datadumpcontrollers/dump_management_cluster.go b/backend/pkg/controllers/datadumpcontrollers/dump_management_cluster.go new file mode 100644 index 00000000000..51cc3f49390 --- /dev/null +++ 
b/backend/pkg/controllers/datadumpcontrollers/dump_management_cluster.go @@ -0,0 +1,80 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package datadumpcontrollers + +import ( + "context" + "fmt" + "time" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/database" + dbinformers "github.com/Azure/ARO-HCP/internal/database/informers" + dblisters "github.com/Azure/ARO-HCP/internal/database/listers" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type managementClusterDataDump struct { + cooldownChecker controllerutils.CooldownChecker + managementClusterLister dblisters.ManagementClusterLister + + nextDataDumpChecker controllerutils.CooldownChecker +} + +// NewManagementClusterDataDumpController periodically dumps management cluster data. 
+func NewManagementClusterDataDumpController( + fleetDBClient database.FleetDBClient, + managementClusterLister dblisters.ManagementClusterLister, + fleetInformers dbinformers.FleetInformers, +) controllerutils.Controller { + syncer := &managementClusterDataDump{ + cooldownChecker: controllerutils.NewTimeBasedCooldownChecker(4 * time.Minute), + managementClusterLister: managementClusterLister, + nextDataDumpChecker: controllerutils.NewTimeBasedCooldownChecker(4 * time.Minute), + } + + return controllerutils.NewManagementClusterWatchingController( + "ManagementClusterDataDump", + fleetDBClient, + fleetInformers, + 5*time.Minute, + syncer, + ) +} + +func (c *managementClusterDataDump) SyncOnce(ctx context.Context, key controllerutils.ManagementClusterKey) error { + if !c.nextDataDumpChecker.CanSync(ctx, key) { + return nil + } + + logger := utils.LoggerFromContext(ctx) + + mc, err := c.managementClusterLister.Get(ctx, key.StampIdentifier) + if err != nil { + logger.Error(err, "failed to get management cluster") + return nil + } + + logger.Info(fmt.Sprintf("dumping resourceID %v", mc.CosmosMetadata.ResourceID), + "currentResourceID", mc.CosmosMetadata.ResourceID.String(), + "content", mc, + ) + + return nil +} + +func (c *managementClusterDataDump) CooldownChecker() controllerutils.CooldownChecker { + return c.cooldownChecker +} diff --git a/backend/pkg/controllers/managementclustercontrollers/management_cluster_migration.go b/backend/pkg/controllers/managementclustercontrollers/management_cluster_migration.go new file mode 100644 index 00000000000..19b51464cc7 --- /dev/null +++ b/backend/pkg/controllers/managementclustercontrollers/management_cluster_migration.go @@ -0,0 +1,290 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package managementclustercontrollers + +import ( + "context" + "errors" + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/equality" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/workqueue" + + arohcpv1alpha1 "github.com/openshift-online/ocm-sdk-go/arohcp/v1alpha1" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/api/fleet" + "github.com/Azure/ARO-HCP/internal/database" + dblisters "github.com/Azure/ARO-HCP/internal/database/listers" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +const controllerName = "ManagementClusterMigration" + +var _ controllerutils.Controller = &managementClusterMigrationController{} + +type managementClusterMigrationController struct { + name string + + clusterServiceClient ocm.ClusterServiceClientSpec + fleetDBClient database.FleetDBClient + stampLister dblisters.StampLister + managementClusterLister dblisters.ManagementClusterLister + + resyncDuration time.Duration + queue workqueue.TypedRateLimitingInterface[string] +} + +// NewManagementClusterMigrationController creates a controller that periodically lists +// all management clusters from Cluster Service and upserts them into CosmosDB. 
+func NewManagementClusterMigrationController( + clusterServiceClient ocm.ClusterServiceClientSpec, + fleetDBClient database.FleetDBClient, + stampLister dblisters.StampLister, + managementClusterLister dblisters.ManagementClusterLister, +) controllerutils.Controller { + return &managementClusterMigrationController{ + name: controllerName, + clusterServiceClient: clusterServiceClient, + fleetDBClient: fleetDBClient, + stampLister: stampLister, + managementClusterLister: managementClusterLister, + resyncDuration: 30 * time.Minute, + queue: workqueue.NewTypedRateLimitingQueueWithConfig( + workqueue.DefaultTypedControllerRateLimiter[string](), + workqueue.TypedRateLimitingQueueConfig[string]{ + Name: controllerName, + }, + ), + } +} + +func (c *managementClusterMigrationController) SyncOnce(ctx context.Context, _ any) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("Syncing management clusters from Cluster Service") + + return utils.TrackError(c.syncAllManagementClusters(ctx)) +} + +// syncAllManagementClusters lists all provision shards from Cluster Service and +// upserts them into Cosmos. Note: this is an additive sync only — management +// clusters removed from CS are not pruned from Cosmos. This is intentional: +// Cosmos is becoming the source of truth, and the admin API registration path +// will eventually replace this sync controller. Decommissioning a management +// cluster will be handled explicitly when the time comes. 
+func (c *managementClusterMigrationController) syncAllManagementClusters(ctx context.Context) error { + logger := utils.LoggerFromContext(ctx) + + iter := c.clusterServiceClient.ListProvisionShards() + var syncErrors []error + + for csShard := range iter.Items(ctx) { + if err := c.syncProvisionShard(ctx, csShard); err != nil { + syncErrors = append(syncErrors, err) + } + } + + if err := iter.GetError(); err != nil { + logger.Error(err, "failed to list management clusters from Cluster Service") + syncErrors = append(syncErrors, err) + } + + return errors.Join(syncErrors...) +} + +// syncProvisionShard converts a single CS provision shard and upserts it into Cosmos. +func (c *managementClusterMigrationController) syncProvisionShard(ctx context.Context, csShard *arohcpv1alpha1.ProvisionShard) error { + logger := utils.LoggerFromContext(ctx).WithValues("cs_shard_href", csShard.HREF(), "aks_resource_id", csShard.AzureShard().AksManagementClusterResourceId()) + + convertedManagementCluster, err := ocm.ConvertCSManagementClusterToInternal(csShard) + if err != nil { + return fmt.Errorf("failed to convert management cluster: %w", err) + } + + stampIdentifier := convertedManagementCluster.GetStampIdentifier() + + if err := c.ensureStamp(ctx, stampIdentifier); err != nil { + return fmt.Errorf("stamp %s: %w", stampIdentifier, err) + } + + managementClusterCRUD := c.fleetDBClient.Stamps().ManagementClusters(stampIdentifier) + + existing, err := c.managementClusterLister.Get(ctx, stampIdentifier) + if err != nil && !database.IsNotFoundError(err) { + return fmt.Errorf("management cluster %s: %w", stampIdentifier, err) + } + + if database.IsNotFoundError(err) { + // The lister cache may be stale (e.g. on startup). Check CosmosDB directly + // before attempting to create. 
+ existing, err = managementClusterCRUD.Get(ctx, fleet.ManagementClusterResourceName) + if err != nil && !database.IsNotFoundError(err) { + return fmt.Errorf("management cluster %s: %w", stampIdentifier, err) + } + } + + if database.IsNotFoundError(err) { + created, err := managementClusterCRUD.Create(ctx, convertedManagementCluster, nil) + if err != nil { + return fmt.Errorf("management cluster %s: %w", convertedManagementCluster.ResourceID, err) + } + logger.Info("created management cluster", "resource_id", created.ResourceID) + return nil + } + + logger = logger.WithValues("resource_id", existing.ResourceID) + managementClusterToWrite := existing.DeepCopy() + + // SchedulingPolicy is currently synced from Cluster Service provision shard + // status. This is a temporary arrangement during the CS-to-Cosmos migration. + // Once we populate management clusters via rollout pipelines and manage them + // via Geneva Action, this controller will be removed. + managementClusterToWrite.Spec.SchedulingPolicy = convertedManagementCluster.Spec.SchedulingPolicy + for _, cond := range convertedManagementCluster.Status.Conditions { + apimeta.SetStatusCondition(&managementClusterToWrite.Status.Conditions, cond) + } + if equality.Semantic.DeepEqual(existing, managementClusterToWrite) { + logger.V(1).Info("management cluster unchanged, skipping update") + return nil + } + if _, err = managementClusterCRUD.Replace(ctx, managementClusterToWrite, existing, nil); err != nil { + return fmt.Errorf("management cluster %s: %w", existing.ResourceID, err) + } + logger.Info("updated management cluster") + return nil +} + +// ensureStamp upserts the Stamp record. Stamps synced from Cluster Service +// are auto-approved since the provision shard already exists. 
+func (c *managementClusterMigrationController) ensureStamp(ctx context.Context, stampIdentifier string) error { + logger := utils.LoggerFromContext(ctx) + + existing, err := c.stampLister.Get(ctx, stampIdentifier) + if err != nil && !database.IsNotFoundError(err) { + return fmt.Errorf("stamp %s: %w", stampIdentifier, err) + } + + approvedCondition := metav1.Condition{ + Type: string(fleet.StampConditionApproved), + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.Now(), + Reason: string(fleet.StampConditionReasonAutoApproved), + Message: "Synced from Cluster Service provision shard", + } + + stampsCRUD := c.fleetDBClient.Stamps() + + if database.IsNotFoundError(err) { + // The lister cache may be stale (e.g. on startup). Check CosmosDB directly + // before attempting to create. + existing, err = stampsCRUD.Get(ctx, stampIdentifier) + if err != nil && !database.IsNotFoundError(err) { + return fmt.Errorf("stamp %s: %w", stampIdentifier, err) + } + } + + if database.IsNotFoundError(err) { + stampResourceID, err := fleet.ToStampResourceID(stampIdentifier) + if err != nil { + return fmt.Errorf("invalid stamp identifier %q: %w", stampIdentifier, err) + } + stamp := &fleet.Stamp{ + CosmosMetadata: api.CosmosMetadata{ + ResourceID: stampResourceID, + }, + ResourceID: stampResourceID, + } + apimeta.SetStatusCondition(&stamp.Status.Conditions, approvedCondition) + + created, err := stampsCRUD.Create(ctx, stamp, nil) + if err != nil { + return fmt.Errorf("stamp %s: %w", stampIdentifier, err) + } + logger.Info("created stamp", "stamp_identifier", stampIdentifier, "resource_id", created.CosmosMetadata.ResourceID) + return nil + } + + stampToWrite := existing.DeepCopy() + apimeta.SetStatusCondition(&stampToWrite.Status.Conditions, approvedCondition) + if equality.Semantic.DeepEqual(existing, stampToWrite) { + logger.V(1).Info("stamp unchanged, skipping update") + return nil + } + + if _, err := stampsCRUD.Replace(ctx, stampToWrite, existing, nil); err != nil { + 
return fmt.Errorf("stamp %s: %w", stampIdentifier, err) + } + logger.Info("updated stamp", "stamp_identifier", stampIdentifier) + return nil +} + +func (c *managementClusterMigrationController) Run(ctx context.Context, threadiness int) { + defer utilruntime.HandleCrash() + defer c.queue.ShutDown() + + ctx = utils.ContextWithControllerName(ctx, c.name) + logger := utils.LoggerFromContext(ctx) + logger = logger.WithValues(utils.LogValues{}.AddControllerName(c.name)...) + ctx = utils.ContextWithLogger(ctx, logger) + logger.Info("Starting") + + for i := 0; i < threadiness; i++ { + go wait.UntilWithContext(ctx, c.runWorker, time.Second) + } + + // We run this periodically enqueuing an arbitrary item named "doWork" to trigger the sync. + go wait.JitterUntilWithContext(ctx, func(ctx context.Context) { c.queue.Add("doWork") }, c.resyncDuration, 0.1, true) + + logger.Info("Started workers") + + <-ctx.Done() + logger.Info("Shutting down") +} + +func (c *managementClusterMigrationController) runWorker(ctx context.Context) { + for c.processNextWorkItem(ctx) { + } +} + +func (c *managementClusterMigrationController) processNextWorkItem(ctx context.Context) bool { + ref, shutdown := c.queue.Get() + if shutdown { + return false + } + defer c.queue.Done(ref) + + logger := utils.LoggerFromContext(ctx) + logger = controllerutils.AddLoggerValues(logger, ref) + ctx = utils.ContextWithLogger(ctx, logger) + + controllerutils.ReconcileTotal.WithLabelValues(c.name).Inc() + err := c.SyncOnce(ctx, ref) + if err == nil { + c.queue.Forget(ref) + return true + } + + utilruntime.HandleErrorWithContext(ctx, err, "Error syncing; requeuing for later retry", "objectReference", ref) + c.queue.AddRateLimited(ref) + + return true +} diff --git a/backend/pkg/controllers/managementclustercontrollers/management_cluster_migration_test.go b/backend/pkg/controllers/managementclustercontrollers/management_cluster_migration_test.go new file mode 100644 index 00000000000..45260390ee6 --- /dev/null +++ 
b/backend/pkg/controllers/managementclustercontrollers/management_cluster_migration_test.go
@@ -0,0 +1,402 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package managementclustercontrollers
+
+import (
+	"context"
+	"fmt"
+	"testing"
+
+	"github.com/go-logr/logr/testr"
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"go.uber.org/mock/gomock"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+
+	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
+
+	arohcpv1alpha1 "github.com/openshift-online/ocm-sdk-go/arohcp/v1alpha1"
+
+	"github.com/Azure/ARO-HCP/internal/api"
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
+	"github.com/Azure/ARO-HCP/internal/database"
+	dblisters "github.com/Azure/ARO-HCP/internal/database/listers"
+	"github.com/Azure/ARO-HCP/internal/database/listertesting"
+	"github.com/Azure/ARO-HCP/internal/databasetesting"
+	"github.com/Azure/ARO-HCP/internal/ocm"
+	"github.com/Azure/ARO-HCP/internal/utils"
+)
+
+const (
+	testSubscriptionID                                       = "00000000-0000-0000-0000-000000000000"
+	testResourceGroup                                        = "rg"
+	testDNSResourceGroup                                     = "dns-rg"
+	testDNSZone                                              = "test.example.com"
+	testHostedClustersSecretsKeyVaultURL                     = "https://cx-kv.vault.azure.net/"
+	testHostedClustersManagedIdentitiesKeyVaultURL           = "https://mi-kv.vault.azure.net/"
+	testHostedClustersSecretsKeyVaultManagedIdentityClientID = "c2bde1aa-d904-48cd-a728-9de33e3ddca9"
+	testMaestroRestURL                                       = "http://maestro.maestro.svc.cluster.local:8000"
+	testMaestroGRPCURL                                       = "maestro-grpc.maestro.svc.cluster.local:8090"
+
+	testShardID  = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
+	testShardID2 = "11111111-2222-3333-4444-555555555555"
+	testShardID3 = "22222222-3333-4444-5555-666666666666"
+)
+
+// testAKSResourceIDString returns the ARM resource ID string of the AKS managed
+// cluster named aksName in the test subscription and resource group.
+func testAKSResourceIDString(aksName string) string {
+	return fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ContainerService/managedClusters/%s", testSubscriptionID, testResourceGroup, aksName)
+}
+
+// testPublicDNSZoneResourceIDString returns the ARM resource ID string of the
+// test public DNS zone in the test subscription and DNS resource group.
+func testPublicDNSZoneResourceIDString() string {
+	return fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/dnszones/%s", testSubscriptionID, testDNSResourceGroup, testDNSZone)
+}
+
+// managementClusterCmpOptions are the cmp options used to compare
+// ManagementCluster documents in these tests. CosmosMetadata, ResourceID and
+// the conditions' LastTransitionTime are excluded from the comparison.
+//
+// api.CmpDiffOptions is capped with a full slice expression so that append
+// always copies into a fresh backing array and can never write into the
+// shared package-level slice.
+var managementClusterCmpOptions = append(
+	api.CmpDiffOptions[:len(api.CmpDiffOptions):len(api.CmpDiffOptions)],
+	cmpopts.IgnoreFields(fleet.ManagementCluster{}, "CosmosMetadata", "ResourceID"),
+	cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime"),
+)
+
+// assertManagementClusterEqual reports a test error carrying the cmp diff when
+// got differs from expected under managementClusterCmpOptions.
+func assertManagementClusterEqual(t *testing.T, expected, got *fleet.ManagementCluster) {
+	t.Helper()
+	if diff := cmp.Diff(expected, got, managementClusterCmpOptions...); diff != "" {
+		t.Errorf("ManagementCluster mismatch (-expected +got):\n%s", diff)
+	}
+}
+
+// testProvisionShardHREF returns the HREF used in these tests for the
+// provision shard with the given ID.
+func testProvisionShardHREF(shardID string) string {
+	return "/api/aro_hcp/v1alpha1/provision_shards/" + shardID
+}
+
+// buildExpectedManagementCluster returns the ManagementCluster the tests expect
+// for a shard built by buildTestProvisionShard with the same shardID,
+// consumerName and aksName: schedulable, with a Ready=True condition.
+func buildExpectedManagementCluster(t *testing.T, shardID, consumerName, aksName string) *fleet.ManagementCluster {
+	t.Helper()
+	aksResourceID := api.Must(azcorearm.ParseResourceID(testAKSResourceIDString(aksName)))
+	publicDNSZoneResourceID := api.Must(azcorearm.ParseResourceID(testPublicDNSZoneResourceIDString()))
+	return &fleet.ManagementCluster{
+		Spec: fleet.ManagementClusterSpec{
+			SchedulingPolicy: fleet.ManagementClusterSchedulingPolicySchedulable,
+		},
+		Status: fleet.ManagementClusterStatus{
+			AKSResourceID:                                        aksResourceID,
+			PublicDNSZoneResourceID:                              publicDNSZoneResourceID,
+			HostedClustersSecretsKeyVaultURL:                     testHostedClustersSecretsKeyVaultURL,
+			HostedClustersManagedIdentitiesKeyVaultURL:           testHostedClustersManagedIdentitiesKeyVaultURL,
+			HostedClustersSecretsKeyVaultManagedIdentityClientID: testHostedClustersSecretsKeyVaultManagedIdentityClientID,
+			ClusterServiceProvisionShardID:                       ptr.To(api.Must(api.NewInternalID(testProvisionShardHREF(shardID)))),
+			MaestroConsumerName:                                  consumerName,
+			MaestroRESTAPIURL:                                    testMaestroRestURL,
+			MaestroGRPCTarget:                                    testMaestroGRPCURL,
+			Conditions: []metav1.Condition{
+				{
+					Type:   string(fleet.ManagementClusterConditionReady),
+					Status: metav1.ConditionTrue,
+					Reason: string(fleet.ManagementClusterConditionReasonProvisionShardActive),
+				},
+			},
+		},
+	}
+}
+
+// getManagementCluster reads the ManagementCluster document stored under the
+// given stamp identifier and fails the test immediately if the read errors.
+func getManagementCluster(t *testing.T, ctx context.Context, client database.FleetDBClient, stampIdentifier string) *fleet.ManagementCluster {
+	t.Helper()
+	mc, err := client.Stamps().ManagementClusters(stampIdentifier).Get(ctx, fleet.ManagementClusterResourceName)
+	require.NoError(t, err)
+	return mc
+}
+
+// buildTestProvisionShard builds a Cluster Service ProvisionShard with the given
+// ID, Maestro consumer name, AKS cluster name and status, filling every other
+// field from the test constants. The test fails if the builder returns an error.
+func buildTestProvisionShard(t *testing.T, shardID, consumerName, aksName, status string) *arohcpv1alpha1.ProvisionShard {
+	t.Helper()
+	shard, err := arohcpv1alpha1.NewProvisionShard().
+		ID(shardID).
+		HREF(testProvisionShardHREF(shardID)).
+		Status(status).
+		Topology("shared").
+		AzureShard(arohcpv1alpha1.NewAzureShard().
+			AksManagementClusterResourceId(testAKSResourceIDString(aksName)).
+			PublicDnsZoneResourceId(testPublicDNSZoneResourceIDString()).
+			CxSecretsKeyVaultUrl(testHostedClustersSecretsKeyVaultURL).
+			CxManagedIdentitiesKeyVaultUrl(testHostedClustersManagedIdentitiesKeyVaultURL).
+			CxSecretsKeyVaultManagedIdentityClientId(testHostedClustersSecretsKeyVaultManagedIdentityClientID),
+		).
+		MaestroConfig(
+			arohcpv1alpha1.NewProvisionShardMaestroConfig().
+				ConsumerName(consumerName).
+				RestApiConfig(arohcpv1alpha1.NewProvisionShardMaestroRestApiConfig().
+					Url(testMaestroRestURL)).
+				GrpcApiConfig(arohcpv1alpha1.NewProvisionShardMaestroGrpcApiConfig().
+					Url(testMaestroGRPCURL)),
+		).
+		Build()
+	require.NoError(t, err)
+	return shard
+}
+
+// errorManagementClusterLister embeds SliceManagementClusterLister but overrides
+// Get so that every lookup fails with err.
+type errorManagementClusterLister struct {
+	listertesting.SliceManagementClusterLister
+	err error
+}
+
+// Get ignores its arguments and always returns the configured error.
+func (e *errorManagementClusterLister) Get(_ context.Context, _ string) (*fleet.ManagementCluster, error) {
+	return nil, e.err
+}
+
+// TestSyncOnce is a table-driven test of managementClusterMigrationController.SyncOnce.
+// Each case builds its own mocked Cluster Service client, fleet DB client and listers in
+// setup, and may inspect the fleet DB afterwards in validate.
+func TestSyncOnce(t *testing.T) {
+	tests := []struct {
+		name                string
+		setup               func(t *testing.T, ctrl *gomock.Controller) (ocm.ClusterServiceClientSpec, *databasetesting.MockFleetDBClient, dblisters.StampLister, dblisters.ManagementClusterLister)
+		expectedErrorSubstr string
+		validate            func(t *testing.T, ctx context.Context, client *databasetesting.MockFleetDBClient)
+	}{
+		{
+			name: "no shards syncs successfully",
+			setup: func(t *testing.T, ctrl *gomock.Controller) (ocm.ClusterServiceClientSpec, *databasetesting.MockFleetDBClient, dblisters.StampLister, dblisters.ManagementClusterLister) {
+				t.Helper()
+				mockCS := ocm.NewMockClusterServiceClientSpec(ctrl)
+				mockCS.EXPECT().ListProvisionShards().Return(ocm.NewSimpleProvisionShardListIterator(nil, nil))
+				return mockCS, databasetesting.NewMockFleetDBClient(), &listertesting.SliceStampLister{}, &listertesting.SliceManagementClusterLister{}
+			},
+		},
+		{
+			name: "CS list error is propagated",
+			setup: func(t *testing.T, ctrl *gomock.Controller) (ocm.ClusterServiceClientSpec, *databasetesting.MockFleetDBClient, dblisters.StampLister, dblisters.ManagementClusterLister) {
+				t.Helper()
+				mockCS := ocm.NewMockClusterServiceClientSpec(ctrl)
+				mockCS.EXPECT().ListProvisionShards().Return(ocm.NewSimpleProvisionShardListIterator(nil, fmt.Errorf("list failed")))
+				return mockCS, databasetesting.NewMockFleetDBClient(), &listertesting.SliceStampLister{}, &listertesting.SliceManagementClusterLister{}
+			},
+			expectedErrorSubstr: "list failed",
+		},
+		{
+			name: "new management cluster is created",
+			setup: func(t *testing.T, ctrl *gomock.Controller) (ocm.ClusterServiceClientSpec, *databasetesting.MockFleetDBClient, dblisters.StampLister, dblisters.ManagementClusterLister) {
+				t.Helper()
+				mockCS := ocm.NewMockClusterServiceClientSpec(ctrl)
+				shard := buildTestProvisionShard(t, testShardID, "test-consumer", "test-westus3-mgmt-1", "active")
+				mockCS.EXPECT().ListProvisionShards().Return(
+					ocm.NewSimpleProvisionShardListIterator([]*arohcpv1alpha1.ProvisionShard{shard}, nil),
+				)
+				return mockCS, databasetesting.NewMockFleetDBClient(), &listertesting.SliceStampLister{}, &listertesting.SliceManagementClusterLister{}
+			},
+			validate: func(t *testing.T, ctx context.Context, client *databasetesting.MockFleetDBClient) {
+				t.Helper()
+				doc := getManagementCluster(t, ctx, client, "1")
+				assert.Equal(t, fleet.ManagementClusterResourceName, doc.ResourceID.Name, "resource name should be 'default'")
+				assert.Equal(t, "1", doc.ResourceID.Parent.Name, "parent name should be the stamp identifier")
+				expected := buildExpectedManagementCluster(t, testShardID, "test-consumer", "test-westus3-mgmt-1")
+				assertManagementClusterEqual(t, expected, doc)
+			},
+		},
+		{
+			name: "existing management cluster is updated when SchedulingPolicy changed",
+			setup: func(t *testing.T, ctrl *gomock.Controller) (ocm.ClusterServiceClientSpec, *databasetesting.MockFleetDBClient, dblisters.StampLister, dblisters.ManagementClusterLister) {
+				t.Helper()
+				mockCS := ocm.NewMockClusterServiceClientSpec(ctrl)
+				fleetClient := databasetesting.NewMockFleetDBClient()
+
+				shard := buildTestProvisionShard(t, testShardID, "test-consumer", "test-westus3-mgmt-1", "active")
+				existing, err := ocm.ConvertCSManagementClusterToInternal(shard)
+				require.NoError(t, err)
+				existing.Spec.SchedulingPolicy = fleet.ManagementClusterSchedulingPolicyUnschedulable
+				_, err = fleetClient.Stamps().ManagementClusters("1").Create(t.Context(), existing, nil)
+				require.NoError(t, err)
+
+				mockCS.EXPECT().ListProvisionShards().Return(
+					ocm.NewSimpleProvisionShardListIterator([]*arohcpv1alpha1.ProvisionShard{shard}, nil),
+				)
+				return mockCS, fleetClient, &listertesting.SliceStampLister{}, &listertesting.SliceManagementClusterLister{ManagementClusters: []*fleet.ManagementCluster{existing}}
+			},
+			validate: func(t *testing.T, ctx context.Context, client *databasetesting.MockFleetDBClient) {
+				t.Helper()
+				doc := getManagementCluster(t, ctx, client, "1")
+				expected := buildExpectedManagementCluster(t, testShardID, "test-consumer", "test-westus3-mgmt-1")
+				assertManagementClusterEqual(t, expected, doc)
+			},
+		},
+		{
+			name: "unchanged management cluster is not replaced",
+			setup: func(t *testing.T, ctrl *gomock.Controller) (ocm.ClusterServiceClientSpec, *databasetesting.MockFleetDBClient, dblisters.StampLister, dblisters.ManagementClusterLister) {
+				t.Helper()
+				mockCS := ocm.NewMockClusterServiceClientSpec(ctrl)
+				fleetClient := databasetesting.NewMockFleetDBClient()
+
+				shard := buildTestProvisionShard(t, testShardID, "test-consumer", "test-westus3-mgmt-1", "active")
+				existing, err := ocm.ConvertCSManagementClusterToInternal(shard)
+				require.NoError(t, err)
+
+				mockCS.EXPECT().ListProvisionShards().Return(
+					ocm.NewSimpleProvisionShardListIterator([]*arohcpv1alpha1.ProvisionShard{shard}, nil),
+				)
+				return mockCS, fleetClient, &listertesting.SliceStampLister{}, &listertesting.SliceManagementClusterLister{ManagementClusters: []*fleet.ManagementCluster{existing}}
+			},
+		},
+		{
+			name: "conversion error is collected and reported",
+			setup: func(t *testing.T, ctrl *gomock.Controller) (ocm.ClusterServiceClientSpec, *databasetesting.MockFleetDBClient, dblisters.StampLister, dblisters.ManagementClusterLister) {
+				t.Helper()
+				mockCS := ocm.NewMockClusterServiceClientSpec(ctrl)
+				badShard, err := arohcpv1alpha1.NewProvisionShard().
+					ID("bad-shard").
+					AzureShard(arohcpv1alpha1.NewAzureShard().
+						AksManagementClusterResourceId(testAKSResourceIDString("test-westus3-mgmt-bad")).
+						PublicDnsZoneResourceId(testPublicDNSZoneResourceIDString()).
+						CxSecretsKeyVaultUrl(testHostedClustersSecretsKeyVaultURL).
+						CxManagedIdentitiesKeyVaultUrl(testHostedClustersManagedIdentitiesKeyVaultURL).
+						CxSecretsKeyVaultManagedIdentityClientId(testHostedClustersSecretsKeyVaultManagedIdentityClientID),
+					).
+					Build()
+				require.NoError(t, err)
+				mockCS.EXPECT().ListProvisionShards().Return(
+					ocm.NewSimpleProvisionShardListIterator([]*arohcpv1alpha1.ProvisionShard{badShard}, nil),
+				)
+				return mockCS, databasetesting.NewMockFleetDBClient(), &listertesting.SliceStampLister{}, &listertesting.SliceManagementClusterLister{}
+			},
+			expectedErrorSubstr: "provision shard has empty HREF",
+		},
+		{
+			name: "multiple shards are all created",
+			setup: func(t *testing.T, ctrl *gomock.Controller) (ocm.ClusterServiceClientSpec, *databasetesting.MockFleetDBClient, dblisters.StampLister, dblisters.ManagementClusterLister) {
+				t.Helper()
+				mockCS := ocm.NewMockClusterServiceClientSpec(ctrl)
+
+				shard1 := buildTestProvisionShard(t, testShardID2, "consumer-1", "test-westus3-mgmt-1", "active")
+				shard2 := buildTestProvisionShard(t, testShardID3, "consumer-2", "test-eastus-mgmt-2", "active")
+
+				mockCS.EXPECT().ListProvisionShards().Return(
+					ocm.NewSimpleProvisionShardListIterator([]*arohcpv1alpha1.ProvisionShard{shard1, shard2}, nil),
+				)
+				return mockCS, databasetesting.NewMockFleetDBClient(), &listertesting.SliceStampLister{}, &listertesting.SliceManagementClusterLister{}
+			},
+			validate: func(t *testing.T, ctx context.Context, client *databasetesting.MockFleetDBClient) {
+				t.Helper()
+				doc1 := getManagementCluster(t, ctx, client, "1")
+				expected1 := buildExpectedManagementCluster(t, testShardID2, "consumer-1", "test-westus3-mgmt-1")
+				assertManagementClusterEqual(t, expected1, doc1)
+
+				doc2 := getManagementCluster(t, ctx, client, "2")
+				expected2 := buildExpectedManagementCluster(t, testShardID3, "consumer-2", "test-eastus-mgmt-2")
+				assertManagementClusterEqual(t, expected2, doc2)
+			},
+		},
+		{
+			name: "existing cluster condition transitions from Ready=False to Ready=True",
+			setup: func(t *testing.T, ctrl *gomock.Controller) (ocm.ClusterServiceClientSpec, *databasetesting.MockFleetDBClient, dblisters.StampLister, dblisters.ManagementClusterLister) {
+				t.Helper()
+				mockCS := ocm.NewMockClusterServiceClientSpec(ctrl)
+				fleetClient := databasetesting.NewMockFleetDBClient()
+
+				maintenanceShard := buildTestProvisionShard(t, testShardID, "test-consumer", "test-westus3-mgmt-1", "maintenance")
+				existing, err := ocm.ConvertCSManagementClusterToInternal(maintenanceShard)
+				require.NoError(t, err)
+				_, err = fleetClient.Stamps().ManagementClusters("1").Create(t.Context(), existing, nil)
+				require.NoError(t, err)
+
+				activeShard := buildTestProvisionShard(t, testShardID, "test-consumer", "test-westus3-mgmt-1", "active")
+				mockCS.EXPECT().ListProvisionShards().Return(
+					ocm.NewSimpleProvisionShardListIterator([]*arohcpv1alpha1.ProvisionShard{activeShard}, nil),
+				)
+				return mockCS, fleetClient, &listertesting.SliceStampLister{}, &listertesting.SliceManagementClusterLister{ManagementClusters: []*fleet.ManagementCluster{existing}}
+			},
+			validate: func(t *testing.T, ctx context.Context, client *databasetesting.MockFleetDBClient) {
+				t.Helper()
+				doc := getManagementCluster(t, ctx, client, "1")
+				assert.Equal(t, fleet.ManagementClusterSchedulingPolicySchedulable, doc.Spec.SchedulingPolicy, "should be schedulable")
+				var readyCond *metav1.Condition
+				for i := range doc.Status.Conditions {
+					if doc.Status.Conditions[i].Type == string(fleet.ManagementClusterConditionReady) {
+						readyCond = &doc.Status.Conditions[i]
+						break
+					}
+				}
+				require.NotNil(t, readyCond, "Ready condition must exist")
+				assert.Equal(t, metav1.ConditionTrue, readyCond.Status, "Ready condition should be True")
+				assert.Equal(t, string(fleet.ManagementClusterConditionReasonProvisionShardActive), readyCond.Reason)
+			},
+		},
+		{
+			name: "lister Get non-404 error is collected and reported",
+			setup: func(t *testing.T, ctrl *gomock.Controller) (ocm.ClusterServiceClientSpec, *databasetesting.MockFleetDBClient, dblisters.StampLister, dblisters.ManagementClusterLister) {
+				t.Helper()
+				mockCS := ocm.NewMockClusterServiceClientSpec(ctrl)
+				shard := buildTestProvisionShard(t, testShardID, "test-consumer", "test-westus3-mgmt-1", "active")
+				mockCS.EXPECT().ListProvisionShards().Return(
+					ocm.NewSimpleProvisionShardListIterator([]*arohcpv1alpha1.ProvisionShard{shard}, nil),
+				)
+				return mockCS, databasetesting.NewMockFleetDBClient(), &listertesting.SliceStampLister{}, &errorManagementClusterLister{err: fmt.Errorf("lister internal error")}
+			},
+			expectedErrorSubstr: "lister internal error",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			ctx := utils.ContextWithLogger(t.Context(), testr.New(t))
+			ctrl := gomock.NewController(t)
+			cs, fleetClient, sLister, mcLister := tt.setup(t, ctrl)
+
+			c := &managementClusterMigrationController{
+				name:                    "test",
+				clusterServiceClient:    cs,
+				fleetDBClient:           fleetClient,
+				stampLister:             sLister,
+				managementClusterLister: mcLister,
+			}
+
+			err := c.SyncOnce(ctx, nil)
+			if len(tt.expectedErrorSubstr) > 0 {
+				require.Error(t, err)
+				assert.Contains(t, err.Error(), tt.expectedErrorSubstr)
+			} else {
+				require.NoError(t, err)
+			}
+			if tt.validate != nil {
+				tt.validate(t, ctx, fleetClient)
+			}
+		})
+	}
+}
diff --git a/backend/pkg/controllers/managementclustercontrollers/management_cluster_placement_sync.go b/backend/pkg/controllers/managementclustercontrollers/management_cluster_placement_sync.go new file mode 100644 index 00000000000..69b6bef9c1f --- /dev/null +++ b/backend/pkg/controllers/managementclustercontrollers/management_cluster_placement_sync.go @@ -0,0 +1,169 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package managementclustercontrollers
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils"
+	"github.com/Azure/ARO-HCP/backend/pkg/informers"
+	"github.com/Azure/ARO-HCP/backend/pkg/listers"
+	"github.com/Azure/ARO-HCP/internal/api"
+	"github.com/Azure/ARO-HCP/internal/database"
+	dblisters "github.com/Azure/ARO-HCP/internal/database/listers"
+	"github.com/Azure/ARO-HCP/internal/ocm"
+	"github.com/Azure/ARO-HCP/internal/utils"
+)
+
+// managementClusterPlacementSyncer is the per-cluster syncer that works out which
+// management cluster an HCP runs on and records it as the ManagementClusterResourceID
+// of that cluster's ServiceProviderCluster document.
+type managementClusterPlacementSyncer struct {
+	// cooldownChecker is the value handed out by CooldownChecker.
+	cooldownChecker controllerutils.CooldownChecker
+
+	// Listers read by SyncOnce.
+	clusterLister                listers.ClusterLister
+	serviceProviderClusterLister listers.ServiceProviderClusterLister
+	managementClusterLister      dblisters.ManagementClusterLister
+
+	// Clients for the live ServiceProviderCluster read/replace and for the
+	// Cluster Service provision shard lookup.
+	cosmosClient         database.ResourcesDBClient
+	clusterServiceClient ocm.ClusterServiceClientSpec
+}
+
+// Compile-time check that the syncer satisfies controllerutils.ClusterSyncer.
+var _ controllerutils.ClusterSyncer = (*managementClusterPlacementSyncer)(nil)
+
+// NewManagementClusterPlacementSyncController creates a new controller that syncs the
+// management cluster placement from Cluster Service into the ServiceProviderCluster document.
+func NewManagementClusterPlacementSyncController(
+	cosmosClient database.ResourcesDBClient,
+	clusterServiceClient ocm.ClusterServiceClientSpec,
+	activeOperationLister listers.ActiveOperationLister,
+	managementClusterLister dblisters.ManagementClusterLister,
+	informers informers.BackendInformers,
+) controllerutils.Controller {
+	// Check every 5 minutes.
+	const resyncInterval = 5 * time.Minute
+
+	_, clusters := informers.Clusters()
+	_, serviceProviderClusters := informers.ServiceProviderClusters()
+
+	placementSyncer := &managementClusterPlacementSyncer{
+		cooldownChecker:              controllerutils.DefaultActiveOperationPrioritizingCooldown(activeOperationLister),
+		serviceProviderClusterLister: serviceProviderClusters,
+		clusterLister:                clusters,
+		managementClusterLister:      managementClusterLister,
+		cosmosClient:                 cosmosClient,
+		clusterServiceClient:         clusterServiceClient,
+	}
+
+	return controllerutils.NewClusterWatchingController(
+		"ManagementClusterPlacementSync",
+		cosmosClient,
+		informers,
+		resyncInterval,
+		placementSyncer,
+	)
+}
+
+// CooldownChecker returns the cooldown checker the syncer was constructed with.
+func (c *managementClusterPlacementSyncer) CooldownChecker() controllerutils.CooldownChecker {
+	return c.cooldownChecker
+}
+
+// needsWork reports whether spc has no ManagementClusterResourceID recorded yet,
+// i.e. whether the placement still has to be resolved.
+func (c *managementClusterPlacementSyncer) needsWork(spc *api.ServiceProviderCluster) bool {
+	alreadyResolved := spc.Status.ManagementClusterResourceID != nil
+	return !alreadyResolved
+}
+
+// SyncOnce resolves the management cluster placement for a single HCP cluster.
+// It fetches the provision shard from Cluster Service, resolves it to a ManagementCluster
+// document in CosmosDB, and sets ManagementClusterResourceID on the ServiceProviderCluster.
+//
+// Every "nothing to do yet" condition (document missing from the cache or from Cosmos,
+// placement already recorded, no Cluster Service ID, provision shard without an HREF)
+// is logged at V(1) and returns nil. Lookup, parse and write failures, as well as a
+// resolved ManagementCluster that carries no ResourceID, are returned as errors wrapped
+// with utils.TrackError.
+func (c *managementClusterPlacementSyncer) SyncOnce(ctx context.Context, key controllerutils.HCPClusterKey) error {
+	logger := utils.LoggerFromContext(ctx)
+
+	// do the super cheap cache check first
+	cachedSPC, err := c.serviceProviderClusterLister.Get(ctx, key.SubscriptionID, key.ResourceGroupName, key.HCPClusterName)
+	if database.IsNotFoundError(err) {
+		logger.V(1).Info("ServiceProviderCluster not found in cache, skipping")
+		return nil
+	}
+	if err != nil {
+		return utils.TrackError(fmt.Errorf("failed to get ServiceProviderCluster from cache: %w", err))
+	}
+	if !c.needsWork(cachedSPC) {
+		logger.V(1).Info("ServiceProviderCluster already has ManagementClusterResourceID, skipping")
+		return nil
+	}
+
+	// Get the cluster from cache to check if it has a CS ID to query
+	cachedCluster, err := c.clusterLister.Get(ctx, key.SubscriptionID, key.ResourceGroupName, key.HCPClusterName)
+	if database.IsNotFoundError(err) {
+		logger.V(1).Info("Cluster not found in cache, skipping")
+		return nil
+	}
+	if err != nil {
+		return utils.TrackError(fmt.Errorf("failed to get cluster from cache: %w", err))
+	}
+	clusterServiceID := cachedCluster.ServiceProviderProperties.ClusterServiceID
+	if clusterServiceID == nil || len(clusterServiceID.String()) == 0 {
+		logger.V(1).Info("Cluster has no ClusterServiceID, skipping")
+		return nil
+	}
+
+	// Get the ServiceProviderCluster from Cosmos (live read)
+	spcCRUD := c.cosmosClient.ServiceProviderClusters(key.SubscriptionID, key.ResourceGroupName, key.HCPClusterName)
+	existingSPC, err := spcCRUD.Get(ctx, api.ServiceProviderClusterResourceName)
+	if database.IsNotFoundError(err) {
+		logger.V(1).Info("ServiceProviderCluster not found in Cosmos, skipping")
+		return nil
+	}
+	if err != nil {
+		return utils.TrackError(fmt.Errorf("failed to get ServiceProviderCluster: %w", err))
+	}
+	// check if we need to do work again. Sometimes the live data is more fresh than the cache
+	if !c.needsWork(existingSPC) {
+		logger.V(1).Info("ServiceProviderCluster already has ManagementClusterResourceID (live read), skipping")
+		return nil
+	}
+
+	// Get the provision shard from Cluster Service via the dedicated endpoint.
+	csShard, err := c.clusterServiceClient.GetClusterProvisionShard(ctx, *clusterServiceID)
+	if err != nil {
+		return utils.TrackError(fmt.Errorf("failed to get provision shard from Cluster Service: %w", err))
+	}
+
+	if len(csShard.HREF()) == 0 {
+		logger.V(1).Info("Provision shard not yet allocated by Cluster Service, skipping")
+		return nil
+	}
+	provisionShardID, err := api.NewInternalID(csShard.HREF())
+	if err != nil {
+		return utils.TrackError(fmt.Errorf("failed to parse provision shard href: %w", err))
+	}
+
+	// Resolve the provision shard to a management cluster in CosmosDB
+	managementCluster, err := c.managementClusterLister.GetByCSProvisionShardID(ctx, provisionShardID.ID())
+	if err != nil {
+		return utils.TrackError(fmt.Errorf("failed to resolve provision shard %q to management cluster: %w", provisionShardID.Path(), err))
+	}
+	// Without a ResourceID there is nothing to record: writing nil would leave needsWork
+	// true, and the log call below would dereference a nil resource ID.
+	if managementCluster == nil || managementCluster.ResourceID == nil {
+		return utils.TrackError(fmt.Errorf("management cluster for provision shard %q has no resource ID", provisionShardID.Path()))
+	}
+
+	// Set the ManagementClusterResourceID on the ServiceProviderCluster
+	existingSPC.Status.ManagementClusterResourceID = managementCluster.ResourceID
+
+	if _, err := spcCRUD.Replace(ctx, existingSPC, nil); err != nil {
+		return utils.TrackError(fmt.Errorf("failed to update ServiceProviderCluster: %w", err))
+	}
+
+	logger.Info("synced management cluster placement",
+		"managementClusterID", managementCluster.ResourceID.String(),
+	)
+	return nil
+}
diff --git a/backend/pkg/controllers/managementclustercontrollers/management_cluster_placement_sync_test.go b/backend/pkg/controllers/managementclustercontrollers/management_cluster_placement_sync_test.go new file mode 100644 index 00000000000..fd42690fb7d --- /dev/null +++
b/backend/pkg/controllers/managementclustercontrollers/management_cluster_placement_sync_test.go
@@ -0,0 +1,350 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package managementclustercontrollers
+
+import (
+	"context"
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"go.uber.org/mock/gomock"
+
+	"k8s.io/utils/ptr"
+
+	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
+
+	arohcpv1alpha1 "github.com/openshift-online/ocm-sdk-go/arohcp/v1alpha1"
+
+	"github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils"
+	"github.com/Azure/ARO-HCP/backend/pkg/listertesting"
+	"github.com/Azure/ARO-HCP/internal/api"
+	"github.com/Azure/ARO-HCP/internal/api/arm"
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
+	dblistertesting "github.com/Azure/ARO-HCP/internal/database/listertesting"
+	"github.com/Azure/ARO-HCP/internal/databasetesting"
+	"github.com/Azure/ARO-HCP/internal/ocm"
+)
+
+const (
+	testClusterSubscriptionID = "00000000-0000-0000-0000-000000000000"
+	testClusterResourceGroup  = "test-rg"
+	testClusterName           = "test-cluster"
+	testClusterServiceIDStr   = "/api/clusters_mgmt/v1/clusters/abc123"
+	testProvisionShardIDStr   = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
+	testMgmtClusterName       = "mc1"
+)
+
+// testClusterResourceID returns the parsed ARM resource ID of the test HCP cluster,
+// built from the test subscription, resource group and cluster name constants.
+func testClusterResourceID() *azcorearm.ResourceID {
+	return api.Must(azcorearm.ParseResourceID(
+		"/subscriptions/" + testClusterSubscriptionID +
+			"/resourceGroups/" + testClusterResourceGroup +
+			"/providers/Microsoft.RedHatOpenShift/hcpOpenShiftClusters/" + testClusterName,
+	))
+}
+
+// testMgmtClusterResourceID returns the resource ID that
+// fleet.ToManagementClusterResourceID produces for testMgmtClusterName.
+func testMgmtClusterResourceID() *azcorearm.ResourceID {
+	return api.Must(fleet.ToManagementClusterResourceID(testMgmtClusterName))
+}
+
+// newTestHCPCluster builds an HCP cluster fixture identified by testClusterResourceID
+// whose ClusterServiceID is parsed from testClusterServiceIDStr. Each opt is applied
+// to the fixture, in order, before it is returned.
+func newTestHCPCluster(opts ...func(*api.HCPOpenShiftCluster)) *api.HCPOpenShiftCluster {
+	resourceID := testClusterResourceID()
+	clusterServiceID := api.Must(api.NewInternalID(testClusterServiceIDStr))
+
+	cluster := &api.HCPOpenShiftCluster{
+		TrackedResource: arm.TrackedResource{
+			Resource: arm.Resource{
+				ID:   resourceID,
+				Name: testClusterName,
+				Type: resourceID.ResourceType.String(),
+			},
+		},
+		ServiceProviderProperties: api.HCPOpenShiftClusterServiceProviderProperties{
+			ClusterServiceID: &clusterServiceID,
+		},
+	}
+	for _, opt := range opts {
+		opt(cluster)
+	}
+	return cluster
+}
+
+// newTestSPC builds a ServiceProviderCluster fixture whose resource ID is nested
+// under the test HCP cluster and which has no ManagementClusterResourceID set.
+// Each opt is applied to the fixture, in order, before it is returned.
+func newTestSPC(opts ...func(*api.ServiceProviderCluster)) *api.ServiceProviderCluster {
+	clusterResourceID := testClusterResourceID()
+	spcResourceID := api.Must(azcorearm.ParseResourceID(
+		clusterResourceID.String() + "/" + api.ServiceProviderClusterResourceTypeName + "/" + api.ServiceProviderClusterResourceName,
+	))
+
+	spc := &api.ServiceProviderCluster{
+		CosmosMetadata: api.CosmosMetadata{
+			ResourceID: spcResourceID,
+		},
+	}
+	for _, opt := range opts {
+		opt(spc)
+	}
+	return spc
+}
+
+// newTestManagementCluster builds a ManagementCluster fixture identified by
+// testMgmtClusterResourceID whose provision shard ID is derived from
+// testProvisionShardIDStr.
+func newTestManagementCluster() *fleet.ManagementCluster {
+	resourceID := testMgmtClusterResourceID()
+	return &fleet.ManagementCluster{
+		CosmosMetadata: api.CosmosMetadata{
+			ResourceID: resourceID,
+		},
+		ResourceID: resourceID,
+		Status: fleet.ManagementClusterStatus{
+			ClusterServiceProvisionShardID: ptr.To(api.Must(api.NewInternalID(testProvisionShardHREF(testProvisionShardIDStr)))),
+		},
+	}
+}
+
+// alwaysSyncCooldownChecker always allows syncing
+type alwaysSyncCooldownChecker struct{}
+
+// CanSync reports true for every key.
+func (c *alwaysSyncCooldownChecker) CanSync(ctx context.Context, key any) bool {
+	return true
+}
+
+// TestManagementClusterPlacementSyncer_SyncOnce is a table-driven test of SyncOnce.
+// Each case seeds the mock resources DB and the listers, runs one sync, and then checks
+// the returned error and the ManagementClusterResourceID stored on the SPC document.
+func TestManagementClusterPlacementSyncer_SyncOnce(t *testing.T) {
+	testCases := []struct {
+		name                                string
+		cachedSPC                           *api.ServiceProviderCluster // SPC in cache, nil means use same as existingSPC
+		existingSPC                         *api.ServiceProviderCluster // SPC in cosmos
+		cachedCluster                       *api.HCPOpenShiftCluster    // cluster in cache
+		csShard                             *arohcpv1alpha1.ProvisionShard
+		csError                             error
+		managementClusters                  []*fleet.ManagementCluster
+		expectCSCall                        bool
+		expectError                         bool
+		expectedManagementClusterResourceID string // empty means nil
+	}{
+		{
+			name: "cache indicates no work needed - ManagementClusterResourceID already set",
+			cachedSPC: newTestSPC(func(spc *api.ServiceProviderCluster) {
+				spc.Status.ManagementClusterResourceID = testMgmtClusterResourceID()
+			}),
+			existingSPC: newTestSPC(func(spc *api.ServiceProviderCluster) {
+				spc.Status.ManagementClusterResourceID = testMgmtClusterResourceID()
+			}),
+			cachedCluster:                       newTestHCPCluster(),
+			expectCSCall:                        false,
+			expectError:                         false,
+			expectedManagementClusterResourceID: testMgmtClusterResourceID().String(),
+		},
+		{
+			name:      "cache says work needed but live data has ManagementClusterResourceID",
+			cachedSPC: newTestSPC(), // cache has no ManagementClusterResourceID
+			existingSPC: newTestSPC(func(spc *api.ServiceProviderCluster) {
+				// cosmos has it (cache is stale)
+				spc.Status.ManagementClusterResourceID = testMgmtClusterResourceID()
+			}),
+			cachedCluster:                       newTestHCPCluster(),
+			expectCSCall:                        false,
+			expectError:                         false,
+			expectedManagementClusterResourceID: testMgmtClusterResourceID().String(),
+		},
+		{
+			name:        "no cluster service ID - skip",
+			cachedSPC:   newTestSPC(),
+			existingSPC: newTestSPC(),
+			cachedCluster: newTestHCPCluster(func(c *api.HCPOpenShiftCluster) {
+				c.ServiceProviderProperties.ClusterServiceID = &api.InternalID{}
+			}),
+			expectCSCall:                        false,
+			expectError:                         false,
+			expectedManagementClusterResourceID: "",
+		},
+		{
+			name:                                "provision shard not allocated yet - skip",
+			cachedSPC:                           newTestSPC(),
+			existingSPC:                         newTestSPC(),
+			cachedCluster:                       newTestHCPCluster(),
+			csShard:                             api.Must(arohcpv1alpha1.NewProvisionShard().Build()),
+			managementClusters:                  []*fleet.ManagementCluster{newTestManagementCluster()},
+			expectCSCall:                        true,
+			expectError:                         false,
+			expectedManagementClusterResourceID: "",
+		},
+		{
+			name:          "success - resolves provision shard to management cluster",
+			cachedSPC:     newTestSPC(),
+			existingSPC:   newTestSPC(),
+			cachedCluster: newTestHCPCluster(),
+			csShard: api.Must(arohcpv1alpha1.NewProvisionShard().
+				HREF(testProvisionShardHREF(testProvisionShardIDStr)).
+				Build()),
+			managementClusters:                  []*fleet.ManagementCluster{newTestManagementCluster()},
+			expectCSCall:                        true,
+			expectError:                         false,
+			expectedManagementClusterResourceID: testMgmtClusterResourceID().String(),
+		},
+		{
+			name:                                "error - CS call fails",
+			cachedSPC:                           newTestSPC(),
+			existingSPC:                         newTestSPC(),
+			cachedCluster:                       newTestHCPCluster(),
+			csError:                             fmt.Errorf("connection refused"),
+			managementClusters:                  []*fleet.ManagementCluster{newTestManagementCluster()},
+			expectCSCall:                        true,
+			expectError:                         true,
+			expectedManagementClusterResourceID: "",
+		},
+		{
+			name:          "error - invalid provision shard HREF",
+			cachedSPC:     newTestSPC(),
+			existingSPC:   newTestSPC(),
+			cachedCluster: newTestHCPCluster(),
+			csShard: api.Must(arohcpv1alpha1.NewProvisionShard().
+				HREF("unknown-shard-id").
+				Build()),
+			managementClusters:                  []*fleet.ManagementCluster{newTestManagementCluster()},
+			expectCSCall:                        true,
+			expectError:                         true,
+			expectedManagementClusterResourceID: "",
+		},
+		{
+			name:          "error - no management cluster found for provision shard",
+			cachedSPC:     newTestSPC(),
+			existingSPC:   newTestSPC(),
+			cachedCluster: newTestHCPCluster(),
+			csShard: api.Must(arohcpv1alpha1.NewProvisionShard().
+				HREF(testProvisionShardHREF(testProvisionShardIDStr)).
+				Build()),
+			managementClusters:                  []*fleet.ManagementCluster{}, // empty — no match
+			expectCSCall:                        true,
+			expectError:                         true,
+			expectedManagementClusterResourceID: "",
+		},
+		{
+			name:          "error - multiple management clusters for same provision shard",
+			cachedSPC:     newTestSPC(),
+			existingSPC:   newTestSPC(),
+			cachedCluster: newTestHCPCluster(),
+			csShard: api.Must(arohcpv1alpha1.NewProvisionShard().
+				HREF(testProvisionShardHREF(testProvisionShardIDStr)).
+				Build()),
+			managementClusters: []*fleet.ManagementCluster{
+				newTestManagementCluster(),
+				func() *fleet.ManagementCluster {
+					mc := newTestManagementCluster()
+					mc.ResourceID = api.Must(fleet.ToManagementClusterResourceID("mc2"))
+					return mc
+				}(),
+			},
+			expectCSCall:                        true,
+			expectError:                         true,
+			expectedManagementClusterResourceID: "",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+			ctx := t.Context()
+			// gomock.NewController registers its own cleanup on t, so no explicit
+			// ctrl.Finish() call is needed.
+			ctrl := gomock.NewController(t)
+
+			// Setup mock DB
+			mockDB := databasetesting.NewMockResourcesDBClient()
+
+			// Create the SPC in cosmos
+			spcCRUD := mockDB.ServiceProviderClusters(testClusterSubscriptionID, testClusterResourceGroup, testClusterName)
+			_, err := spcCRUD.Create(ctx, tc.existingSPC, nil)
+			require.NoError(t, err)
+
+			// Setup SPC lister (cache)
+			cachedSPC := tc.cachedSPC
+			if cachedSPC == nil {
+				cachedSPC = tc.existingSPC
+			}
+			spcLister := &listertesting.SliceServiceProviderClusterLister{
+				ServiceProviderClusters: []*api.ServiceProviderCluster{cachedSPC},
+			}
+
+			// Setup cluster lister (cache)
+			clusterLister := &listertesting.SliceClusterLister{
+				Clusters: []*api.HCPOpenShiftCluster{tc.cachedCluster},
+			}
+
+			// Setup management cluster lister
+			mgmtClusterLister := &dblistertesting.SliceManagementClusterLister{
+				ManagementClusters: tc.managementClusters,
+			}
+
+			// Setup mock CS client
+			mockCSClient := ocm.NewMockClusterServiceClientSpec(ctrl)
+			if tc.expectCSCall {
+				mockCSClient.EXPECT().
+					GetClusterProvisionShard(gomock.Any(), api.Must(api.NewInternalID(testClusterServiceIDStr))).
+					Return(tc.csShard, tc.csError)
+			}
+
+			// Create syncer
+			syncer := &managementClusterPlacementSyncer{
+				cooldownChecker:              &alwaysSyncCooldownChecker{},
+				serviceProviderClusterLister: spcLister,
+				clusterLister:                clusterLister,
+				managementClusterLister:      mgmtClusterLister,
+				cosmosClient:                 mockDB,
+				clusterServiceClient:         mockCSClient,
+			}
+
+			// Execute
+			key := controllerutils.HCPClusterKey{
+				SubscriptionID:    testClusterSubscriptionID,
+				ResourceGroupName: testClusterResourceGroup,
+				HCPClusterName:    testClusterName,
+			}
+			err = syncer.SyncOnce(ctx, key)
+
+			if tc.expectError {
+				require.Error(t, err)
+			} else {
+				require.NoError(t, err)
+			}
+
+			// Verify the SPC state in Cosmos
+			updatedSPC, err := spcCRUD.Get(ctx, api.ServiceProviderClusterResourceName)
+			require.NoError(t, err)
+
+			if tc.expectedManagementClusterResourceID != "" {
+				require.NotNil(t, updatedSPC.Status.ManagementClusterResourceID)
+				assert.Equal(t, tc.expectedManagementClusterResourceID, updatedSPC.Status.ManagementClusterResourceID.String())
+			} else {
+				assert.Nil(t, updatedSPC.Status.ManagementClusterResourceID)
+			}
+		})
+	}
+}
diff --git a/config/config.schema.json b/config/config.schema.json index c2286a95407..86342eefac2 100644 --- a/config/config.schema.json +++ b/config/config.schema.json @@ -1430,6 +1430,9 @@ "locksContainerMaxScale": { "type": "integer" }, + "fleetContainerMaxScale": { + "type": "integer" + }, "zoneRedundantMode": { "$ref": "#/definitions/zoneRedundantMode" } diff --git a/config/config.yaml b/config/config.yaml index 8ce5e054e7b..18c1e70adc4 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -731,6 +731,7 @@ defaults: resourceContainerMaxScale: 4000 billingContainerMaxScale: 4000 locksContainerMaxScale: 4000 + fleetContainerMaxScale: 4000 cert: name: frontend-cert-{{ .ctx.environment }}-{{ .ctx.regionShort }} exitOnPanic: true diff --git
a/config/rendered/dev/ci01/westus3.yaml b/config/rendered/dev/ci01/westus3.yaml index 52545a0818e..45aae93f731 100755 --- a/config/rendered/dev/ci01/westus3.yaml +++ b/config/rendered/dev/ci01/westus3.yaml @@ -261,6 +261,7 @@ frontend: billingContainerMaxScale: 4000 deploy: true disableLocalAuth: true + fleetContainerMaxScale: 4000 locksContainerMaxScale: 4000 name: arohcpci01-rp-j7654321 private: false diff --git a/config/rendered/dev/cspr/westus3.yaml b/config/rendered/dev/cspr/westus3.yaml index 45ed39dfe91..6cd71b5231e 100755 --- a/config/rendered/dev/cspr/westus3.yaml +++ b/config/rendered/dev/cspr/westus3.yaml @@ -261,6 +261,7 @@ frontend: billingContainerMaxScale: 4000 deploy: true disableLocalAuth: true + fleetContainerMaxScale: 4000 locksContainerMaxScale: 4000 name: arohcpcspr-rp-usw3 private: false diff --git a/config/rendered/dev/dev/westus3.yaml b/config/rendered/dev/dev/westus3.yaml index 83a5aa3d482..c085833a56c 100755 --- a/config/rendered/dev/dev/westus3.yaml +++ b/config/rendered/dev/dev/westus3.yaml @@ -261,6 +261,7 @@ frontend: billingContainerMaxScale: 4000 deploy: true disableLocalAuth: true + fleetContainerMaxScale: 4000 locksContainerMaxScale: 4000 name: arohcpdev-rp-usw3 private: false diff --git a/config/rendered/dev/perf/westus3.yaml b/config/rendered/dev/perf/westus3.yaml index d11f0a32272..1a32d716a99 100755 --- a/config/rendered/dev/perf/westus3.yaml +++ b/config/rendered/dev/perf/westus3.yaml @@ -261,6 +261,7 @@ frontend: billingContainerMaxScale: 4000 deploy: true disableLocalAuth: true + fleetContainerMaxScale: 4000 locksContainerMaxScale: 4000 name: arohcpperf-rp-usw3ptest private: true diff --git a/config/rendered/dev/pers/westus3.yaml b/config/rendered/dev/pers/westus3.yaml index 37115c37a73..be7a23d9d96 100755 --- a/config/rendered/dev/pers/westus3.yaml +++ b/config/rendered/dev/pers/westus3.yaml @@ -261,6 +261,7 @@ frontend: billingContainerMaxScale: 4000 deploy: true disableLocalAuth: true + fleetContainerMaxScale: 4000 
locksContainerMaxScale: 4000 name: arohcppers-rp-usw3test private: false diff --git a/config/rendered/dev/prow/westus3.yaml b/config/rendered/dev/prow/westus3.yaml index a62f0ea1aa8..c36a3d79085 100755 --- a/config/rendered/dev/prow/westus3.yaml +++ b/config/rendered/dev/prow/westus3.yaml @@ -261,6 +261,7 @@ frontend: billingContainerMaxScale: 4000 deploy: true disableLocalAuth: true + fleetContainerMaxScale: 4000 locksContainerMaxScale: 4000 name: arohcpprow-rp-j7654321 private: false diff --git a/dev-infrastructure/configurations/svc-cluster.tmpl.bicepparam b/dev-infrastructure/configurations/svc-cluster.tmpl.bicepparam index 364a7b1a3c1..b7658e18d33 100644 --- a/dev-infrastructure/configurations/svc-cluster.tmpl.bicepparam +++ b/dev-infrastructure/configurations/svc-cluster.tmpl.bicepparam @@ -171,6 +171,7 @@ param owningTeamTagValue = '{{ .monitoring.alertRuleOwningTeamTag }}' param resourceContainerMaxScale = {{ .frontend.cosmosDB.resourceContainerMaxScale }} param billingContainerMaxScale = {{ .frontend.cosmosDB.billingContainerMaxScale }} param locksContainerMaxScale = {{ .frontend.cosmosDB.locksContainerMaxScale }} +param fleetContainerMaxScale = {{ .frontend.cosmosDB.fleetContainerMaxScale }} // Audit Logs Event Hub param auditLogsEventHubName = '{{ .auditLogsEventHub.name }}' diff --git a/dev-infrastructure/modules/rp-cosmos.bicep b/dev-infrastructure/modules/rp-cosmos.bicep index fe1690b59a0..8dc0d8cede9 100644 --- a/dev-infrastructure/modules/rp-cosmos.bicep +++ b/dev-infrastructure/modules/rp-cosmos.bicep @@ -13,6 +13,7 @@ param private bool param resourceContainerMaxScale int param billingContainerMaxScale int param locksContainerMaxScale int +param fleetContainerMaxScale int var containers = [ { @@ -33,6 +34,12 @@ var containers = [ partitionKeyPaths: ['/id'] maxThroughput: locksContainerMaxScale } + { + name: 'Fleet' + defaultTtl: -1 // On, no default expiration + partitionKeyPaths: ['/partitionKey'] + maxThroughput: fleetContainerMaxScale + } ] 
param roleDefinitionId string = '00000000-0000-0000-0000-000000000002' diff --git a/dev-infrastructure/templates/svc-cluster.bicep b/dev-infrastructure/templates/svc-cluster.bicep index 524b8251166..0d77e4b9153 100644 --- a/dev-infrastructure/templates/svc-cluster.bicep +++ b/dev-infrastructure/templates/svc-cluster.bicep @@ -428,6 +428,7 @@ param owningTeamTagValue string param resourceContainerMaxScale int param billingContainerMaxScale int param locksContainerMaxScale int +param fleetContainerMaxScale int @description('The name of the Exporter managed identity') param exporterMIName string @@ -768,6 +769,7 @@ module rpCosmosDb '../modules/rp-cosmos.bicep' = if (deployFrontendCosmos) { resourceContainerMaxScale: resourceContainerMaxScale billingContainerMaxScale: billingContainerMaxScale locksContainerMaxScale: locksContainerMaxScale + fleetContainerMaxScale: fleetContainerMaxScale } } diff --git a/hack/run-with-port-forward.sh b/hack/run-with-port-forward.sh index 58c44100be7..90691e41fb1 100755 --- a/hack/run-with-port-forward.sh +++ b/hack/run-with-port-forward.sh @@ -33,10 +33,22 @@ trap cleanup EXIT INT TERM echo "Port-forward established: localhost:$LOCAL_PORT -> $SERVICE_NAME.$NAMESPACE:$REMOTE_PORT" echo "PID: $PORT_FORWARD_PID" -# Wait a moment for port-forward to be ready -sleep 2 +# Wait for port-forward to be ready +for i in $(seq 1 30); do + if curl --silent --output /dev/null --max-time 1 "http://localhost:$LOCAL_PORT" 2>/dev/null; then + break + fi + if ! 
kill -0 "$PORT_FORWARD_PID" 2>/dev/null; then + echo "Port-forward process died unexpectedly" + exit 1 + fi + if [ "$i" -eq 30 ]; then + echo "Timed out waiting for port-forward on localhost:$LOCAL_PORT" + exit 1 + fi + sleep 1 +done -# Test the connection echo "Running command: $*" "$@" diff --git a/hack/update-deepcopy.sh b/hack/update-deepcopy.sh index d22eaca5ed1..0d90934ecda 100755 --- a/hack/update-deepcopy.sh +++ b/hack/update-deepcopy.sh @@ -29,7 +29,8 @@ DEEPCOPY_GEN="${DEEPCOPY_GEN:-deepcopy-gen}" --output-file zz_generated.deepcopy.go \ --go-header-file "${REPO_ROOT}/hack/boilerplate.go.txt" \ github.com/Azure/ARO-HCP/internal/api \ - github.com/Azure/ARO-HCP/internal/api/arm + github.com/Azure/ARO-HCP/internal/api/arm \ + github.com/Azure/ARO-HCP/internal/api/fleet # Post-process generated files. # @@ -45,7 +46,8 @@ DEEPCOPY_GEN="${DEEPCOPY_GEN:-deepcopy-gen}" # replace with a direct assignment. for f in \ "${REPO_ROOT}/internal/api/zz_generated.deepcopy.go" \ - "${REPO_ROOT}/internal/api/arm/zz_generated.deepcopy.go"; do + "${REPO_ROOT}/internal/api/arm/zz_generated.deepcopy.go" \ + "${REPO_ROOT}/internal/api/fleet/zz_generated.deepcopy.go"; do if [[ ! -f "${f}" ]]; then continue @@ -100,4 +102,8 @@ for f in \ os::util::sed \ 's/\(.*\)\.DeepCopyany()/\1/g' \ "${f}" + done + +# Format generated files so import ordering matches project conventions. +make -C "${REPO_ROOT}" fmt diff --git a/internal/api/fleet/doc.go b/internal/api/fleet/doc.go new file mode 100644 index 00000000000..6a2b3ec492e --- /dev/null +++ b/internal/api/fleet/doc.go @@ -0,0 +1,17 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +k8s:deepcopy-gen=package + +package fleet diff --git a/internal/api/fleet/partition.go b/internal/api/fleet/partition.go new file mode 100644 index 00000000000..00a6936cb8c --- /dev/null +++ b/internal/api/fleet/partition.go @@ -0,0 +1,32 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// The GetStampIdentifier methods in this file are a temporary solution for partition key flexibility in CRUDs until +// https://github.com/Azure/ARO-HCP/pull/5094 lands. + +package fleet + +func (s *Stamp) GetStampIdentifier() string { + if s.CosmosMetadata.ResourceID == nil { + return "" + } + return s.CosmosMetadata.ResourceID.Name +} + +func (mc *ManagementCluster) GetStampIdentifier() string { + if mc.CosmosMetadata.ResourceID == nil || mc.CosmosMetadata.ResourceID.Parent == nil { + return "" + } + return mc.CosmosMetadata.ResourceID.Parent.Name +} diff --git a/internal/api/fleet/registry.go b/internal/api/fleet/registry.go new file mode 100644 index 00000000000..c6c304cbe12 --- /dev/null +++ b/internal/api/fleet/registry.go @@ -0,0 +1,35 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +package fleet + +import ( + azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" + + "github.com/Azure/ARO-HCP/internal/api" +) + +const ( + StampResourceTypeName = "stamps" + + ManagementClusterResourceTypeName = "managementClusters" + ManagementClusterResourceName = "default" + ControllerResourceTypeName = "controllers" +) + +var ( + StampResourceType = azcorearm.NewResourceType(api.ProviderNamespace, StampResourceTypeName) + ManagementClusterResourceType = azcorearm.NewResourceType(api.ProviderNamespace, StampResourceTypeName+"/"+ManagementClusterResourceTypeName) + ManagementClusterControllerResourceType = azcorearm.NewResourceType(api.ProviderNamespace, StampResourceTypeName+"/"+ManagementClusterResourceTypeName+"/"+ControllerResourceTypeName) +) diff --git a/internal/api/fleet/types_cosmosdata.go b/internal/api/fleet/types_cosmosdata.go new file mode 100644 index 00000000000..275ddc2d988 --- /dev/null +++ b/internal/api/fleet/types_cosmosdata.go @@ -0,0 +1,51 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fleet + +import ( + "path" + "strings" + + azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" +) + +// ToStampResourceID constructs the resource ID for a stamp: +// /providers/Microsoft.RedHatOpenShift/stamps/{stampIdentifier} +func ToStampResourceID(stampIdentifier string) (*azcorearm.ResourceID, error) { + return azcorearm.ParseResourceID(ToStampResourceIDString(stampIdentifier)) +} + +// ToStampResourceIDString returns the lowercased stamp resource ID string. +func ToStampResourceIDString(stampIdentifier string) string { + return strings.ToLower(path.Join( + "/providers", StampResourceType.String(), stampIdentifier, + )) +} + +// ToManagementClusterResourceID constructs the full resource ID for a +// management cluster singleton within a stamp: +// /providers/Microsoft.RedHatOpenShift/stamps/{stampIdentifier}/managementClusters/default +func ToManagementClusterResourceID(stampIdentifier string) (*azcorearm.ResourceID, error) { + return azcorearm.ParseResourceID(ToManagementClusterResourceIDString(stampIdentifier)) +} + +// ToManagementClusterResourceIDString returns the lowercased resource ID string +// for a management cluster singleton. +func ToManagementClusterResourceIDString(stampIdentifier string) string { + return strings.ToLower(path.Join( + "/providers", StampResourceType.String(), stampIdentifier, + ManagementClusterResourceTypeName, ManagementClusterResourceName, + )) +} diff --git a/internal/api/fleet/types_management_cluster.go b/internal/api/fleet/types_management_cluster.go new file mode 100644 index 00000000000..1a8fe29c51c --- /dev/null +++ b/internal/api/fleet/types_management_cluster.go @@ -0,0 +1,219 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fleet + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + + azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" + + "github.com/Azure/ARO-HCP/internal/api" +) + +// ManagementClusterConditionType represents the type of a management cluster condition. +// +// Condition lifecycle follows Kubernetes conventions: +// - Conditions are added on first evaluation and never removed. +// - Status is toggled between True/False/Unknown; absence means "not yet evaluated." +// - Each condition type is owned by exactly one controller (see ownership below). +type ManagementClusterConditionType string + +// ManagementClusterConditionReason represents the reason for a management cluster condition. +type ManagementClusterConditionReason string + +const ( + // ManagementClusterConditionReady indicates the management cluster is + // provisioned and operational. This is an aggregate condition: True only + // when both ClustersServiceRegistered and MaestroRegistered are True. + // Owner: ManagementClusterPromotionController. + ManagementClusterConditionReady ManagementClusterConditionType = "Ready" + + // ManagementClusterConditionClustersServiceRegistered indicates whether the + // provision shard exists and is configured correctly in ClustersService. + // Owner: ClustersServiceRegistrationController. 
+ ManagementClusterConditionClustersServiceRegistered ManagementClusterConditionType = "ClustersServiceRegistered" + + // ManagementClusterConditionMaestroRegistered indicates whether the consumer + // exists and is configured correctly in Maestro. + // Owner: MaestroRegistrationController. + ManagementClusterConditionMaestroRegistered ManagementClusterConditionType = "MaestroRegistered" + + // ManagementClusterConditionReasonProvisionShardActive indicates the CS provision + // shard is active and the management cluster is ready for scheduling. + ManagementClusterConditionReasonProvisionShardActive ManagementClusterConditionReason = "ProvisionShardActive" + + // ManagementClusterConditionReasonProvisionShardMaintenance indicates the CS provision + // shard is in maintenance mode. + ManagementClusterConditionReasonProvisionShardMaintenance ManagementClusterConditionReason = "ProvisionShardMaintenance" + + // ManagementClusterConditionReasonProvisionShardOffline indicates the CS provision + // shard is offline. + ManagementClusterConditionReasonProvisionShardOffline ManagementClusterConditionReason = "ProvisionShardOffline" + + // ManagementClusterConditionReasonProvisionShardStatusUnknown indicates the CS provision + // shard has an unknown status. + ManagementClusterConditionReasonProvisionShardStatusUnknown ManagementClusterConditionReason = "ProvisionShardStatusUnknown" + + // ManagementClusterConditionReasonRegistered indicates the downstream system + // (ClustersService or Maestro) is configured correctly. + ManagementClusterConditionReasonRegistered ManagementClusterConditionReason = "Registered" + + // ManagementClusterConditionReasonRegistrationFailed indicates the downstream system + // registration failed and could not be reestablished. 
+ ManagementClusterConditionReasonRegistrationFailed ManagementClusterConditionReason = "RegistrationFailed" + + // ManagementClusterConditionReasonAllRegistered indicates all sub-conditions + // (ClustersServiceRegistered, MaestroRegistered) are True. + ManagementClusterConditionReasonAllRegistered ManagementClusterConditionReason = "AllRegistered" + + // ManagementClusterConditionReasonRegistrationIncomplete indicates one or more + // sub-conditions are not True. + ManagementClusterConditionReasonRegistrationIncomplete ManagementClusterConditionReason = "RegistrationIncomplete" +) + +// ManagementClusterSchedulingPolicy controls whether new hosted control planes +// may be scheduled onto a management cluster. Follows the Kubernetes typed +// string enum pattern (like TaintEffect, RestartPolicy). +type ManagementClusterSchedulingPolicy string + +const ( + // ManagementClusterSchedulingPolicySchedulable allows new HCPs to be + // scheduled on the cluster (subject to Ready condition and capacity). + ManagementClusterSchedulingPolicySchedulable ManagementClusterSchedulingPolicy = "Schedulable" + + // ManagementClusterSchedulingPolicyUnschedulable prevents new HCPs from + // being scheduled regardless of capacity. Analogous to cordoning a + // Kubernetes Node via kubectl cordon. + ManagementClusterSchedulingPolicyUnschedulable ManagementClusterSchedulingPolicy = "Unschedulable" +) + +// ValidManagementClusterSchedulingPolicies is the set of valid values for +// ManagementClusterSchedulingPolicy. +var ValidManagementClusterSchedulingPolicies = sets.New( + ManagementClusterSchedulingPolicySchedulable, + ManagementClusterSchedulingPolicyUnschedulable, +) + +// ManagementCluster is a target for provisioning hosted control planes. 
+// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +type ManagementCluster struct { + // CosmosMetadata.ResourceID is nested under the parent stamp so that association and cleanup work as expected; + // its resource type name is ManagementClusterResourceTypeName. + api.CosmosMetadata `json:"cosmosMetadata"` + + // ResourceID exists to match cosmosMetadata.resourceID until we're able to transition all types to use cosmosMetadata, + // at which point we will stop using properties.resourceId in our queries. + // Example: "/providers/microsoft.redhatopenshift/stamps/1/managementclusters/default" + // + // +required, immutable once set. + ResourceID *azcorearm.ResourceID `json:"resourceId,omitempty"` + + // Spec contains the desired state of the management cluster + Spec ManagementClusterSpec `json:"spec"` + + // Status contains the observed state of the management cluster + Status ManagementClusterStatus `json:"status"` +} + +// ManagementClusterSpec contains the desired state of a management cluster. +type ManagementClusterSpec struct { + // SchedulingPolicy controls whether new hosted control planes can be scheduled + // on this management cluster. + // + // Valid values: + // - "Schedulable": management cluster accepts new HCPs (subject to Ready + // condition and capacity constraints) + // - "Unschedulable": management cluster rejects new HCPs regardless of capacity + // (analogous to cordoning a Kubernetes Node via kubectl cordon) + // + // Must be set explicitly. Empty string is not allowed. + // + // Ownership: currently synced from Cluster Service provision shard status + // by ManagementClusterMigrationController (temporary, during CS-to-Cosmos migration). + // Will transition to being owned by the admin API via a Geneva Action for + // SRE-initiated cordon/uncordon operations. + SchedulingPolicy ManagementClusterSchedulingPolicy `json:"schedulingPolicy"` +} + +// ManagementClusterStatus contains the observed state of a management cluster.
+type ManagementClusterStatus struct { + // Conditions is a list of conditions tracking the lifecycle of the management cluster. + // Known condition types are defined as ManagementClusterConditionType constants: + // Ready, ClustersServiceRegistered, MaestroRegistered. + // + // Conditions are added on first evaluation and never removed. Status is toggled + // between True/False/Unknown. Absence of a condition means "not yet evaluated." + // Each condition type is owned by exactly one controller to avoid write conflicts. + // + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` + + // AKSResourceID is the Azure resource ID of the AKS management cluster. + // Example: "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/clustername" + // + // +required, validated as a well-formed ARM resource ID, immutable once set. + AKSResourceID *azcorearm.ResourceID `json:"aksResourceID,omitempty"` + + // PublicDNSZoneResourceID is the Azure resource ID of the public DNS zone for the management cluster. + // Example: "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/dns-rg/providers/Microsoft.Network/dnszones/example.com" + // + // +required, validated as a well-formed ARM resource ID, immutable once set. + PublicDNSZoneResourceID *azcorearm.ResourceID `json:"publicDNSZoneResourceID,omitempty"` + + // HostedClustersSecretsKeyVaultURL is the URL of the key vault containing secrets for hosted clusters on this management cluster. + // Example: "https://kv-hc-secrets.vault.azure.net" + // + // +required, validated as a well-formed URL, immutable once set. + HostedClustersSecretsKeyVaultURL string `json:"hostedClustersSecretsKeyVaultURL,omitempty"` + + // HostedClustersManagedIdentitiesKeyVaultURL is the URL of the key vault containing managed identity backing certificates for hosted clusters. 
+ // Example: "https://kv-hc-mi.vault.azure.net" + // + // +required, validated as a well-formed URL, immutable once set. + HostedClustersManagedIdentitiesKeyVaultURL string `json:"hostedClustersManagedIdentitiesKeyVaultURL,omitempty"` + + // HostedClustersSecretsKeyVaultManagedIdentityClientID is the client ID of the managed identity + // used to access the hosted clusters secrets key vault. + // Example: "12345678-1234-1234-1234-123456789012" + // + // +required, validated as a UUID, immutable once set. + HostedClustersSecretsKeyVaultManagedIdentityClientID string `json:"hostedClustersSecretsKeyVaultManagedIdentityClientID,omitempty"` + + // MaestroConsumerName is the consumer name of the management cluster in Maestro. + // Typically derived from the management cluster stamp identifier. + // Example: "hcp-underlay-westus3-mgmt-1" + // + // +required, immutable once set. + MaestroConsumerName string `json:"maestroConsumerName,omitempty"` + + // MaestroRESTAPIURL is the URL of the Maestro REST API. + // Example: "http://maestro.maestro.svc.cluster.local:8000" + // + // +required, validated as a well-formed URL, immutable once set. + MaestroRESTAPIURL string `json:"maestroRESTAPIURL,omitempty"` + + // MaestroGRPCTarget is the gRPC dial target (host:port) of the Maestro gRPC API. + // Example: "maestro-grpc.maestro.svc.cluster.local:8090" + // + // +required, immutable once set. + MaestroGRPCTarget string `json:"maestroGRPCTarget,omitempty"` + + // ClusterServiceProvisionShardID is the Cluster Service provision shard HREF for this management cluster. + // Example: "/api/aro_hcp/v1alpha1/provision_shards/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" + // + // +required, immutable once set.
+ ClusterServiceProvisionShardID *api.InternalID `json:"clusterServiceProvisionShardID,omitempty"` +} diff --git a/internal/api/fleet/types_runtime.go b/internal/api/fleet/types_runtime.go new file mode 100644 index 00000000000..0c3ffd5a012 --- /dev/null +++ b/internal/api/fleet/types_runtime.go @@ -0,0 +1,82 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fleet + +import ( + "strings" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +var ( + _ runtime.Object = &Stamp{} + _ metav1.ObjectMetaAccessor = &Stamp{} + _ runtime.Object = &ManagementCluster{} + _ metav1.ObjectMetaAccessor = &ManagementCluster{} +) + +func (o *Stamp) GetObjectKind() schema.ObjectKind { + return schema.EmptyObjectKind +} + +func (o *Stamp) GetObjectMeta() metav1.Object { + om := &metav1.ObjectMeta{} + if o.GetResourceID() != nil { + om.Name = strings.ToLower(o.GetResourceID().String()) + } + return om +} + +// StampList is a list of Stamp resources. 
+// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +type StampList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []Stamp `json:"items"` +} + +var _ runtime.Object = &StampList{} + +func (l *StampList) GetObjectKind() schema.ObjectKind { + return &l.TypeMeta +} + +func (o *ManagementCluster) GetObjectKind() schema.ObjectKind { + return schema.EmptyObjectKind +} + +func (o *ManagementCluster) GetObjectMeta() metav1.Object { + om := &metav1.ObjectMeta{} + if o.GetResourceID() != nil { + om.Name = strings.ToLower(o.GetResourceID().String()) + } + return om +} + +// ManagementClusterList is a list of ManagementCluster resources. +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +type ManagementClusterList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ManagementCluster `json:"items"` +} + +var _ runtime.Object = &ManagementClusterList{} + +func (l *ManagementClusterList) GetObjectKind() schema.ObjectKind { + return &l.TypeMeta +} diff --git a/internal/api/fleet/types_stamp.go b/internal/api/fleet/types_stamp.go new file mode 100644 index 00000000000..4d20a4ecf4a --- /dev/null +++ b/internal/api/fleet/types_stamp.go @@ -0,0 +1,77 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fleet + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" + + "github.com/Azure/ARO-HCP/internal/api" +) + +// StampConditionType represents the type of a stamp condition. +type StampConditionType string + +// StampConditionReason represents the reason for a stamp condition. +type StampConditionReason string + +const ( + // StampConditionApproved indicates whether the stamp has been approved + // for promotion to a ManagementCluster. + StampConditionApproved StampConditionType = "Approved" + + // StampConditionReasonAutoApproved indicates the stamp was automatically + // approved (non-production environments). + StampConditionReasonAutoApproved StampConditionReason = "AutoApproved" + + // StampConditionReasonManuallyApproved indicates the stamp was approved + // by an SRE via the admin API. + StampConditionReasonManuallyApproved StampConditionReason = "ManuallyApproved" + + // StampConditionReasonApprovalRevoked indicates approval was revoked + // via the admin API. + StampConditionReasonApprovalRevoked StampConditionReason = "ApprovalRevoked" +) + +// Stamp is the parent scope for management cluster resources, analogous to +// a CAPI Machine. It represents provisioning intent and lifecycle state. +// The ManagementCluster sub-resource is the Node — the operational record +// that the RP consumes. +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +type Stamp struct { + api.CosmosMetadata `json:"cosmosMetadata"` + + // ResourceID exists to match cosmosMetadata.resourceID until we're able to transition all types to use cosmosMetadata, + // at which point we will stop using properties.resourceId in our queries. + // Example: "/providers/microsoft.redhatopenshift/stamps/1" + // + // +required, immutable once set. 
+ ResourceID *azcorearm.ResourceID `json:"resourceId,omitempty"` + + Spec StampSpec `json:"spec"` + Status StampStatus `json:"status"` +} + +// StampSpec contains the desired state of a stamp. +// Reserved for future provisioning intent (constraints, features, sizing). +type StampSpec struct{} + +// StampStatus contains the observed state of a stamp. +type StampStatus struct { + // Conditions tracks the stamp's lifecycle progression. + // Known condition types: Approved. + Conditions []metav1.Condition `json:"conditions,omitempty"` +} diff --git a/internal/api/fleet/zz_generated.deepcopy.go b/internal/api/fleet/zz_generated.deepcopy.go new file mode 100644 index 00000000000..81f4f4b5720 --- /dev/null +++ b/internal/api/fleet/zz_generated.deepcopy.go @@ -0,0 +1,247 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by deepcopy-gen-v0.34. DO NOT EDIT. + +package fleet + +import ( + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + + api "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/api/arm" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ManagementCluster) DeepCopyInto(out *ManagementCluster) { + *out = *in + in.CosmosMetadata.DeepCopyInto(&out.CosmosMetadata) + if in.ResourceID != nil { + in, out := &in.ResourceID, &out.ResourceID + *out = arm.DeepCopyResourceID(*in) + } + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ManagementCluster. +func (in *ManagementCluster) DeepCopy() *ManagementCluster { + if in == nil { + return nil + } + out := new(ManagementCluster) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ManagementCluster) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ManagementClusterList) DeepCopyInto(out *ManagementClusterList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ManagementCluster, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ManagementClusterList. +func (in *ManagementClusterList) DeepCopy() *ManagementClusterList { + if in == nil { + return nil + } + out := new(ManagementClusterList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ManagementClusterList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ManagementClusterSpec) DeepCopyInto(out *ManagementClusterSpec) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ManagementClusterSpec. +func (in *ManagementClusterSpec) DeepCopy() *ManagementClusterSpec { + if in == nil { + return nil + } + out := new(ManagementClusterSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ManagementClusterStatus) DeepCopyInto(out *ManagementClusterStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.AKSResourceID != nil { + in, out := &in.AKSResourceID, &out.AKSResourceID + *out = arm.DeepCopyResourceID(*in) + } + if in.PublicDNSZoneResourceID != nil { + in, out := &in.PublicDNSZoneResourceID, &out.PublicDNSZoneResourceID + *out = arm.DeepCopyResourceID(*in) + } + if in.ClusterServiceProvisionShardID != nil { + in, out := &in.ClusterServiceProvisionShardID, &out.ClusterServiceProvisionShardID + *out = new(api.InternalID) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ManagementClusterStatus. +func (in *ManagementClusterStatus) DeepCopy() *ManagementClusterStatus { + if in == nil { + return nil + } + out := new(ManagementClusterStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *Stamp) DeepCopyInto(out *Stamp) { + *out = *in + in.CosmosMetadata.DeepCopyInto(&out.CosmosMetadata) + if in.ResourceID != nil { + in, out := &in.ResourceID, &out.ResourceID + *out = arm.DeepCopyResourceID(*in) + } + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Stamp. +func (in *Stamp) DeepCopy() *Stamp { + if in == nil { + return nil + } + out := new(Stamp) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *Stamp) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *StampList) DeepCopyInto(out *StampList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]Stamp, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StampList. +func (in *StampList) DeepCopy() *StampList { + if in == nil { + return nil + } + out := new(StampList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *StampList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *StampSpec) DeepCopyInto(out *StampSpec) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StampSpec. 
+func (in *StampSpec) DeepCopy() *StampSpec { + if in == nil { + return nil + } + out := new(StampSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *StampStatus) DeepCopyInto(out *StampStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StampStatus. +func (in *StampStatus) DeepCopy() *StampStatus { + if in == nil { + return nil + } + out := new(StampStatus) + in.DeepCopyInto(out) + return out +} diff --git a/internal/api/types_cosmosdata.go b/internal/api/types_cosmosdata.go index 688d7450c5c..077a9f1fe4f 100644 --- a/internal/api/types_cosmosdata.go +++ b/internal/api/types_cosmosdata.go @@ -25,8 +25,12 @@ import ( type CosmosMetadata = arm.CosmosMetadata -func ToResourceGroupResourceIDString(subscriptionName, resourcGroupName string) string { - return strings.ToLower(path.Join("/subscriptions", subscriptionName, "resourceGroups", resourcGroupName)) +func ToResourceGroupResourceIDString(subscriptionName, resourceGroupName string) string { + return strings.ToLower(path.Join("/subscriptions", subscriptionName, "resourceGroups", resourceGroupName)) +} + +func ToResourceGroupResourceID(subscriptionID, resourceGroupName string) (*azcorearm.ResourceID, error) { + return azcorearm.ParseResourceID(ToResourceGroupResourceIDString(subscriptionID, resourceGroupName)) } func ToClusterResourceID(subscriptionName, resourceGroupName, clusterName string) (*azcorearm.ResourceID, error) { diff --git a/internal/api/types_internalid.go b/internal/api/types_internalid.go index ed3c3fc6684..ad71f2e5a78 100644 --- a/internal/api/types_internalid.go +++ b/internal/api/types_internalid.go @@ -30,6 +30,7 @@ const ( 
externalAuthKey = "external_auth_config/external_auths" breakGlassCredentialKey = "break_glass_credentials" clusterProvisionShardKey = "provision_shard" + provisionShardKey = "provision_shards" ) var ( @@ -44,6 +45,7 @@ var ( aroHcpV1Alpha1NodePoolPattern = path.Join(aroHcpV1Alpha1ClusterPattern, nodePoolKey, "*") aroHcpV1Alpha1ExternalAuthPattern = path.Join(aroHcpV1Alpha1ClusterPattern, externalAuthKey, "*") aroHcpV1Alpha1ClusterProvisionShardPattern = path.Join(aroHcpV1Alpha1ClusterPattern, clusterProvisionShardKey) + "$" + aroHcpV1Alpha1ProvisionShardPattern = path.Join(aroHcpV1Alpha1Pattern, provisionShardKey, "*") ) // InternalID represents a Cluster Service resource. @@ -101,6 +103,11 @@ func (id *InternalID) validate() error { return nil } + if match, _ = path.Match(aroHcpV1Alpha1ProvisionShardPattern, id.path); match { + id.kind = arohcpv1alpha1.ProvisionShardKind + return nil + } + return fmt.Errorf("invalid InternalID: %q", id.path) } @@ -139,6 +146,7 @@ func (id *InternalID) ID() string { return path.Base(id.path) } +// Path returns the full API path of the resource. func (id *InternalID) Path() string { return id.path } diff --git a/internal/api/types_serviceprovider_cluster.go b/internal/api/types_serviceprovider_cluster.go index b86b961239b..b76669583da 100644 --- a/internal/api/types_serviceprovider_cluster.go +++ b/internal/api/types_serviceprovider_cluster.go @@ -125,6 +125,10 @@ type ServiceProviderClusterStatus struct { // The reference contains a mapping between the logical name we give to the Maestro bundle internally // and the Maestro Bundle Name and ID at the Maestro API level. MaestroReadonlyBundles MaestroBundleReferenceList `json:"maestroReadonlyBundles,omitempty"` + // ManagementClusterResourceID is the resource ID of the management cluster + // this HCP is placed on. Nil means placement has not been resolved yet. + // Once set, this field is immutable. 
+ ManagementClusterResourceID *azcorearm.ResourceID `json:"managementClusterResourceID,omitempty"` } // ServiceProviderClusterStatusVersion contains the actual version information. diff --git a/internal/api/zz_generated.deepcopy.go b/internal/api/zz_generated.deepcopy.go index 24c64d0e841..a2f2d1271dd 100644 --- a/internal/api/zz_generated.deepcopy.go +++ b/internal/api/zz_generated.deepcopy.go @@ -1384,6 +1384,10 @@ func (in *ServiceProviderClusterStatus) DeepCopyInto(out *ServiceProviderCluster } } } + if in.ManagementClusterResourceID != nil { + in, out := &in.ManagementClusterResourceID, &out.ManagementClusterResourceID + *out = arm.DeepCopyResourceID(*in) + } return } diff --git a/internal/database/convert_any.go b/internal/database/convert_any.go index 4feeb355ad1..2ae0b4c5496 100644 --- a/internal/database/convert_any.go +++ b/internal/database/convert_any.go @@ -21,6 +21,7 @@ import ( azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/api/fleet" "github.com/Azure/ARO-HCP/internal/utils" ) @@ -93,6 +94,12 @@ func InternalToCosmos[InternalAPIType, CosmosAPIType any](obj *InternalAPIType) case *api.HCPOpenShiftClusterNodePool: cosmosObj, err = InternalToCosmosNodePool(internalObj) + case *fleet.Stamp: + cosmosObj, err = InternalToCosmosFleet(internalObj) + + case *fleet.ManagementCluster: + cosmosObj, err = InternalToCosmosFleet(internalObj) + case *TypedDocument: var expectedObj CosmosAPIType switch castObj := any(expectedObj).(type) { diff --git a/internal/database/convert_fleet.go b/internal/database/convert_fleet.go new file mode 100644 index 00000000000..03a05cc7cde --- /dev/null +++ b/internal/database/convert_fleet.go @@ -0,0 +1,56 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package database
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/Azure/ARO-HCP/internal/api/arm"
+)
+
+// InternalToCosmosFleet builds the GenericDocument envelope used to persist a
+// fleet resource. Unlike the generic conversion, the envelope's partition key
+// is the lowercased name of the top-level ancestor in the object's resource ID
+// rather than the subscription ID.
+//
+// A nil internalObj yields (nil, nil). An error is returned when the object
+// does not implement arm.CosmosMetadataAccessor or when no top-level resource
+// name can be derived from its resource ID.
+//
+// Once https://github.com/Azure/ARO-HCP/pull/5094 lands, this func becomes
+// obsolete and InternalToCosmosGeneric can be used instead.
+func InternalToCosmosFleet[InternalAPIType any](
+	internalObj *InternalAPIType,
+) (*GenericDocument[InternalAPIType], error) {
+	if internalObj == nil {
+		return nil, nil
+	}
+
+	accessor, isAccessor := any(internalObj).(arm.CosmosMetadataAccessor)
+	if !isAccessor {
+		return nil, fmt.Errorf("internalObj must be an arm.CosmosMetadataAccessor: %T", internalObj)
+	}
+
+	topLevelName := topLevelResourceName(accessor.GetResourceID())
+	if topLevelName == "" {
+		return nil, fmt.Errorf("fleet object %T has no top-level resource name in its resource ID", internalObj)
+	}
+
+	// Assemble the typed envelope first, then attach the payload by value.
+	envelope := TypedDocument{
+		BaseDocument: BaseDocument{ID: accessor.GetCosmosUID()},
+		PartitionKey: strings.ToLower(topLevelName),
+		ResourceID:   accessor.GetResourceID(),
+		ResourceType: accessor.GetResourceID().ResourceType.String(),
+	}
+	return &GenericDocument[InternalAPIType]{TypedDocument: envelope, Content: *internalObj}, nil
+}
diff --git a/internal/database/crud_fleet.go b/internal/database/crud_fleet.go
new file mode 100644
index 00000000000..2aebe7791d8
--- /dev/null
+++ b/internal/database/crud_fleet.go
@@ -0,0 +1,183 @@
+// Copyright 2026 Microsoft Corporation
+//
+// 
Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package database
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
+	"github.com/Azure/azure-sdk-for-go/sdk/data/azcosmos"
+
+	"github.com/Azure/ARO-HCP/internal/api/arm"
+)
+
+// fleetResourceCRUD is a ResourceCRUD for Cosmos containers
+// partitioned by the name of the top-level ancestor resource. The partition
+// key is never stored on this struct — it is derived at operation time from
+// the object's resource ID hierarchy or from the resource name parameter.
+//
+// This CRUD will be replaced once https://github.com/Azure/ARO-HCP/pull/5094
+// lands, which generalizes partition key handling in CosmosMetadata. At that
+// point partition key derivation moves into the shared infrastructure and
+// this type can be merged with nestedCosmosResourceCRUD.
+type fleetResourceCRUD[InternalAPIType, CosmosAPIType any] struct {
+	// containerClient is the Cosmos container every operation is issued against.
+	containerClient *azcosmos.ContainerClient
+	// parentResourceID scopes the CRUD to one parent resource. It is nil for
+	// top-level resources, in which case resource IDs are built directly
+	// under "/providers/<resourceType>".
+	parentResourceID *azcorearm.ResourceID
+	// resourceType is the ARM resource type handled by this CRUD; it is used
+	// both to build resource IDs and to scope List calls.
+	resourceType azcorearm.ResourceType
+}
+
+// topLevelResourceName walks a resource ID to its root ancestor and returns
+// its name. This is the partition key for containers partitioned by their
+// top-level resource name.
+func topLevelResourceName(rid *azcorearm.ResourceID) string {
+	if rid == nil {
+		return ""
+	}
+	// Climb towards the root for as long as the next ancestor still carries a
+	// name; an unnamed (or missing) parent ends the walk.
+	root := rid
+	for parent := root.Parent; parent != nil && len(parent.Name) > 0; parent = parent.Parent {
+		root = parent
+	}
+	return strings.ToLower(root.Name)
+}
+
+// partitionKeyFromObject derives the partition key for an object that is about
+// to be written: the lowercased name of the top-level ancestor in the resource
+// ID held by the object's CosmosMetadata. It fails when the object does not
+// implement arm.CosmosPersistable or when that name is empty.
+func partitionKeyFromObject[InternalAPIType any](obj *InternalAPIType) (string, error) {
+	persistable, ok := any(obj).(arm.CosmosPersistable)
+	if !ok {
+		return "", fmt.Errorf("type %T does not implement CosmosPersistable", obj)
+	}
+	resourceID := persistable.GetCosmosData().GetResourceID()
+	if key := topLevelResourceName(resourceID); key != "" {
+		return key, nil
+	}
+	return "", fmt.Errorf("cannot derive partition key from type %T: no top-level resource name", obj)
+}
+
+// partitionKeyFromParentOrName derives the partition key for read/delete
+// operations. For child resources the top-level ancestor is in the parent
+// resource ID; for top-level resources the resource name IS the partition key.
+func (d *fleetResourceCRUD[InternalAPIType, CosmosAPIType]) partitionKeyFromParentOrName(resourceName string) string {
+	fromParent := topLevelResourceName(d.parentResourceID)
+	if fromParent == "" {
+		// No named ancestor is available, so the resource's own lowercased
+		// name serves as the partition key.
+		return strings.ToLower(resourceName)
+	}
+	return fromParent
+}
+
+// makeResourceIDPath builds the lowercased resource ID handled by this CRUD.
+// With a parent, the ID is "<parent>/<last type segment>"; without one it is
+// "/providers/<resourceType>". A non-empty resourceName is appended as the
+// final segment; an empty one yields the collection-level ID.
+func (d *fleetResourceCRUD[InternalAPIType, CosmosAPIType]) makeResourceIDPath(
+	resourceName string,
+) (*azcorearm.ResourceID, error) {
+	var idPath string
+	switch {
+	case d.parentResourceID != nil:
+		typeSegments := d.resourceType.Types
+		idPath = d.parentResourceID.String() + "/" + typeSegments[len(typeSegments)-1]
+	default:
+		idPath = "/providers/" + d.resourceType.String()
+	}
+	if resourceName != "" {
+		idPath += "/" + resourceName
+	}
+	return azcorearm.ParseResourceID(strings.ToLower(idPath))
+}
+
+// GetByID fetches a document by its Cosmos item ID. The ID must already be
+// lowercase, and the CRUD must be parent-scoped because the partition key is
+// taken from the parent resource ID.
+func (d *fleetResourceCRUD[InternalAPIType, CosmosAPIType]) GetByID(
+	ctx context.Context, cosmosID string,
+) (*InternalAPIType, error) {
+	if cosmosID != strings.ToLower(cosmosID) {
+		return nil, fmt.Errorf("cosmosID must be lowercase, not: %q", cosmosID)
+	}
+	scopeKey := topLevelResourceName(d.parentResourceID)
+	if scopeKey == "" {
+		return nil, fmt.Errorf("GetByID requires a parent-scoped CRUD with a known partition key")
+	}
+	return getByItemID[InternalAPIType, CosmosAPIType](ctx, d.containerClient, scopeKey, cosmosID)
+}
+
+// Get fetches the resource with the given name, addressing it by the resource
+// ID built from the CRUD's parent and resource type.
+func (d *fleetResourceCRUD[InternalAPIType, CosmosAPIType]) Get(
+	ctx context.Context, resourceName string,
+) (*InternalAPIType, error) {
+	scopeKey := d.partitionKeyFromParentOrName(resourceName)
+	targetID, err := d.makeResourceIDPath(resourceName)
+	if err != nil {
+		return nil, fmt.Errorf("failed to make ResourceID path for '%s': %w", resourceName, err)
+	}
+	return get[InternalAPIType, CosmosAPIType](ctx, d.containerClient, scopeKey, targetID)
+}
+
+// List iterates the resources of this CRUD's type within the parent's
+// partition. It is only available on parent-scoped CRUDs, because the
+// partition key is taken from the parent resource ID.
+func (d *fleetResourceCRUD[InternalAPIType, CosmosAPIType]) List(
+	ctx context.Context, options *DBClientListResourceDocsOptions,
+) (DBClientIterator[InternalAPIType], error) {
+	scopeKey := topLevelResourceName(d.parentResourceID)
+	if scopeKey == "" {
+		return nil, fmt.Errorf("List requires a parent-scoped CRUD with a known partition key")
+	}
+	idPrefix, err := d.makeResourceIDPath("")
+	if err != nil {
+		return nil, fmt.Errorf("failed to make ResourceID prefix: %w", err)
+	}
+	// NOTE(review): the meaning of the trailing false is defined by list(),
+	// which is not visible here — confirm before changing it.
+	return list[InternalAPIType, CosmosAPIType](
+		ctx, d.containerClient, scopeKey, &d.resourceType, idPrefix, options, false,
+	)
+}
+
+// Create persists a new object, deriving the partition key from the object's
+// own resource ID.
+func (d *fleetResourceCRUD[InternalAPIType, CosmosAPIType]) Create(
+	ctx context.Context, newObj *InternalAPIType, options *azcosmos.ItemOptions,
+) (*InternalAPIType, error) {
+	key, err := partitionKeyFromObject(newObj)
+	if err != nil {
+		return nil, err
+	}
+	return createFleetItem[InternalAPIType, CosmosAPIType](ctx, d.containerClient, key, newObj, options)
+}
+
+// Replace overwrites an existing object, deriving the partition key from the
+// object's own resource ID.
+func (d *fleetResourceCRUD[InternalAPIType, CosmosAPIType]) Replace(
+	ctx context.Context, newObj *InternalAPIType, options *azcosmos.ItemOptions,
+) (*InternalAPIType, error) {
+	key, err := partitionKeyFromObject(newObj)
+	if err != nil {
+		return nil, err
+	}
+	return replaceFleetItem[InternalAPIType, CosmosAPIType](ctx, d.containerClient, key, newObj, options)
+}
+
+// Delete removes the resource with the given name, addressing it by the
+// resource ID built from the CRUD's parent and resource type.
+func (d *fleetResourceCRUD[InternalAPIType, CosmosAPIType]) Delete(
+	ctx context.Context, resourceName string,
+) error {
+	scopeKey := d.partitionKeyFromParentOrName(resourceName)
+	targetID, err := d.makeResourceIDPath(resourceName)
+	if err != nil {
+		return fmt.Errorf("failed to make ResourceID path for '%s': %w", resourceName, err)
+	}
+	return deleteResource(ctx, d.containerClient, scopeKey, targetID)
+}
+
+// AddCreateToTransaction is not supported for fleet resources and always
+// returns an error.
+func (d *fleetResourceCRUD[InternalAPIType, CosmosAPIType]) AddCreateToTransaction(
+	_ context.Context,
+	_ DBTransaction,
+	_ *InternalAPIType,
+	_ *azcosmos.TransactionalBatchItemOptions,
+) (string, error) {
+	return "", fmt.Errorf("AddCreateToTransaction is not implemented for fleet resources")
+}
+
+// AddReplaceToTransaction is not supported for fleet resources and always
+// returns an error.
+func (d *fleetResourceCRUD[InternalAPIType, CosmosAPIType]) AddReplaceToTransaction(
+	_ context.Context,
+	_ DBTransaction,
+	_ *InternalAPIType,
+	_ *azcosmos.TransactionalBatchItemOptions,
+) (string, error) {
+	return "", fmt.Errorf("AddReplaceToTransaction is not implemented for fleet resources")
+}
diff --git a/internal/database/crud_fleet_helpers.go b/internal/database/crud_fleet_helpers.go
new file mode 100644
index 00000000000..06d0873f295
--- /dev/null
+++ b/internal/database/crud_fleet_helpers.go
@@ -0,0 +1,148 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package database
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/data/azcosmos"
+
+	"github.com/Azure/ARO-HCP/internal/api/arm"
+)
+
+// partitionKeySetter is a temporary interface used to override the partition key
+// on serialized Cosmos documents for the Fleet container. The conversion layer
+// (InternalToCosmos) defaults partition keys to subscription ID, which is wrong
+// for fleet types where the partition key is the stamp identifier.
+//
+// This interface and TypedDocument.SetPartitionKey will be removed once
+// https://github.com/Azure/ARO-HCP/pull/5094 lands, which adds partition key
+// as a first-class field on CosmosMetadata with Get/SetPartitionKey. At that
+// point the CRUD layer sets the partition key on CosmosMetadata directly before
+// serialization, and the override is no longer needed.
+type partitionKeySetter interface {
+	SetPartitionKey(pk string)
+}
+
+// SetPartitionKey overwrites the document's partition key with pk.
+func (td *TypedDocument) SetPartitionKey(pk string) {
+	td.PartitionKey = pk
+}
+
+// serializeFleetItem serializes an object for the Fleet Cosmos container.
+// The partition key is provided by the CRUD layer rather than extracted from
+// the object, so any type that implements CosmosPersistable can be stored in
+// the Fleet container regardless of whether it carries fleet-specific accessors.
+//
+// It returns the object's CosmosMetadata together with the JSON payload. An
+// error is returned when the object does not implement arm.CosmosPersistable,
+// when its Cosmos ID is empty or not lowercase, or when conversion or
+// marshalling fails.
+func serializeFleetItem[InternalAPIType, CosmosAPIType any](
+	partitionKeyString string,
+	newObj *InternalAPIType,
+) (*arm.CosmosMetadata, []byte, error) {
+	cosmosPersistable, ok := any(newObj).(arm.CosmosPersistable)
+	if !ok {
+		return nil, nil, fmt.Errorf("type %T does not implement CosmosPersistable interface", newObj)
+	}
+	cosmosData := cosmosPersistable.GetCosmosData()
+	cosmosUID := cosmosData.GetCosmosUID()
+	if len(cosmosUID) == 0 {
+		return nil, nil, fmt.Errorf("no cosmos id found in object")
+	}
+	// Cosmos IDs must already be lowercase. This has to be an exact comparison:
+	// strings.EqualFold is case-insensitive, so comparing the ID against its
+	// own lowercased form with it would accept mixed-case IDs.
+	if cosmosUID != strings.ToLower(cosmosUID) {
+		return nil, nil, fmt.Errorf("invalid cosmos id found in object")
+	}
+
+	cosmosObj, err := InternalToCosmos[InternalAPIType, CosmosAPIType](newObj)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to convert internal object to Cosmos object: %w", err)
+	}
+
+	// The conversion layer may have set the wrong partition key (e.g.
+	// subscription ID for generic types). Override with the CRUD's
+	// partition key which is always the stamp identifier for Fleet.
+	// Documents that do not implement partitionKeySetter are left as converted.
+	// TODO: replace this with the functionality that will be introduced
+	// by https://github.com/Azure/ARO-HCP/pull/5094
+	if doc, ok := any(cosmosObj).(partitionKeySetter); ok {
+		doc.SetPartitionKey(partitionKeyString)
+	}
+
+	data, err := json.Marshal(cosmosObj)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to marshal Cosmos DB item for '%s': %w", cosmosData.ResourceID, err)
+	}
+
+	return cosmosData, data, nil
+}
+
+// createFleetItem creates newObj in the Fleet container under the given
+// (lowercase) partition key and returns the stored object as read back from
+// the write response. The caller's opts are never modified.
+func createFleetItem[InternalAPIType, CosmosAPIType any](
+	ctx context.Context,
+	containerClient *azcosmos.ContainerClient,
+	partitionKeyString string,
+	newObj *InternalAPIType,
+	opts *azcosmos.ItemOptions,
+) (*InternalAPIType, error) {
+	if strings.ToLower(partitionKeyString) != partitionKeyString {
+		return nil, fmt.Errorf("partitionKeyString must be lowercase, not: %q", partitionKeyString)
+	}
+	cosmosMetadata, data, err := serializeFleetItem[InternalAPIType, CosmosAPIType](partitionKeyString, newObj)
+	if err != nil {
+		return nil, err
+	}
+
+	// Work on a private copy so that options shared by the caller across
+	// several calls are not mutated behind its back.
+	var itemOpts azcosmos.ItemOptions
+	if opts != nil {
+		itemOpts = *opts
+	}
+	itemOpts.EnableContentResponseOnWrite = true
+
+	responseItem, err := containerClient.CreateItem(ctx, azcosmos.NewPartitionKeyString(partitionKeyString), data, &itemOpts)
+	if err != nil {
+		return nil, err
+	}
+
+	return responseItemToInternalObj[InternalAPIType, CosmosAPIType](ctx, cosmosMetadata.GetCosmosUID(), responseItem)
+}
+
+// replaceFleetItem replaces the stored document for newObj in the Fleet
+// container under the given (lowercase) partition key and returns the stored
+// object as read back from the write response. When the object carries a
+// Cosmos ETag it is sent as an If-Match precondition. The caller's opts are
+// never modified.
+func replaceFleetItem[InternalAPIType, CosmosAPIType any](
+	ctx context.Context,
+	containerClient *azcosmos.ContainerClient,
+	partitionKeyString string,
+	newObj *InternalAPIType,
+	opts *azcosmos.ItemOptions,
+) (*InternalAPIType, error) {
+	if strings.ToLower(partitionKeyString) != partitionKeyString {
+		return nil, fmt.Errorf("partitionKeyString must be lowercase, not: %q", partitionKeyString)
+	}
+	cosmosMetadata, data, err := serializeFleetItem[InternalAPIType, CosmosAPIType](partitionKeyString, newObj)
+	if err != nil {
+		return nil, err
+	}
+
+	// Work on a private copy: writing the ETag into the caller's options would
+	// leak it into later calls that reuse the same options value.
+	var itemOpts azcosmos.ItemOptions
+	if opts != nil {
+		itemOpts = *opts
+	}
+	if len(cosmosMetadata.CosmosETag) > 0 {
+		itemOpts.IfMatchEtag = &cosmosMetadata.CosmosETag
+	}
+	itemOpts.EnableContentResponseOnWrite = true
+
+	responseItem, err := containerClient.ReplaceItem(
+		ctx, azcosmos.NewPartitionKeyString(partitionKeyString), cosmosMetadata.GetCosmosUID(), data, &itemOpts,
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	return responseItemToInternalObj[InternalAPIType, CosmosAPIType](ctx, cosmosMetadata.GetCosmosUID(), responseItem)
+}
diff --git a/internal/database/crud_fleet_test.go b/internal/database/crud_fleet_test.go
new file mode 100644
index 00000000000..6ee3e95da1d
--- /dev/null
+++ b/internal/database/crud_fleet_test.go
@@ -0,0 +1,76 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package database
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
+)
+
+// TestTopLevelResourceName checks that topLevelResourceName returns the
+// lowercased name of the root ancestor for nil, top-level, nested and
+// mixed-case resource IDs.
+func TestTopLevelResourceName(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name     string
+		rid      *azcorearm.ResourceID
+		expected string
+	}{
+		{
+			name:     "nil resource ID returns empty string",
+			rid:      nil,
+			expected: "",
+		},
+		{
+			name:     "stamp resource ID returns stamp name",
+			rid:      mustParseResourceID(t, "/providers/microsoft.redhatopenshift/stamps/1"),
+			expected: "1",
+		},
+		{
+			name:     "management cluster returns top-level stamp name",
+			rid:      mustParseResourceID(t, "/providers/microsoft.redhatopenshift/stamps/abc/managementClusters/default"),
+			expected: "abc",
+		},
+		{
+			name:     "controller returns top-level stamp name",
+			rid:      mustParseResourceID(t, "/providers/microsoft.redhatopenshift/stamps/1/managementClusters/default/controllers/MyController"),
+			expected: "1",
+		},
+		{
+			name:     "mixed case stamp name is lowercased",
+			rid:      mustParseResourceID(t, "/providers/Microsoft.RedHatOpenShift/stamps/ABC"),
+			expected: "abc",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			actual := topLevelResourceName(tt.rid)
+			assert.Equal(t, tt.expected, actual)
+		})
+	}
+}
+
+// mustParseResourceID parses rawID and fails the test immediately when it is
+// not a valid resource ID.
+func mustParseResourceID(t *testing.T, rawID string) *azcorearm.ResourceID {
+	t.Helper()
+	rid, err := azcorearm.ParseResourceID(rawID)
+	if err != nil {
+		t.Fatalf("failed to parse resource ID %q: %v", rawID, err)
+	}
+	return rid
+}
diff --git a/internal/database/crud_nested_resource.go b/internal/database/crud_nested_resource.go
index c736078002f..8ad746c36d4 100644
--- a/internal/database/crud_nested_resource.go
+++ b/internal/database/crud_nested_resource.go
@@ -39,6 +39,15 @@ type ResourceCRUD[InternalAPIType any] interface {
 	AddReplaceToTransaction(ctx context.Context, transaction DBTransaction, newObj *InternalAPIType, opts *azcosmos.TransactionalBatchItemOptions) (string, error)
 }
 
+// ValidatingResourceCRUD is the CRUD surface for resources whose writes are
+// validated. It differs from ResourceCRUD in two ways: Replace also takes the
+// previous object (oldObj), and there are no transaction methods.
+type ValidatingResourceCRUD[InternalAPIType any] interface {
+	GetByID(ctx context.Context, cosmosID string) (*InternalAPIType, error)
+	Get(ctx context.Context, resourceID string) (*InternalAPIType, error)
+	List(ctx context.Context, opts *DBClientListResourceDocsOptions) (DBClientIterator[InternalAPIType], error)
+	Create(ctx context.Context, newObj *InternalAPIType, options *azcosmos.ItemOptions) (*InternalAPIType, error)
+	Replace(ctx context.Context, newObj *InternalAPIType, oldObj *InternalAPIType, options *azcosmos.ItemOptions) (*InternalAPIType, error)
+	Delete(ctx context.Context, resourceID string) error
+}
+
 type nestedCosmosResourceCRUD[InternalAPIType, CosmosAPIType any] struct {
 	containerClient *azcosmos.ContainerClient
 
diff --git a/internal/database/crud_validating.go b/internal/database/crud_validating.go
new file mode 100644
index 00000000000..d8dc4a85c73
--- /dev/null
+++ b/internal/database/crud_validating.go
@@ -0,0 +1,75 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package database
+
+import (
+	"context"
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/util/validation/field"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/data/azcosmos"
+)
+
+// validatingCRUD wraps a ResourceCRUD and runs type-specific validation before
+// Create and Replace. The wrapped CRUD is never exposed directly, so validation
+// cannot be bypassed.
+type validatingCRUD[InternalAPIType any] struct {
+	// inner performs the actual persistence once validation has passed.
+	inner ResourceCRUD[InternalAPIType]
+	// validateCreate is run on the new object before Create.
+	validateCreate func(context.Context, *InternalAPIType) field.ErrorList
+	// validateReplace is run on (new, old) before Replace.
+	validateReplace func(context.Context, *InternalAPIType, *InternalAPIType) field.ErrorList
+}
+
+// NewValidatingCRUD wraps inner so that validateCreate runs before every
+// Create and validateReplace runs before every Replace. Both validators are
+// called unconditionally, so neither may be nil.
+func NewValidatingCRUD[InternalAPIType any](
+	inner ResourceCRUD[InternalAPIType],
+	validateCreate func(context.Context, *InternalAPIType) field.ErrorList,
+	validateReplace func(context.Context, *InternalAPIType, *InternalAPIType) field.ErrorList,
+) ValidatingResourceCRUD[InternalAPIType] {
+	wrapper := new(validatingCRUD[InternalAPIType])
+	wrapper.inner = inner
+	wrapper.validateCreate = validateCreate
+	wrapper.validateReplace = validateReplace
+	return wrapper
+}
+
+// GetByID delegates to the wrapped CRUD; reads are not validated.
+func (v *validatingCRUD[InternalAPIType]) GetByID(ctx context.Context, cosmosID string) (*InternalAPIType, error) {
+	return v.inner.GetByID(ctx, cosmosID)
+}
+
+// Get delegates to the wrapped CRUD; reads are not validated.
+func (v *validatingCRUD[InternalAPIType]) Get(ctx context.Context, resourceID string) (*InternalAPIType, error) {
+	return v.inner.Get(ctx, resourceID)
+}
+
+// List delegates to the wrapped CRUD; reads are not validated.
+func (v *validatingCRUD[InternalAPIType]) List(ctx context.Context, opts *DBClientListResourceDocsOptions) (DBClientIterator[InternalAPIType], error) {
+	return v.inner.List(ctx, opts)
+}
+
+// Create validates newObj and, only when validation reports no errors, hands
+// it to the wrapped CRUD. Validation errors are returned wrapped.
+func (v *validatingCRUD[InternalAPIType]) Create(ctx context.Context, newObj *InternalAPIType, options *azcosmos.ItemOptions) (*InternalAPIType, error) {
+	if aggregate := v.validateCreate(ctx, newObj).ToAggregate(); aggregate != nil {
+		return nil, fmt.Errorf("create validation failed: %w", aggregate)
+	}
+	return v.inner.Create(ctx, newObj, options)
+}
+
+// Replace validates the transition from oldObj to newObj and, only when
+// validation reports no errors, hands newObj to the wrapped CRUD. oldObj is
+// used for validation only.
+func (v *validatingCRUD[InternalAPIType]) Replace(ctx context.Context, newObj, oldObj *InternalAPIType, options *azcosmos.ItemOptions) (*InternalAPIType, error) {
+	if aggregate := v.validateReplace(ctx, newObj, oldObj).ToAggregate(); aggregate != nil {
+		return nil, fmt.Errorf("replace validation failed: %w", aggregate)
+	}
+	return v.inner.Replace(ctx, newObj, options)
+}
+
+// Delete delegates to the wrapped CRUD; deletes are not validated.
+func (v *validatingCRUD[InternalAPIType]) Delete(ctx context.Context, resourceID string) error {
+	return v.inner.Delete(ctx, resourceID)
+}
diff --git a/internal/database/fleet_client.go b/internal/database/fleet_client.go
new file mode 100644
index 00000000000..5acce086661
--- /dev/null
+++ b/internal/database/fleet_client.go
@@ -0,0 +1,158 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package database
+
+import (
+	"fmt"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/data/azcosmos"
+
+	"github.com/Azure/ARO-HCP/internal/api"
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
+	"github.com/Azure/ARO-HCP/internal/utils"
+	"github.com/Azure/ARO-HCP/internal/validation"
+)
+
+const fleetContainer = "Fleet"
+
+// FleetDBClient is the database surface for the Fleet Cosmos container.
+// It is intentionally separate from ResourcesDBClient because the Fleet
+// container holds management cluster inventory data with its own access
+// patterns and credential scoping.
+type FleetDBClient interface {
+	Stamps() StampsCRUD
+	GlobalListers() FleetGlobalListers
+}
+
+// StampsCRUD provides CRUD operations for stamps and access to their
+// nested management cluster sub-resources.
+type StampsCRUD interface {
+	ValidatingResourceCRUD[fleet.Stamp]
+	ManagementClusters(stampIdentifier string) ManagementClustersCRUD
+}
+
+// ManagementClustersCRUD provides CRUD operations for management clusters
+// and access to their nested controller status documents.
+type ManagementClustersCRUD interface { + ValidatingResourceCRUD[fleet.ManagementCluster] + Controllers() ResourceCRUD[api.Controller] +} + +// FleetGlobalListers provides cross-partition listers for fleet resource types. +type FleetGlobalListers interface { + Stamps() GlobalLister[fleet.Stamp] + ManagementClusters() GlobalLister[fleet.ManagementCluster] +} + +type cosmosFleetDBClient struct { + container *azcosmos.ContainerClient +} + +var _ FleetDBClient = &cosmosFleetDBClient{} + +// NewFleetDBClient instantiates a FleetDBClient from a Cosmos DatabaseClient. +func NewFleetDBClient(database *azcosmos.DatabaseClient) (FleetDBClient, error) { + container, err := database.NewContainer(fleetContainer) + if err != nil { + return nil, utils.TrackError(err) + } + return &cosmosFleetDBClient{container: container}, nil +} + +// NewFleetDBClientFromContainer wraps an already-opened container client. +func NewFleetDBClientFromContainer(container *azcosmos.ContainerClient) FleetDBClient { + return &cosmosFleetDBClient{container: container} +} + +func (c *cosmosFleetDBClient) Stamps() StampsCRUD { + inner := &fleetResourceCRUD[fleet.Stamp, GenericDocument[fleet.Stamp]]{ + containerClient: c.container, + resourceType: fleet.StampResourceType, + } + return &cosmosStampsCRUD{ + ValidatingResourceCRUD: NewValidatingCRUD(inner, + validation.ValidateStampCreate, + validation.ValidateStampUpdate, + ), + containerClient: c.container, + } +} + +func (c *cosmosFleetDBClient) GlobalListers() FleetGlobalListers { + return &cosmosFleetGlobalListers{container: c.container} +} + +type cosmosStampsCRUD struct { + ValidatingResourceCRUD[fleet.Stamp] + containerClient *azcosmos.ContainerClient +} + +func (s *cosmosStampsCRUD) ManagementClusters(stampIdentifier string) ManagementClustersCRUD { + stampResourceID, err := fleet.ToStampResourceID(stampIdentifier) + if err != nil { + panic(fmt.Sprintf("invalid stamp identifier %q: %v", stampIdentifier, err)) + } + inner := 
&fleetResourceCRUD[fleet.ManagementCluster, GenericDocument[fleet.ManagementCluster]]{ + containerClient: s.containerClient, + parentResourceID: stampResourceID, + resourceType: fleet.ManagementClusterResourceType, + } + return &cosmosManagementClustersCRUD{ + ValidatingResourceCRUD: NewValidatingCRUD(inner, + validation.ValidateManagementClusterCreate, + validation.ValidateManagementClusterUpdate, + ), + containerClient: s.containerClient, + stampIdentifier: stampIdentifier, + } +} + +type cosmosManagementClustersCRUD struct { + ValidatingResourceCRUD[fleet.ManagementCluster] + containerClient *azcosmos.ContainerClient + stampIdentifier string +} + +func (m *cosmosManagementClustersCRUD) Controllers() ResourceCRUD[api.Controller] { + mcResourceID, err := fleet.ToManagementClusterResourceID(m.stampIdentifier) + if err != nil { + panic(fmt.Sprintf("invalid stamp identifier %q: %v", m.stampIdentifier, err)) + } + return &fleetResourceCRUD[api.Controller, GenericDocument[api.Controller]]{ + containerClient: m.containerClient, + parentResourceID: mcResourceID, + resourceType: fleet.ManagementClusterControllerResourceType, + } +} + +type cosmosFleetGlobalListers struct { + container *azcosmos.ContainerClient +} + +var _ FleetGlobalListers = &cosmosFleetGlobalListers{} + +func (g *cosmosFleetGlobalListers) Stamps() GlobalLister[fleet.Stamp] { + return &cosmosGlobalLister[fleet.Stamp, GenericDocument[fleet.Stamp]]{ + containerClient: g.container, + resourceType: fleet.StampResourceType, + } +} + +func (g *cosmosFleetGlobalListers) ManagementClusters() GlobalLister[fleet.ManagementCluster] { + return &cosmosGlobalLister[fleet.ManagementCluster, GenericDocument[fleet.ManagementCluster]]{ + containerClient: g.container, + resourceType: fleet.ManagementClusterResourceType, + } +} diff --git a/internal/database/informers/fleet_index_funcs.go b/internal/database/informers/fleet_index_funcs.go new file mode 100644 index 00000000000..69c40775435 --- /dev/null +++ 
b/internal/database/informers/fleet_index_funcs.go @@ -0,0 +1,33 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package informers + +import ( + "fmt" + + "github.com/Azure/ARO-HCP/internal/api/fleet" + "github.com/Azure/ARO-HCP/internal/utils" +) + +func managementClusterProvisionShardIDIndexFunc(obj interface{}) ([]string, error) { + mc, ok := obj.(*fleet.ManagementCluster) + if !ok { + return nil, utils.TrackError(fmt.Errorf("expected *fleet.ManagementCluster, got %T", obj)) + } + if mc.Status.ClusterServiceProvisionShardID == nil || len(mc.Status.ClusterServiceProvisionShardID.ID()) == 0 { + return nil, nil + } + return []string{mc.Status.ClusterServiceProvisionShardID.ID()}, nil +} diff --git a/internal/database/informers/fleet_informers.go b/internal/database/informers/fleet_informers.go new file mode 100644 index 00000000000..babf5491312 --- /dev/null +++ b/internal/database/informers/fleet_informers.go @@ -0,0 +1,128 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package informers + +import ( + "context" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/tools/cache" + + "github.com/Azure/ARO-HCP/internal/api/fleet" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/database/listers" + "github.com/Azure/ARO-HCP/internal/utils" +) + +const ( + StampRelistDuration = 2 * time.Minute + ManagementClusterRelistDuration = 2 * time.Minute +) + +// NewStampInformer creates an unstarted SharedIndexInformer for stamps +// with the default relist duration. +func NewStampInformer(lister database.GlobalLister[fleet.Stamp]) cache.SharedIndexInformer { + return NewStampInformerWithRelistDuration(lister, StampRelistDuration) +} + +// NewStampInformerWithRelistDuration creates an unstarted SharedIndexInformer for stamps +// with a configurable relist duration. 
+func NewStampInformerWithRelistDuration(lister database.GlobalLister[fleet.Stamp], relistDuration time.Duration) cache.SharedIndexInformer {
+	// listStamps reads every stamp through the global lister and returns
+	// them as a single list object.
+	listStamps := func(ctx context.Context, _ metav1.ListOptions) (runtime.Object, error) {
+		log := utils.LoggerFromContext(ctx)
+		log.Info("listing stamps")
+		defer log.Info("finished listing stamps")
+
+		stampIter, err := lister.List(ctx, nil)
+		if err != nil {
+			return nil, err
+		}
+
+		stamps := &fleet.StampList{}
+		stamps.ResourceVersion = "0"
+		for _, stamp := range stampIter.Items(ctx) {
+			stamps.Items = append(stamps.Items, *stamp)
+		}
+		// Iteration errors only surface after the loop has ended.
+		if iterErr := stampIter.GetError(); iterErr != nil {
+			return nil, iterErr
+		}
+		return stamps, nil
+	}
+
+	// expiringWatch stands in for a real watch: the watcher it returns ends
+	// after relistDuration.
+	expiringWatch := func(ctx context.Context, _ metav1.ListOptions) (watch.Interface, error) {
+		return newExpiringWatcher(ctx, relistDuration), nil
+	}
+
+	source := &listWatchWithoutWatchListSemantics{
+		ListWatch: &cache.ListWatch{
+			ListWithContextFunc:  listStamps,
+			WatchFuncWithContext: expiringWatch,
+		},
+	}
+	return cache.NewSharedIndexInformerWithOptions(
+		source,
+		&fleet.Stamp{},
+		cache.SharedIndexInformerOptions{ResyncPeriod: time.Hour},
+	)
+}
+
+// NewManagementClusterInformer builds an unstarted SharedIndexInformer for
+// management clusters that relists every ManagementClusterRelistDuration.
+func NewManagementClusterInformer(lister database.GlobalLister[fleet.ManagementCluster]) cache.SharedIndexInformer {
+	return NewManagementClusterInformerWithRelistDuration(lister, ManagementClusterRelistDuration)
+}
+
+// NewManagementClusterInformerWithRelistDuration creates an unstarted SharedIndexInformer for management clusters
+// with a configurable relist duration.
+func NewManagementClusterInformerWithRelistDuration(lister database.GlobalLister[fleet.ManagementCluster], relistDuration time.Duration) cache.SharedIndexInformer {
+	// listManagementClusters reads every management cluster through the
+	// global lister and returns them as a single list object.
+	listManagementClusters := func(ctx context.Context, _ metav1.ListOptions) (runtime.Object, error) {
+		log := utils.LoggerFromContext(ctx)
+		log.Info("listing management clusters")
+		defer log.Info("finished listing management clusters")
+
+		clusterIter, err := lister.List(ctx, nil)
+		if err != nil {
+			return nil, err
+		}
+
+		clusters := &fleet.ManagementClusterList{}
+		clusters.ResourceVersion = "0"
+		for _, cluster := range clusterIter.Items(ctx) {
+			clusters.Items = append(clusters.Items, *cluster)
+		}
+		// Iteration errors only surface after the loop has ended.
+		if iterErr := clusterIter.GetError(); iterErr != nil {
+			return nil, iterErr
+		}
+		return clusters, nil
+	}
+
+	// expiringWatch stands in for a real watch: the watcher it returns ends
+	// after relistDuration.
+	expiringWatch := func(ctx context.Context, _ metav1.ListOptions) (watch.Interface, error) {
+		return newExpiringWatcher(ctx, relistDuration), nil
+	}
+
+	source := &listWatchWithoutWatchListSemantics{
+		ListWatch: &cache.ListWatch{
+			ListWithContextFunc:  listManagementClusters,
+			WatchFuncWithContext: expiringWatch,
+		},
+	}
+	// The extra index lets listers look a management cluster up by its
+	// Cluster Service provision shard ID.
+	informerOptions := cache.SharedIndexInformerOptions{
+		ResyncPeriod: time.Hour,
+		Indexers: cache.Indexers{
+			listers.ByCSProvisionShard: managementClusterProvisionShardIDIndexFunc,
+		},
+	}
+	return cache.NewSharedIndexInformerWithOptions(source, &fleet.ManagementCluster{}, informerOptions)
+}
diff --git a/internal/database/informers/fleet_types.go b/internal/database/informers/fleet_types.go
new file mode 100644
index 00000000000..00715601342
--- /dev/null
+++ b/internal/database/informers/fleet_types.go
@@ -0,0 +1,98 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package informers
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"k8s.io/client-go/tools/cache"
+
+	"github.com/Azure/ARO-HCP/internal/database"
+	"github.com/Azure/ARO-HCP/internal/database/listers"
+	"github.com/Azure/ARO-HCP/internal/utils"
+)
+
+// FleetInformers bundles one SharedIndexInformer per fleet type plus the
+// matching listers. Both the fleet management binary (future) and the
+// backend (cross-partition) construct one of these with the appropriate
+// database.FleetGlobalListers — the factory does not care which.
+type FleetInformers interface {
+	// Stamps returns the stamp informer and the lister for stamps.
+	Stamps() (cache.SharedIndexInformer, listers.StampLister)
+	// ManagementClusters returns the management cluster informer and the
+	// lister for management clusters.
+	ManagementClusters() (cache.SharedIndexInformer, listers.ManagementClusterLister)
+	// RunWithContext runs the bundled informers using ctx.
+	RunWithContext(ctx context.Context)
+}
+
+// fleetInformers is the FleetInformers implementation: it simply holds one
+// informer/lister pair per fleet resource type.
+type fleetInformers struct {
+	stampInformer             cache.SharedIndexInformer
+	stampLister               listers.StampLister
+	managementClusterInformer cache.SharedIndexInformer
+	managementClusterLister   listers.ManagementClusterLister
+}
+
+// Stamps returns the stored stamp informer and stamp lister.
+func (f *fleetInformers) Stamps() (cache.SharedIndexInformer, listers.StampLister) {
+	return f.stampInformer, f.stampLister
+}
+
+// ManagementClusters returns the stored management cluster informer and lister.
+func (f *fleetInformers) ManagementClusters() (cache.SharedIndexInformer, listers.ManagementClusterLister) {
+	return f.managementClusterInformer, f.managementClusterLister
+}
+
+// NewFleetInformers creates FleetInformers with default relist durations.
+// It passes a nil relist duration to NewFleetInformersWithRelistDuration.
+func NewFleetInformers(ctx context.Context, gl database.FleetGlobalListers) FleetInformers {
+	return NewFleetInformersWithRelistDuration(ctx, gl, nil)
+}
+
+// NewFleetInformersWithRelistDuration creates FleetInformers with a configurable relist duration.
+func NewFleetInformersWithRelistDuration(ctx context.Context, gl database.FleetGlobalListers, relistDuration *time.Duration) FleetInformers {
+	// A nil relistDuration keeps the per-type defaults; otherwise the one
+	// supplied value overrides both.
+	stampRelist, clusterRelist := StampRelistDuration, ManagementClusterRelistDuration
+	if relistDuration != nil {
+		stampRelist, clusterRelist = *relistDuration, *relistDuration
+	}
+
+	// Each lister reads from the indexer of the informer built just before it.
+	stampInformer := NewStampInformerWithRelistDuration(gl.Stamps(), stampRelist)
+	stampLister := listers.NewStampLister(stampInformer.GetIndexer())
+	clusterInformer := NewManagementClusterInformerWithRelistDuration(gl.ManagementClusters(), clusterRelist)
+	clusterLister := listers.NewManagementClusterLister(clusterInformer.GetIndexer())
+
+	return &fleetInformers{
+		stampInformer:             stampInformer,
+		stampLister:               stampLister,
+		managementClusterInformer: clusterInformer,
+		managementClusterLister:   clusterLister,
+	}
+}
+
+// RunWithContext runs both informers, each in its own goroutine, and returns
+// only once ctx is done and both informer goroutines have finished.
+func (f *fleetInformers) RunWithContext(ctx context.Context) {
+	log := utils.LoggerFromContext(ctx)
+	log.Info("starting fleet informers")
+	defer log.Info("stopped fleet informers")
+
+	var running sync.WaitGroup
+	for _, informer := range []cache.SharedIndexInformer{f.stampInformer, f.managementClusterInformer} {
+		running.Add(1)
+		go func(inf cache.SharedIndexInformer) {
+			defer running.Done()
+			inf.RunWithContext(ctx)
+		}(informer)
+	}
+
+	<-ctx.Done()
+	running.Wait()
+}
diff --git a/internal/database/informers/list_watch.go b/internal/database/informers/list_watch.go
new file mode 100644
index 00000000000..3eda5f29c61
--- /dev/null
+++ b/internal/database/informers/list_watch.go
@@ -0,0 +1,81 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package informers
+
+import (
+	"context"
+	"net/http"
+	"time"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/watch"
+	"k8s.io/client-go/tools/cache"
+)
+
+// listWatchWithoutWatchListSemantics opts out of WatchListClient semantics.
+// Mirrors the unexported wrapper from client-go/tools/cache/listwatch.go.
+// Cosmos-backed informers use newExpiringWatcher which does not support
+// the bookmark protocol that WatchListClient requires.
+type listWatchWithoutWatchListSemantics struct {
+	*cache.ListWatch
+}
+
+// IsWatchListSemanticsUnSupported always reports true.
+// NOTE(review): presumably client-go probes for this method to decide whether
+// to skip the watch-list path — confirm against the vendored client-go version.
+func (listWatchWithoutWatchListSemantics) IsWatchListSemanticsUnSupported() bool { return true }
+
+// expiringWatcher implements watch.Interface and sends an expired error after
+// the configured duration to cause the reflector to relist. This drives
+// SharedInformers backed by non-Kubernetes data sources like Cosmos that have
+// no native watch protocol.
+type expiringWatcher struct {
+	// result is the event channel handed out by ResultChan.
+	result chan watch.Event
+	// done is closed by Stop.
+	done chan struct{}
+}
+
+// newExpiringWatcher creates a watcher that terminates after the given
+// duration by sending an HTTP 410 Gone / StatusReasonExpired error.
+//
+// The goroutine behind the watcher ends, and closes the result channel, as
+// soon as any of these happens: the expiry event has been delivered, Stop is
+// called, or ctx is cancelled.
+func newExpiringWatcher(ctx context.Context, expiry time.Duration) watch.Interface {
+	w := &expiringWatcher{
+		result: make(chan watch.Event),
+		done:   make(chan struct{}),
+	}
+	go func() {
+		// Closing result on every exit path is how the consumer learns that
+		// the watch is over.
+		defer close(w.result)
+
+		// A stoppable timer releases its resources as soon as the watcher is
+		// stopped instead of lingering for the rest of expiry.
+		timer := time.NewTimer(expiry)
+		defer timer.Stop()
+
+		select {
+		case <-timer.C:
+		case <-w.done:
+			return
+		case <-ctx.Done():
+			return
+		}
+
+		expired := watch.Event{
+			Type: watch.Error,
+			Object: &metav1.Status{
+				Status:  metav1.StatusFailure,
+				Code:    http.StatusGone,
+				Reason:  metav1.StatusReasonExpired,
+				Message: "watch expired",
+			},
+		}
+		// result is unbuffered. If the consumer stopped reading at the moment
+		// the timer fired, a bare send would block this goroutine forever and
+		// result would never be closed, so the send must also give up on Stop
+		// or cancellation.
+		select {
+		case w.result <- expired:
+		case <-w.done:
+		case <-ctx.Done():
+		}
+	}()
+	return w
+}
+
+// Stop tells the watcher goroutine to exit by closing done. Calling it again
+// after done is closed is a no-op.
+// NOTE(review): the check-then-close is not atomic, so two goroutines calling
+// Stop at the same time could both reach close — confirm Stop is only ever
+// called from a single goroutine.
+func (w *expiringWatcher) Stop() {
+	select {
+	case <-w.done:
+	default:
+		close(w.done)
+	}
+}
+
+// ResultChan returns the channel on which the expiry event is delivered; the
+// watcher goroutine closes it when it exits.
+func (w *expiringWatcher) ResultChan() <-chan watch.Event { return w.result }
diff --git a/internal/database/listers/management_cluster_lister.go b/internal/database/listers/management_cluster_lister.go
new file mode 100644
index 00000000000..00bd9bc8bf3
--- /dev/null
+++ b/internal/database/listers/management_cluster_lister.go
@@ -0,0 +1,70 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package listers
+
+import (
+	"context"
+	"fmt"
+
+	"k8s.io/client-go/tools/cache"
+
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
+	"github.com/Azure/ARO-HCP/internal/database"
+)
+
+// ManagementClusterLister lists and gets management clusters from an informer's indexer.
+type ManagementClusterLister interface {
+	// List returns every management cluster held by the lister.
+	List(ctx context.Context) ([]*fleet.ManagementCluster, error)
+	// Get returns the management cluster that belongs to the given stamp identifier.
+	Get(ctx context.Context, stampIdentifier string) (*fleet.ManagementCluster, error)
+	// GetByCSProvisionShardID returns the management cluster associated with
+	// the given Cluster Service provision shard ID.
+	GetByCSProvisionShardID(ctx context.Context, shardID string) (*fleet.ManagementCluster, error)
+}
+
+// informerBasedManagementClusterLister implements ManagementClusterLister backed by a SharedIndexInformer.
+type informerBasedManagementClusterLister struct {
+	// indexer is the store every lookup reads from.
+	indexer cache.Indexer
+}
+
+// NewManagementClusterLister creates a ManagementClusterLister from a SharedIndexInformer's indexer.
+func NewManagementClusterLister(indexer cache.Indexer) ManagementClusterLister {
+	return &informerBasedManagementClusterLister{
+		indexer: indexer,
+	}
+}
+
+// List returns all management clusters currently in the indexer. ctx is not used.
+func (l *informerBasedManagementClusterLister) List(ctx context.Context) ([]*fleet.ManagementCluster, error) {
+	return listAll[fleet.ManagementCluster](l.indexer)
+}
+
+// Get retrieves a single management cluster by stamp identifier.
+// The indexer key is the management cluster resource ID string derived from
+// stampIdentifier. ctx is not used.
+func (l *informerBasedManagementClusterLister) Get(ctx context.Context, stampIdentifier string) (*fleet.ManagementCluster, error) {
+	key := fleet.ToManagementClusterResourceIDString(stampIdentifier)
+	return getByKey[fleet.ManagementCluster](l.indexer, key)
+}
+
+// GetByCSProvisionShardID retrieves a single management cluster by its CS provision shard ID.
+func (l *informerBasedManagementClusterLister) GetByCSProvisionShardID(ctx context.Context, shardID string) (*fleet.ManagementCluster, error) {
+	matches, err := listFromIndex[fleet.ManagementCluster](l.indexer, ByCSProvisionShard, shardID)
+	if err != nil {
+		return nil, err
+	}
+
+	// Exactly one match is the only success case: none is reported as a
+	// not-found error and several as an error naming the count.
+	if len(matches) == 0 {
+		return nil, database.NewNotFoundError()
+	}
+	if len(matches) > 1 {
+		return nil, fmt.Errorf("expected at most 1 management cluster for CS provision shard ID %q, got %d", shardID, len(matches))
+	}
+	return matches[0], nil
+}
diff --git a/internal/database/listers/stamp_lister.go b/internal/database/listers/stamp_lister.go
new file mode 100644
index 00000000000..8aa3a0b2ee3
--- /dev/null
+++ b/internal/database/listers/stamp_lister.go
@@ -0,0 +1,49 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package listers
+
+import (
+	"context"
+
+	"k8s.io/client-go/tools/cache"
+
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
+)
+
+// StampLister lists and gets stamps from an informer's indexer.
+type StampLister interface {
+	// List returns every stamp held by the lister.
+	List(ctx context.Context) ([]*fleet.Stamp, error)
+	// Get returns the stamp with the given stamp identifier.
+	Get(ctx context.Context, stampIdentifier string) (*fleet.Stamp, error)
+}
+
+// informerBasedStampLister is a StampLister that reads from an indexer.
+type informerBasedStampLister struct {
+	indexer cache.Indexer
+}
+
+// NewStampLister creates a StampLister from a SharedIndexInformer's indexer.
+func NewStampLister(indexer cache.Indexer) StampLister {
+	return &informerBasedStampLister{indexer: indexer}
+}
+
+// List returns all stamps currently in the indexer. ctx is not used.
+func (l *informerBasedStampLister) List(ctx context.Context) ([]*fleet.Stamp, error) {
+	return listAll[fleet.Stamp](l.indexer)
+}
+
+// Get looks a stamp up under the stamp resource ID string derived from
+// stampIdentifier. ctx is not used.
+func (l *informerBasedStampLister) Get(ctx context.Context, stampIdentifier string) (*fleet.Stamp, error) {
+	return getByKey[fleet.Stamp](l.indexer, fleet.ToStampResourceIDString(stampIdentifier))
+}
diff --git a/internal/database/listers/types.go b/internal/database/listers/types.go
new file mode 100644
index 00000000000..267d4b016a2
--- /dev/null
+++ b/internal/database/listers/types.go
@@ -0,0 +1,75 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package listers
+
+import (
+	"fmt"
+
+	"k8s.io/client-go/tools/cache"
+
+	"github.com/Azure/ARO-HCP/internal/database"
+	"github.com/Azure/ARO-HCP/internal/utils"
+)
+
+const (
+	// ByCSProvisionShard is the name of the index keyed by Cluster Service
+	// provision shard ID.
+	ByCSProvisionShard = "byCSProvisionShard"
+)
+
+// listAll returns everything in the store as []*T. It fails on the first
+// element that is not a *T.
+func listAll[T any](store cache.Store) ([]*T, error) {
+	raw := store.List()
+	typedItems := make([]*T, len(raw))
+	for i, obj := range raw {
+		typed, ok := obj.(*T)
+		if !ok {
+			return nil, utils.TrackError(fmt.Errorf("expected *%T, got %T", *new(T), obj))
+		}
+		typedItems[i] = typed
+	}
+	return typedItems, nil
+}
+
+// getByKey retrieves a single item from an indexer by key, casting it to *T.
+func getByKey[T any](indexer cache.Indexer, key string) (*T, error) {
+	obj, found, err := indexer.GetByKey(key)
+	switch {
+	case err != nil:
+		return nil, utils.TrackError(err)
+	case !found:
+		// A missing key is reported as the database not-found error.
+		return nil, database.NewNotFoundError()
+	}
+
+	if typed, ok := obj.(*T); ok {
+		return typed, nil
+	}
+	return nil, utils.TrackError(fmt.Errorf("expected *%T, got %T", *new(T), obj))
+}
+
+// listFromIndex returns, as []*T, the objects the named index holds under
+// key. It fails on the first element that is not a *T.
+func listFromIndex[T any](indexer cache.Indexer, indexName, key string) ([]*T, error) {
+	raw, err := indexer.ByIndex(indexName, key)
+	if err != nil {
+		return nil, utils.TrackError(err)
+	}
+
+	typedItems := make([]*T, len(raw))
+	for i, obj := range raw {
+		typed, ok := obj.(*T)
+		if !ok {
+			return nil, utils.TrackError(fmt.Errorf("expected *%T, got %T", *new(T), obj))
+		}
+		typedItems[i] = typed
+	}
+	return typedItems, nil
+}
diff --git a/internal/database/listertesting/slice_listers.go b/internal/database/listertesting/slice_listers.go
new file mode 100644
index 00000000000..ce3ee8567e0
--- /dev/null
+++ b/internal/database/listertesting/slice_listers.go
@@ -0,0 +1,84 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package listertesting + +import ( + "context" + "fmt" + "strings" + + "github.com/Azure/ARO-HCP/internal/api/fleet" + "github.com/Azure/ARO-HCP/internal/database" + dblisters "github.com/Azure/ARO-HCP/internal/database/listers" +) + +// SliceStampLister implements dblisters.StampLister backed by a slice. +type SliceStampLister struct { + Stamps []*fleet.Stamp +} + +var _ dblisters.StampLister = &SliceStampLister{} + +func (l *SliceStampLister) List(ctx context.Context) ([]*fleet.Stamp, error) { + return l.Stamps, nil +} + +func (l *SliceStampLister) Get(ctx context.Context, stampIdentifier string) (*fleet.Stamp, error) { + key := fleet.ToStampResourceIDString(stampIdentifier) + for _, s := range l.Stamps { + if s.CosmosMetadata.ResourceID != nil && strings.EqualFold(s.CosmosMetadata.ResourceID.String(), key) { + return s, nil + } + } + return nil, database.NewNotFoundError() +} + +// SliceManagementClusterLister implements dblisters.ManagementClusterLister backed by a slice. +type SliceManagementClusterLister struct { + ManagementClusters []*fleet.ManagementCluster +} + +var _ dblisters.ManagementClusterLister = &SliceManagementClusterLister{} + +func (l *SliceManagementClusterLister) List(ctx context.Context) ([]*fleet.ManagementCluster, error) { + return l.ManagementClusters, nil +} + +func (l *SliceManagementClusterLister) Get(ctx context.Context, stampIdentifier string) (*fleet.ManagementCluster, error) { + key := fleet.ToManagementClusterResourceIDString(stampIdentifier) + for _, mc := range l.ManagementClusters { + if mc.ResourceID != nil && strings.EqualFold(mc.ResourceID.String(), key) { + return mc, nil + } + } + return nil, database.NewNotFoundError() +} + +func (l *SliceManagementClusterLister) GetByCSProvisionShardID(ctx context.Context, shardID string) (*fleet.ManagementCluster, error) { + var matches []*fleet.ManagementCluster + for _, mc := range l.ManagementClusters { + if mc.Status.ClusterServiceProvisionShardID != nil && 
mc.Status.ClusterServiceProvisionShardID.ID() == shardID { + matches = append(matches, mc) + } + } + switch len(matches) { + case 0: + return nil, database.NewNotFoundError() + case 1: + return matches[0], nil + default: + return nil, fmt.Errorf("expected at most 1 management cluster for CS provision shard ID %q, got %d", shardID, len(matches)) + } +} diff --git a/internal/database/listertesting/slice_listers_test.go b/internal/database/listertesting/slice_listers_test.go new file mode 100644 index 00000000000..73b68a0bc33 --- /dev/null +++ b/internal/database/listertesting/slice_listers_test.go @@ -0,0 +1,96 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package listertesting + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "k8s.io/utils/ptr" + + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/api/fleet" + "github.com/Azure/ARO-HCP/internal/database" +) + +func newTestManagementCluster(name, shardID string) *fleet.ManagementCluster { + resourceID := api.Must(fleet.ToManagementClusterResourceID(name)) + return &fleet.ManagementCluster{ + CosmosMetadata: api.CosmosMetadata{ + ResourceID: resourceID, + }, + ResourceID: resourceID, + Status: fleet.ManagementClusterStatus{ + ClusterServiceProvisionShardID: ptr.To(api.Must(api.NewInternalID("/api/aro_hcp/v1alpha1/provision_shards/" + shardID))), + }, + } +} + +func TestSliceManagementClusterLister(t *testing.T) { + mc1 := newTestManagementCluster("m1", "11111111-1111-1111-1111-111111111111") + mc2 := newTestManagementCluster("m2", "22222222-2222-2222-2222-222222222222") + + lister := &SliceManagementClusterLister{ + ManagementClusters: []*fleet.ManagementCluster{mc1, mc2}, + } + + ctx := context.Background() + + t.Run("List returns all management clusters", func(t *testing.T) { + result, err := lister.List(ctx) + require.NoError(t, err) + assert.Len(t, result, 2) + }) + + t.Run("Get returns matching management cluster", func(t *testing.T) { + result, err := lister.Get(ctx, "m1") + require.NoError(t, err) + assert.Equal(t, "m1", result.ResourceID.Parent.Name) + }) + + t.Run("Get returns not found for non-existent management cluster", func(t *testing.T) { + _, err := lister.Get(ctx, "non-existent") + require.Error(t, err) + assert.True(t, database.IsNotFoundError(err)) + }) + + t.Run("GetByCSProvisionShard returns matching management cluster", func(t *testing.T) { + csShardID := api.Must(api.NewInternalID("/api/aro_hcp/v1alpha1/provision_shards/11111111-1111-1111-1111-111111111111")) + result, err := lister.GetByCSProvisionShardID(ctx, csShardID.ID()) + 
require.NoError(t, err) + assert.Equal(t, "m1", result.ResourceID.Parent.Name) + }) + + t.Run("GetByCSProvisionShard returns not found for non-existent shard", func(t *testing.T) { + csShardID := api.Must(api.NewInternalID("/api/aro_hcp/v1alpha1/provision_shards/99999999-9999-9999-9999-999999999999")) + _, err := lister.GetByCSProvisionShardID(ctx, csShardID.ID()) + require.Error(t, err) + assert.True(t, database.IsNotFoundError(err)) + }) + + t.Run("GetByCSProvisionShard returns error for duplicate shards", func(t *testing.T) { + mc3 := newTestManagementCluster("m3", "11111111-1111-1111-1111-111111111111") + dupLister := &SliceManagementClusterLister{ + ManagementClusters: []*fleet.ManagementCluster{mc1, mc3}, + } + csShardID := api.Must(api.NewInternalID("/api/aro_hcp/v1alpha1/provision_shards/11111111-1111-1111-1111-111111111111")) + _, err := dupLister.GetByCSProvisionShardID(ctx, csShardID.ID()) + require.Error(t, err) + assert.Contains(t, err.Error(), "expected at most 1") + }) +} diff --git a/internal/databasetesting/mock_fleet_client.go b/internal/databasetesting/mock_fleet_client.go new file mode 100644 index 00000000000..d7d3b5c0f20 --- /dev/null +++ b/internal/databasetesting/mock_fleet_client.go @@ -0,0 +1,260 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package databasetesting
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+	"sync"
+
+	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
+
+	"github.com/Azure/ARO-HCP/internal/api"
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
+	"github.com/Azure/ARO-HCP/internal/database"
+	"github.com/Azure/ARO-HCP/internal/validation"
+)
+
+// MockFleetDBClient is the in-memory test double for database.FleetDBClient.
+// It owns its own document store, separate from MockResourcesDBClient —
+// production has the fleet container live in a different Cosmos container
+// (and behind different credentials), and the mock mirrors that boundary.
+type MockFleetDBClient struct {
+	// mu guards documents.
+	mu sync.RWMutex
+	// documents holds the raw JSON documents, keyed by lowercased Cosmos ID.
+	documents map[string]json.RawMessage
+}
+
+var _ database.FleetDBClient = &MockFleetDBClient{}
+var _ mockDocumentStore = &MockFleetDBClient{}
+
+// NewMockFleetDBClient creates an empty MockFleetDBClient.
+func NewMockFleetDBClient() *MockFleetDBClient {
+	return &MockFleetDBClient{
+		documents: make(map[string]json.RawMessage),
+	}
+}
+
+// NewMockFleetDBClientWithResources creates a MockFleetDBClient and populates
+// it with the given resources. Supported types:
+//   - *fleet.Stamp
+//   - *fleet.ManagementCluster
+//
+// Resources are added in slice order. The first failure aborts seeding and is
+// returned wrapped with the index of the offending resource.
+func NewMockFleetDBClientWithResources(ctx context.Context, resources []any) (*MockFleetDBClient, error) {
+	mock := NewMockFleetDBClient()
+	for i, r := range resources {
+		if err := mock.addResource(ctx, r); err != nil {
+			return nil, fmt.Errorf("failed to add resource at index %d: %w", i, err)
+		}
+	}
+	return mock, nil
+}
+
+// addResource dispatches a seed resource to the helper for its concrete type.
+// Any type other than *fleet.Stamp or *fleet.ManagementCluster is rejected
+// with an error naming the type.
+func (m *MockFleetDBClient) addResource(ctx context.Context, resource any) error {
+	switch r := resource.(type) {
+	case *fleet.Stamp:
+		return m.addStamp(ctx, r)
+	case *fleet.ManagementCluster:
+		return m.addManagementCluster(ctx, r)
+	default:
+		return fmt.Errorf("unsupported resource type for MockFleetDBClient: %T", resource)
+	}
+}
+
+// addStamp creates the stamp through the Stamps() CRUD, so it goes through the
+// same validating wrapper as any other caller. A stamp with an empty stamp
+// identifier is rejected before Create is attempted.
+func (m *MockFleetDBClient) addStamp(ctx context.Context, stamp *fleet.Stamp) error {
+	stampIdentifier := stamp.GetStampIdentifier()
+	if len(stampIdentifier) == 0 {
+		return fmt.Errorf("stamp has empty stamp identifier")
+	}
+	crud := m.Stamps()
+	_, err := crud.Create(ctx, stamp, nil)
+	return err
+}
+
+// addManagementCluster creates the management cluster through the
+// ManagementClusters CRUD of the stamp it reports via GetStampIdentifier. An
+// empty stamp identifier is rejected with an error; note that
+// ManagementClusters itself panics on an identifier it cannot turn into a
+// stamp resource ID.
+func (m *MockFleetDBClient) addManagementCluster(ctx context.Context, mc *fleet.ManagementCluster) error {
+	stampIdentifier := mc.GetStampIdentifier()
+	if len(stampIdentifier) == 0 {
+		return fmt.Errorf("management cluster has empty stamp identifier")
+	}
+	crud := m.Stamps().ManagementClusters(stampIdentifier)
+	_, err := crud.Create(ctx, mc, nil)
+	return err
+}
+
+// --- mockDocumentStore implementation ---
+
+// GetDocument returns the raw document stored under cosmosID and whether it
+// exists. The lookup is case-insensitive because keys are lowercased.
+func (m *MockFleetDBClient) GetDocument(cosmosID string) (json.RawMessage, bool) {
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+	data, ok := m.documents[strings.ToLower(cosmosID)]
+	return data, ok
+}
+
+// StoreDocument stores data under the lowercased cosmosID, replacing any
+// document already stored under that key.
+func (m *MockFleetDBClient) StoreDocument(cosmosID string, data json.RawMessage) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.documents[strings.ToLower(cosmosID)] = data
+}
+
+// DeleteDocument removes the document stored under the lowercased cosmosID.
+// Deleting a key that is not present is a no-op.
+func (m *MockFleetDBClient) DeleteDocument(cosmosID string) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	delete(m.documents, strings.ToLower(cosmosID))
+}
+
+// ListDocuments returns the stored documents that match both filters.
+//
+// A nil resourceType disables the type filter; otherwise the document's
+// resource type must equal resourceType case-insensitively. An empty prefix
+// disables the prefix filter; otherwise the document's resource ID must start
+// with prefix, compared case-insensitively. Documents that do not unmarshal
+// into database.TypedDocument are skipped, and documents without a resource
+// ID are not subject to the prefix filter. The result order is unspecified
+// because it follows map iteration.
+func (m *MockFleetDBClient) ListDocuments(resourceType *azcorearm.ResourceType, prefix string) []json.RawMessage {
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+	var results []json.RawMessage
+	for _, data := range m.documents {
+		var td database.TypedDocument
+		if err := json.Unmarshal(data, &td); err != nil {
+			continue
+		}
+		if resourceType != nil && !strings.EqualFold(td.ResourceType, resourceType.String()) {
+			continue
+		}
+		if len(prefix) != 0 && td.ResourceID != nil &&
+			!strings.HasPrefix(strings.ToLower(td.ResourceID.String()), strings.ToLower(prefix)) {
+			continue
+		}
+		results = append(results, data)
+	}
+	return results
+}
+
+// GetAllDocuments returns a copy of the document map, so callers can add or
+// remove entries without affecting the store. The copy is shallow: the
+// json.RawMessage values are the same byte slices the store holds.
+func (m *MockFleetDBClient) GetAllDocuments() map[string]json.RawMessage {
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+	out := make(map[string]json.RawMessage, len(m.documents))
+	for k, v := range m.documents {
+		out[k] = v
+	}
+	return out
+}
+
+// newMockFleetResourceCRUD creates a mockResourceCRUD with path construction
+// that mirrors fleetResourceCRUD. Fleet resources live outside the subscription
+// hierarchy (e.g. /providers/Microsoft.RedHatOpenShift/stamps/{id}), so the
+// standard subscription-scoped mockResourceCRUD path logic does not apply.
+//
+// With a parent, the path is the parent ID plus the last segment of
+// resourceType; without one it is "/providers/" plus the full resource type.
+// A non-empty resource name is appended, and the whole path is lowercased
+// before it is parsed. The list prefix is the path for an empty name followed
+// by "/".
+func newMockFleetResourceCRUD[InternalAPIType, CosmosAPIType any](
+	client mockDocumentStore, parentResourceID *azcorearm.ResourceID, resourceType azcorearm.ResourceType,
+) *mockResourceCRUD[InternalAPIType, CosmosAPIType] {
+	m := newMockResourceCRUD[InternalAPIType, CosmosAPIType](client, parentResourceID, resourceType)
+	m.makeResourceIDPath = func(resourceName string) (*azcorearm.ResourceID, error) {
+		var base string
+		if parentResourceID != nil {
+			base = parentResourceID.String() + "/" + resourceType.Types[len(resourceType.Types)-1]
+		} else {
+			base = "/providers/" + resourceType.String()
+		}
+		if len(resourceName) > 0 {
+			base += "/" + resourceName
+		}
+		return azcorearm.ParseResourceID(strings.ToLower(base))
+	}
+	m.getListPrefix = func() (string, error) {
+		rid, err := m.makeResourceIDPath("")
+		if err != nil {
+			return "", err
+		}
+		return rid.String() + "/", nil
+	}
+	return m
+}
+
+// --- FleetDBClient implementation ---
+
+// Stamps returns a StampsCRUD backed by this mock's document store. The inner
+// CRUD has no parent resource and is wrapped with the stamp create/update
+// validators.
+func (m *MockFleetDBClient) Stamps() database.StampsCRUD {
+	inner := newMockFleetResourceCRUD[fleet.Stamp, database.GenericDocument[fleet.Stamp]](
+		m, nil, fleet.StampResourceType,
+	)
+	return &mockStampsCRUD{
+		ValidatingResourceCRUD: database.NewValidatingCRUD(inner,
+			validation.ValidateStampCreate,
+			validation.ValidateStampUpdate,
+		),
+		store: m,
+	}
+}
+
+// GlobalListers returns listers that read from this mock's document store.
+func (m *MockFleetDBClient) GlobalListers() database.FleetGlobalListers {
+	return &mockFleetGlobalListers{client: m}
+}
+
+// --- StampsCRUD ---
+
+// mockStampsCRUD is the validating stamp CRUD plus a reference to the owning
+// store, which ManagementClusters needs to build child CRUDs.
+type mockStampsCRUD struct {
+	database.ValidatingResourceCRUD[fleet.Stamp]
+	store *MockFleetDBClient
+}
+
+// ManagementClusters returns the management cluster CRUD scoped to the given
+// stamp, wrapped with the management cluster create/update validators. It
+// panics when stampIdentifier cannot be converted into a stamp resource ID,
+// because the method has no error return.
+func (s *mockStampsCRUD) ManagementClusters(stampIdentifier string) database.ManagementClustersCRUD {
+	parentResourceID, err := fleet.ToStampResourceID(stampIdentifier)
+	if err != nil {
+		panic(fmt.Sprintf("invalid stamp identifier %q: %v", stampIdentifier, err))
+	}
+	inner := newMockFleetResourceCRUD[fleet.ManagementCluster, database.GenericDocument[fleet.ManagementCluster]](
+		s.store, parentResourceID, fleet.ManagementClusterResourceType,
+	)
+	return &mockManagementClustersCRUD{
+		ValidatingResourceCRUD: database.NewValidatingCRUD(inner,
+			validation.ValidateManagementClusterCreate,
+			validation.ValidateManagementClusterUpdate,
+		),
+		store:           s.store,
+		stampIdentifier: stampIdentifier,
+	}
+}
+
+// --- ManagementClustersCRUD ---
+
+// mockManagementClustersCRUD is the validating management cluster CRUD for one
+// stamp. It keeps the store and stamp identifier so Controllers can build the
+// nested controller CRUD.
+type mockManagementClustersCRUD struct {
+	database.ValidatingResourceCRUD[fleet.ManagementCluster]
+	store           *MockFleetDBClient
+	stampIdentifier string
+}
+
+// Controllers returns the controller CRUD nested under this stamp's management
+// cluster resource ID. The returned CRUD is not wrapped in a validating CRUD.
+// It panics when the stamp identifier cannot be converted into a management
+// cluster resource ID.
+func (m *mockManagementClustersCRUD) Controllers() database.ResourceCRUD[api.Controller] {
+	mcResourceID, err := fleet.ToManagementClusterResourceID(m.stampIdentifier)
+	if err != nil {
+		panic(fmt.Sprintf("invalid stamp identifier %q: %v", m.stampIdentifier, err))
+	}
+	return newMockFleetResourceCRUD[api.Controller, database.GenericDocument[api.Controller]](
+		m.store, mcResourceID, fleet.ManagementClusterControllerResourceType,
+	)
+}
+
+// --- FleetGlobalListers ---
+
+// mockFleetGlobalListers hands out typed global listers over a document store.
+type mockFleetGlobalListers struct {
+	client mockDocumentStore
+}
+
+var _ database.FleetGlobalListers = &mockFleetGlobalListers{}
+
+// Stamps returns a global lister for documents of the stamp resource type.
+func (g *mockFleetGlobalListers) Stamps() database.GlobalLister[fleet.Stamp] {
+	return &mockTypedGlobalLister[fleet.Stamp, database.GenericDocument[fleet.Stamp]]{
+		client:       g.client,
+		resourceType: fleet.StampResourceType,
+	}
+}
+
+// ManagementClusters returns a global lister for documents of the management
+// cluster resource type.
+func (g *mockFleetGlobalListers) ManagementClusters() database.GlobalLister[fleet.ManagementCluster] {
+	return &mockTypedGlobalLister[fleet.ManagementCluster, database.GenericDocument[fleet.ManagementCluster]]{
+		client:       g.client,
+		resourceType: fleet.ManagementClusterResourceType,
+	}
+}
diff --git a/internal/databasetesting/mock_resources_crud.go b/internal/databasetesting/mock_resources_crud.go
index edf10781b34..05eec2130cc 100644
--- a/internal/databasetesting/mock_resources_crud.go
+++ b/internal/databasetesting/mock_resources_crud.go
@@ -34,9 +34,22 @@ import (
 	"github.com/Azure/ARO-HCP/internal/utils"
 )
 
+// mockDocumentStore is the slice of MockResourcesDBClient that mockResourceCRUD
+// actually uses. Extracting this interface lets mockResourceCRUD power
+// MockResourcesDBClient (the existing in-memory store for the regular containers),
+// MockKubeApplierClient (the in-memory store for the kube-applier container) and
+// MockFleetDBClient (the in-memory store for the fleet container) without code duplication.
+type mockDocumentStore interface {
+	GetDocument(cosmosID string) (json.RawMessage, bool)
+	StoreDocument(cosmosID string, data json.RawMessage)
+	DeleteDocument(cosmosID string)
+	ListDocuments(resourceType *azcorearm.ResourceType, prefix string) []json.RawMessage
+	GetAllDocuments() map[string]json.RawMessage
+}
+
 // mockResourceCRUD is a generic mock implementation of database.ResourceCRUD.
 type mockResourceCRUD[InternalAPIType, CosmosAPIType any] struct {
-	client           *MockResourcesDBClient
+	client           mockDocumentStore
 	parentResourceID *azcorearm.ResourceID
 	resourceType     azcorearm.ResourceType
 	// makeResourceIDPath constructs the full resource ID path from a resource name.
@@ -48,7 +61,7 @@ type mockResourceCRUD[InternalAPIType, CosmosAPIType any] struct { } func newMockResourceCRUD[InternalAPIType, CosmosAPIType any]( - client *MockResourcesDBClient, parentResourceID *azcorearm.ResourceID, resourceType azcorearm.ResourceType) *mockResourceCRUD[InternalAPIType, CosmosAPIType] { + client mockDocumentStore, parentResourceID *azcorearm.ResourceID, resourceType azcorearm.ResourceType) *mockResourceCRUD[InternalAPIType, CosmosAPIType] { m := &mockResourceCRUD[InternalAPIType, CosmosAPIType]{ client: client, @@ -692,7 +705,7 @@ type mockManagementClusterContentCRUD struct { *mockResourceCRUD[api.ManagementClusterContent, database.GenericDocument[api.ManagementClusterContent]] } -func newMockManagementClusterContentCRUD(client *MockResourcesDBClient, parentResourceID *azcorearm.ResourceID, resourceType azcorearm.ResourceType) *mockManagementClusterContentCRUD { +func newMockManagementClusterContentCRUD(client mockDocumentStore, parentResourceID *azcorearm.ResourceID, resourceType azcorearm.ResourceType) *mockManagementClusterContentCRUD { return &mockManagementClusterContentCRUD{ mockResourceCRUD: newMockResourceCRUD[api.ManagementClusterContent, database.GenericDocument[api.ManagementClusterContent]]( client, parentResourceID, resourceType), diff --git a/internal/databasetesting/mock_resources_global_lister.go b/internal/databasetesting/mock_resources_global_lister.go index 22ed8b7e98e..81ab1fd00bf 100644 --- a/internal/databasetesting/mock_resources_global_lister.go +++ b/internal/databasetesting/mock_resources_global_lister.go @@ -136,7 +136,7 @@ func (l *mockSubscriptionGlobalLister) List(ctx context.Context, options *databa // mockTypedGlobalLister is a generic mock global lister that lists all resources // of a given type across all partitions. 
type mockTypedGlobalLister[InternalAPIType, CosmosAPIType any] struct { - client *MockResourcesDBClient + client mockDocumentStore resourceType azcorearm.ResourceType } diff --git a/internal/go.mod b/internal/go.mod index cf2ad9f3ad4..f2a718a9131 100644 --- a/internal/go.mod +++ b/internal/go.mod @@ -34,6 +34,7 @@ require ( go.uber.org/mock v0.6.0 gotest.tools v2.2.0+incompatible k8s.io/apimachinery v0.35.3 + k8s.io/client-go v0.35.3 k8s.io/component-base v0.35.3 k8s.io/utils v0.0.0-20260319190234-28399d86e0b5 sigs.k8s.io/randfill v1.0.0 @@ -132,7 +133,6 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/api v0.35.3 // indirect - k8s.io/client-go v0.35.3 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect diff --git a/internal/ocm/convert.go b/internal/ocm/convert.go index 98b7910782a..9e42315179c 100644 --- a/internal/ocm/convert.go +++ b/internal/ocm/convert.go @@ -22,6 +22,7 @@ import ( "strings" "time" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" @@ -32,6 +33,7 @@ import ( "github.com/Azure/ARO-HCP/internal/api" "github.com/Azure/ARO-HCP/internal/api/arm" + "github.com/Azure/ARO-HCP/internal/api/fleet" "github.com/Azure/ARO-HCP/internal/utils" ) @@ -66,21 +68,24 @@ const ( // The OCM SDK does not provide these constants. 
-	csCustomerManagedEncryptionTypeKms  string = "kms"
-	csEncryptionAtHostStateDisabled     string = "disabled"
-	csEncryptionAtHostStateEnabled      string = "enabled"
-	csImageRegistryStateDisabled        string = "disabled"
-	csImageRegistryStateEnabled         string = "enabled"
-	csKeyManagementModeCustomerManaged  string = "customer_managed"
-	csKeyManagementModePlatformManaged  string = "platform_managed"
-	csNodeDrainGracePeriodUnit          string = "minutes"
-	csOutboundType                      string = "load_balancer"
-	csUsernameClaimPrefixPolicyNoPrefix string = "NoPrefix"
-	csUsernameClaimPrefixPolicyPrefix   string = "Prefix"
-	csCIDRBlockAllowAccessModeAllowAll  string = "allow_all"
-	csCIDRBlockAllowAccessModeAllowList string = "allow_list"
-	csOsDiskPersistencePersistent       string = "persistent"
-	csOsDiskPersistenceEphemeral        string = "ephemeral"
+	csCustomerManagedEncryptionTypeKms   string = "kms"
+	csEncryptionAtHostStateDisabled      string = "disabled"
+	csEncryptionAtHostStateEnabled       string = "enabled"
+	csImageRegistryStateDisabled         string = "disabled"
+	csImageRegistryStateEnabled          string = "enabled"
+	csKeyManagementModeCustomerManaged   string = "customer_managed"
+	csKeyManagementModePlatformManaged   string = "platform_managed"
+	csNodeDrainGracePeriodUnit           string = "minutes"
+	csOutboundType                       string = "load_balancer"
+	csUsernameClaimPrefixPolicyNoPrefix  string = "NoPrefix"
+	csUsernameClaimPrefixPolicyPrefix    string = "Prefix"
+	csCIDRBlockAllowAccessModeAllowAll   string = "allow_all"
+	csCIDRBlockAllowAccessModeAllowList  string = "allow_list"
+	csOsDiskPersistencePersistent        string = "persistent"
+	csOsDiskPersistenceEphemeral         string = "ephemeral"
+	csProvisioningShardStatusActive      string = "active"
+	csProvisioningShardStatusMaintenance string = "maintenance"
+	csProvisioningShardStatusOffline     string = "offline"
 )
 
 // Sentinel error for use with errors.Is
@@ -868,3 +873,156 @@ func CSErrorToCloudError(err error, resourceID *azcorearm.ResourceID) *arm.Cloud
 
 	return arm.NewInternalServerError()
 }
+
+// ConvertCSManagementClusterToInternal converts a Cluster Service ProvisionShard
+// to the internal ManagementCluster representation.
+//
+// It returns an error (and a nil result) when the shard is nil, has an empty or
+// invalid HREF, has no Azure shard, carries an unparseable AKS or public DNS
+// zone resource ID, lacks the Maestro config or its REST/GRPC API configs, or
+// when the stamp identifier cannot be derived from the AKS cluster name.
+//
+// The shard status drives two outputs: Spec.SchedulingPolicy (see
+// convertShardStatusToSchedulingPolicy) and the single Ready condition placed
+// in Status.Conditions (see readyConditionForShardStatus).
+//
+// NOTE(review): the Ready condition's LastTransitionTime is stamped with the
+// current time on every call, so converting an unchanged shard twice yields
+// results that differ in that field. Callers that diff or persist the result
+// presumably have to account for that; confirm against the consumer.
+func ConvertCSManagementClusterToInternal(csShard *arohcpv1alpha1.ProvisionShard) (*fleet.ManagementCluster, error) {
+	if csShard == nil {
+		return nil, fmt.Errorf("provision shard is nil")
+	}
+
+	shardHREF := csShard.HREF()
+	if len(shardHREF) == 0 {
+		return nil, fmt.Errorf("provision shard has empty HREF")
+	}
+	shardID, err := api.NewInternalID(shardHREF)
+	if err != nil {
+		return nil, fmt.Errorf("provision shard has invalid HREF %q: %w", shardHREF, err)
+	}
+
+	azureShard := csShard.AzureShard()
+	if azureShard == nil {
+		return nil, fmt.Errorf("provision shard %q has no azure shard", shardID)
+	}
+
+	managementClusterAKSResourceID, err := azcorearm.ParseResourceID(azureShard.AksManagementClusterResourceId())
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse management cluster AKS resource ID %q: %w", azureShard.AksManagementClusterResourceId(), err)
+	}
+
+	publicDNSZoneResourceID, err := azcorearm.ParseResourceID(azureShard.PublicDnsZoneResourceId())
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse public DNS zone resource ID %q: %w", azureShard.PublicDnsZoneResourceId(), err)
+	}
+
+	maestroConfig := csShard.MaestroConfig()
+	if maestroConfig == nil {
+		return nil, fmt.Errorf("management cluster %q has no maestro config", shardID)
+	}
+	restConfig := maestroConfig.RestApiConfig()
+	if restConfig == nil {
+		return nil, fmt.Errorf("management cluster %q has no maestro REST API config", shardID)
+	}
+	grpcConfig := maestroConfig.GrpcApiConfig()
+	if grpcConfig == nil {
+		return nil, fmt.Errorf("management cluster %q has no maestro GRPC API config", shardID)
+	}
+
+	// Read the status once so the scheduling policy and the Ready condition
+	// are derived from the same value.
+	shardStatus := csShard.Status()
+
+	stampIdentifier, err := stampIdentifierFromAKSClusterName(managementClusterAKSResourceID.Name)
+	if err != nil {
+		return nil, err
+	}
+	resourceID, err := fleet.ToManagementClusterResourceID(stampIdentifier)
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct management cluster resource ID from stamp identifier %q: %w", stampIdentifier, err)
+	}
+
+	mc := &fleet.ManagementCluster{
+		CosmosMetadata: api.CosmosMetadata{
+			ResourceID: resourceID,
+		},
+		ResourceID: resourceID,
+		Spec: fleet.ManagementClusterSpec{
+			SchedulingPolicy: convertShardStatusToSchedulingPolicy(shardStatus),
+		},
+		Status: fleet.ManagementClusterStatus{
+			AKSResourceID:                                        managementClusterAKSResourceID,
+			PublicDNSZoneResourceID:                              publicDNSZoneResourceID,
+			HostedClustersSecretsKeyVaultURL:                     azureShard.CxSecretsKeyVaultUrl(),
+			HostedClustersManagedIdentitiesKeyVaultURL:           azureShard.CxManagedIdentitiesKeyVaultUrl(),
+			HostedClustersSecretsKeyVaultManagedIdentityClientID: azureShard.CxSecretsKeyVaultManagedIdentityClientId(),
+			MaestroConsumerName:                                  maestroConfig.ConsumerName(),
+			MaestroRESTAPIURL:                                    restConfig.Url(),
+			MaestroGRPCTarget:                                    grpcConfig.Url(),
+			ClusterServiceProvisionShardID:                       &shardID,
+			Conditions:                                           []metav1.Condition{readyConditionForShardStatus(shardStatus)},
+		},
+	}
+
+	return mc, nil
+}
+
+// readyConditionForShardStatus builds the Ready condition that mirrors a Cluster
+// Service provision shard status: "active" yields ConditionTrue, "maintenance"
+// and "offline" yield ConditionFalse, and any other value yields
+// ConditionUnknown. Every non-active outcome carries a message quoting the
+// status. LastTransitionTime is always set to the current time.
+func readyConditionForShardStatus(status string) metav1.Condition {
+	condition := metav1.Condition{
+		Type:               string(fleet.ManagementClusterConditionReady),
+		LastTransitionTime: metav1.Now(),
+	}
+	switch status {
+	case csProvisioningShardStatusActive:
+		condition.Status = metav1.ConditionTrue
+		condition.Reason = string(fleet.ManagementClusterConditionReasonProvisionShardActive)
+	case csProvisioningShardStatusMaintenance:
+		condition.Status = metav1.ConditionFalse
+		condition.Reason = string(fleet.ManagementClusterConditionReasonProvisionShardMaintenance)
+		condition.Message = fmt.Sprintf("provision shard status is %q", status)
+	case csProvisioningShardStatusOffline:
+		condition.Status = metav1.ConditionFalse
+		condition.Reason = string(fleet.ManagementClusterConditionReasonProvisionShardOffline)
+		condition.Message = fmt.Sprintf("provision shard status is %q", status)
+	default:
+		condition.Status = metav1.ConditionUnknown
+		condition.Reason = string(fleet.ManagementClusterConditionReasonProvisionShardStatusUnknown)
+		condition.Message = fmt.Sprintf("provision shard has unrecognized status %q", status)
+	}
+	return condition
+}
+
+// stampIdentifierFromAKSClusterName derives the stamp identifier from an AKS
+// management cluster name by taking everything after the last '-'.
+//
+// The AKS cluster name must follow the {env}-{region}-mgmt-{stamp} convention
+// (e.g. "prod-westus3-mgmt-1" yields stamp identifier "1"). This pattern is
+// enforced by our rollout pipelines. Once the mgmt cluster enhancement enters
+// phase 2, we can remove this logic and use the original stamp identifier to
+// fill mgmt clusters instead of deriving it from the AKS cluster name.
+//
+// An error is returned when the name has no '-' or ends with one.
+func stampIdentifierFromAKSClusterName(aksName string) (string, error) {
+	lastDash := strings.LastIndex(aksName, "-")
+	if lastDash < 0 || lastDash == len(aksName)-1 {
+		return "", fmt.Errorf("AKS cluster name %q does not contain a stamp suffix after the last '-'", aksName)
+	}
+	return aksName[lastDash+1:], nil
+}
+
+// convertShardStatusToSchedulingPolicy maps a Cluster Service provision shard
+// status to a ManagementClusterSchedulingPolicy.
+func convertShardStatusToSchedulingPolicy(status string) fleet.ManagementClusterSchedulingPolicy {
+	if status == csProvisioningShardStatusActive {
+		return fleet.ManagementClusterSchedulingPolicySchedulable
+	}
+	return fleet.ManagementClusterSchedulingPolicyUnschedulable
+}
diff --git a/internal/ocm/convert_test.go b/internal/ocm/convert_test.go
index ba9ce0ada4e..563b18fb435 100644
--- a/internal/ocm/convert_test.go
+++ b/internal/ocm/convert_test.go
@@ -25,6 +25,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/utils/ptr"
 
 	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
@@ -33,6 +34,7 @@ import (
 
 	"github.com/Azure/ARO-HCP/internal/api"
 	"github.com/Azure/ARO-HCP/internal/api/arm"
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
 )
 
 const (
@@ -1094,3 +1096,279 @@ func TestBuildCSCluster(t *testing.T) {
 		})
 	}
 }
+
+// validProvisionShardBuilder returns a builder pre-populated with all required fields
+// for a valid management cluster conversion. Tests can override individual fields.
+func validProvisionShardBuilder(t *testing.T) *arohcpv1alpha1.ProvisionShardBuilder {
+	t.Helper()
+	return arohcpv1alpha1.NewProvisionShard().
+		ID("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee").
+		HREF("/api/aro_hcp/v1alpha1/provision_shards/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee").
+		Status(csProvisioningShardStatusActive).
+		Topology("shared").
+		AzureShard(arohcpv1alpha1.NewAzureShard().
+			AksManagementClusterResourceId("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/test-westus3-mgmt-1").
+			PublicDnsZoneResourceId("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/dns-rg/providers/Microsoft.Network/dnszones/test.example.com").
+			CxSecretsKeyVaultUrl("https://cx-kv.vault.azure.net/").
+			CxManagedIdentitiesKeyVaultUrl("https://mi-kv.vault.azure.net/").
+			CxSecretsKeyVaultManagedIdentityClientId("c2bde1aa-d904-48cd-a728-9de33e3ddca9"),
+		).
+		MaestroConfig(
+			arohcpv1alpha1.NewProvisionShardMaestroConfig().
+				ConsumerName("test-consumer").
+				RestApiConfig(arohcpv1alpha1.NewProvisionShardMaestroRestApiConfig().
+					Url("http://maestro.maestro.svc.cluster.local:8000")).
+				GrpcApiConfig(arohcpv1alpha1.NewProvisionShardMaestroGrpcApiConfig().
+					Url("maestro-grpc.maestro.svc.cluster.local:8090")),
+		)
+}
+
+// TestConvertCSManagementClusterToInternal covers the error paths for malformed
+// provision shards and checks the field, scheduling-policy and Ready-condition
+// mapping for each shard status.
+func TestConvertCSManagementClusterToInternal(t *testing.T) {
+	tests := []struct {
+		name                string
+		build               func(t *testing.T) *arohcpv1alpha1.ProvisionShard
+		expectedErrorSubstr string
+		validate            func(t *testing.T, mc *fleet.ManagementCluster)
+	}{
+		{
+			name: "nil shard",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				return nil
+			},
+			expectedErrorSubstr: "provision shard is nil",
+		},
+		{
+			name: "empty shard HREF",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := arohcpv1alpha1.NewProvisionShard().Build()
+				require.NoError(t, err)
+				return shard
+			},
+			expectedErrorSubstr: "provision shard has empty HREF",
+		},
+		{
+			name: "missing azure shard",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := arohcpv1alpha1.NewProvisionShard().
+					ID("11111111-2222-3333-4444-555555555555").
+					HREF("/api/aro_hcp/v1alpha1/provision_shards/11111111-2222-3333-4444-555555555555").
+					Build()
+				require.NoError(t, err)
+				return shard
+			},
+			expectedErrorSubstr: "has no azure shard",
+		},
+		{
+			name: "invalid AKS resource ID",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := arohcpv1alpha1.NewProvisionShard().
+					ID("11111111-2222-3333-4444-555555555555").
+					HREF("/api/aro_hcp/v1alpha1/provision_shards/11111111-2222-3333-4444-555555555555").
+					AzureShard(arohcpv1alpha1.NewAzureShard().
+						AksManagementClusterResourceId("not-a-valid-resource-id")).
+					Build()
+				require.NoError(t, err)
+				return shard
+			},
+			expectedErrorSubstr: "failed to parse management cluster AKS resource ID",
+		},
+		{
+			name: "invalid public DNS zone resource ID",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := arohcpv1alpha1.NewProvisionShard().
+					ID("11111111-2222-3333-4444-555555555555").
+					HREF("/api/aro_hcp/v1alpha1/provision_shards/11111111-2222-3333-4444-555555555555").
+					AzureShard(arohcpv1alpha1.NewAzureShard().
+						AksManagementClusterResourceId("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/test-westus3-mgmt-1").
+						PublicDnsZoneResourceId("not-valid")).
+					Build()
+				require.NoError(t, err)
+				return shard
+			},
+			expectedErrorSubstr: "failed to parse public DNS zone resource ID",
+		},
+		{
+			name: "missing maestro config",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := arohcpv1alpha1.NewProvisionShard().
+					ID("11111111-2222-3333-4444-555555555555").
+					HREF("/api/aro_hcp/v1alpha1/provision_shards/11111111-2222-3333-4444-555555555555").
+					AzureShard(arohcpv1alpha1.NewAzureShard().
+						AksManagementClusterResourceId("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/test-westus3-mgmt-1").
+						PublicDnsZoneResourceId("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/dns-rg/providers/Microsoft.Network/dnszones/test.example.com").
+						CxSecretsKeyVaultUrl("https://cx-kv.vault.azure.net/").
+						CxManagedIdentitiesKeyVaultUrl("https://mi-kv.vault.azure.net/").
+						CxSecretsKeyVaultManagedIdentityClientId("c2bde1aa-d904-48cd-a728-9de33e3ddca9"),
+					).
+					Build()
+				require.NoError(t, err)
+				return shard
+			},
+			expectedErrorSubstr: "no maestro config",
+		},
+		{
+			name: "missing maestro REST API config",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := validProvisionShardBuilder(t).
+					MaestroConfig(arohcpv1alpha1.NewProvisionShardMaestroConfig().
+						ConsumerName("test-consumer").
+						GrpcApiConfig(arohcpv1alpha1.NewProvisionShardMaestroGrpcApiConfig().
+							Url("maestro-grpc.maestro.svc.cluster.local:8090"))).
+					Build()
+				require.NoError(t, err)
+				return shard
+			},
+			expectedErrorSubstr: "no maestro REST API config",
+		},
+		{
+			name: "missing maestro GRPC API config",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := validProvisionShardBuilder(t).
+					MaestroConfig(arohcpv1alpha1.NewProvisionShardMaestroConfig().
+						ConsumerName("test-consumer").
+						RestApiConfig(arohcpv1alpha1.NewProvisionShardMaestroRestApiConfig().
+							Url("http://maestro.maestro.svc.cluster.local:8000"))).
+					Build()
+				require.NoError(t, err)
+				return shard
+			},
+			expectedErrorSubstr: "no maestro GRPC API config",
+		},
+		{
+			name: "AKS cluster name without a dash has no stamp suffix",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := validProvisionShardBuilder(t).
+					AzureShard(arohcpv1alpha1.NewAzureShard().
+						AksManagementClusterResourceId("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/mgmt").
+						PublicDnsZoneResourceId("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/dns-rg/providers/Microsoft.Network/dnszones/test.example.com")).
+					Build()
+				require.NoError(t, err)
+				return shard
+			},
+			expectedErrorSubstr: "does not contain a stamp suffix",
+		},
+		{
+			name: "AKS cluster name ending in a dash has no stamp suffix",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := validProvisionShardBuilder(t).
+					AzureShard(arohcpv1alpha1.NewAzureShard().
+						AksManagementClusterResourceId("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/test-westus3-mgmt-").
+						PublicDnsZoneResourceId("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/dns-rg/providers/Microsoft.Network/dnszones/test.example.com")).
+					Build()
+				require.NoError(t, err)
+				return shard
+			},
+			expectedErrorSubstr: "does not contain a stamp suffix",
+		},
+		{
+			name: "successful conversion populates all fields",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := validProvisionShardBuilder(t).Build()
+				require.NoError(t, err)
+				return shard
+			},
+			validate: func(t *testing.T, mc *fleet.ManagementCluster) {
+				// ResourceID
+				expectedResourceID := api.Must(fleet.ToManagementClusterResourceID("1"))
+				require.NotNil(t, mc.ResourceID)
+				assert.Equal(t, expectedResourceID.String(), mc.ResourceID.String())
+				assert.Equal(t, mc.ResourceID, mc.CosmosMetadata.ResourceID)
+
+				assert.Equal(t, "1", mc.GetStampIdentifier(), "stamp identifier should be suffix after last '-' in AKS cluster name")
+				assert.Equal(t, fleet.ManagementClusterSchedulingPolicySchedulable, mc.Spec.SchedulingPolicy, "active shard should be schedulable")
+
+				// Status
+				require.NotNil(t, mc.Status.AKSResourceID)
+				assert.Equal(t, "test-westus3-mgmt-1", mc.Status.AKSResourceID.Name)
+				require.NotNil(t, mc.Status.PublicDNSZoneResourceID)
+				assert.Equal(t, "https://cx-kv.vault.azure.net/", mc.Status.HostedClustersSecretsKeyVaultURL)
+				assert.Equal(t, "https://mi-kv.vault.azure.net/", mc.Status.HostedClustersManagedIdentitiesKeyVaultURL)
+				assert.Equal(t, "c2bde1aa-d904-48cd-a728-9de33e3ddca9", mc.Status.HostedClustersSecretsKeyVaultManagedIdentityClientID)
+				require.NotNil(t, mc.Status.ClusterServiceProvisionShardID)
+				assert.Equal(t, api.Must(api.NewInternalID("/api/aro_hcp/v1alpha1/provision_shards/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")), *mc.Status.ClusterServiceProvisionShardID)
+
+				// Maestro config
+				assert.Equal(t, "test-consumer", mc.Status.MaestroConsumerName)
+				assert.Equal(t, "http://maestro.maestro.svc.cluster.local:8000", mc.Status.MaestroRESTAPIURL)
+				assert.Equal(t, "maestro-grpc.maestro.svc.cluster.local:8090", mc.Status.MaestroGRPCTarget)
+
+				// Ready condition: an active shard is Ready=True with a transition time set.
+				require.Len(t, mc.Status.Conditions, 1)
+				assert.Equal(t, string(fleet.ManagementClusterConditionReady), mc.Status.Conditions[0].Type)
+				assert.Equal(t, metav1.ConditionTrue, mc.Status.Conditions[0].Status)
+				assert.Equal(t, string(fleet.ManagementClusterConditionReasonProvisionShardActive), mc.Status.Conditions[0].Reason)
+				assert.False(t, mc.Status.Conditions[0].LastTransitionTime.IsZero())
+			},
+		},
+		{
+			name: "maintenance shard is unschedulable",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := validProvisionShardBuilder(t).Status(csProvisioningShardStatusMaintenance).Build()
+				require.NoError(t, err)
+				return shard
+			},
+			validate: func(t *testing.T, mc *fleet.ManagementCluster) {
+				assert.Equal(t, fleet.ManagementClusterSchedulingPolicyUnschedulable, mc.Spec.SchedulingPolicy, "maintenance shard should be unschedulable")
+				require.Len(t, mc.Status.Conditions, 1)
+				assert.Equal(t, string(fleet.ManagementClusterConditionReady), mc.Status.Conditions[0].Type)
+				assert.Equal(t, metav1.ConditionFalse, mc.Status.Conditions[0].Status)
+				assert.Equal(t, string(fleet.ManagementClusterConditionReasonProvisionShardMaintenance), mc.Status.Conditions[0].Reason)
+				assert.Contains(t, mc.Status.Conditions[0].Message, "maintenance")
+			},
+		},
+		{
+			name: "offline shard is unschedulable",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := validProvisionShardBuilder(t).Status(csProvisioningShardStatusOffline).Build()
+				require.NoError(t, err)
+				return shard
+			},
+			validate: func(t *testing.T, mc *fleet.ManagementCluster) {
+				assert.Equal(t, fleet.ManagementClusterSchedulingPolicyUnschedulable, mc.Spec.SchedulingPolicy, "offline shard should be unschedulable")
+				require.Len(t, mc.Status.Conditions, 1)
+				assert.Equal(t, string(fleet.ManagementClusterConditionReady), mc.Status.Conditions[0].Type)
+				assert.Equal(t, metav1.ConditionFalse, mc.Status.Conditions[0].Status)
+				assert.Equal(t, string(fleet.ManagementClusterConditionReasonProvisionShardOffline), mc.Status.Conditions[0].Reason)
+				assert.Contains(t, mc.Status.Conditions[0].Message, "offline")
+			},
+		},
+		{
+			name: "unknown shard status produces ConditionUnknown",
+			build: func(t *testing.T) *arohcpv1alpha1.ProvisionShard {
+				shard, err := validProvisionShardBuilder(t).Status("some-new-status").Build()
+				require.NoError(t, err)
+				return shard
+			},
+			validate: func(t *testing.T, mc *fleet.ManagementCluster) {
+				assert.Equal(t, fleet.ManagementClusterSchedulingPolicyUnschedulable, mc.Spec.SchedulingPolicy, "unknown status shard should be unschedulable")
+				require.Len(t, mc.Status.Conditions, 1)
+				assert.Equal(t, string(fleet.ManagementClusterConditionReady), mc.Status.Conditions[0].Type)
+				assert.Equal(t, metav1.ConditionUnknown, mc.Status.Conditions[0].Status)
+				assert.Equal(t, string(fleet.ManagementClusterConditionReasonProvisionShardStatusUnknown), mc.Status.Conditions[0].Reason)
+				assert.Contains(t, mc.Status.Conditions[0].Message, "some-new-status")
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			shard := tt.build(t)
+			mc, err := ConvertCSManagementClusterToInternal(shard)
+			if len(tt.expectedErrorSubstr) > 0 {
+				require.Error(t, err)
+				assert.Contains(t, err.Error(), tt.expectedErrorSubstr)
+				assert.Nil(t, mc)
+			} else {
+				require.NoError(t, err)
+				require.NotNil(t, mc)
+				if tt.validate != nil {
+					tt.validate(t, mc)
+				}
+			}
+		})
+	}
+}
diff --git a/internal/validation/validate_management_cluster.go b/internal/validation/validate_management_cluster.go
new file mode 100644
index 00000000000..d76d5d5d8de
--- /dev/null
+++ b/internal/validation/validate_management_cluster.go
@@ -0,0 +1,160 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package validation
+
+import (
+	"context"
+	"regexp"
+
+	"k8s.io/apimachinery/pkg/api/operation"
+	"k8s.io/apimachinery/pkg/api/safe"
+	"k8s.io/apimachinery/pkg/api/validate"
+	"k8s.io/apimachinery/pkg/util/validation/field"
+
+	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
+
+	"github.com/Azure/ARO-HCP/internal/api"
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
+)
+
+// ValidateManagementClusterCreate validates a ManagementCluster for creation.
+// It runs the shared validation with a Create operation and no old object.
+func ValidateManagementClusterCreate(ctx context.Context, newObj *fleet.ManagementCluster) field.ErrorList {
+	op := operation.Operation{Type: operation.Create}
+	return validateManagementCluster(ctx, op, newObj, nil)
+}
+
+// ValidateManagementClusterUpdate validates a ManagementCluster for update.
+// It runs the shared validation with an Update operation, passing oldObj so
+// the per-field checks can compare the new value against the stored one.
+func ValidateManagementClusterUpdate(ctx context.Context, newObj, oldObj *fleet.ManagementCluster) field.ErrorList {
+	op := operation.Operation{Type: operation.Update}
+	return validateManagementCluster(ctx, op, newObj, oldObj)
+}
+
+// Accessors handed to safe.Field to project the matching top-level field out of
+// the old ManagementCluster, which is nil on create.
+var (
+	toManagementClusterResourceID = func(oldObj *fleet.ManagementCluster) *azcorearm.ResourceID { return oldObj.ResourceID }
+	toManagementClusterSpec       = func(oldObj *fleet.ManagementCluster) *fleet.ManagementClusterSpec { return &oldObj.Spec }
+	toManagementClusterStatus     = func(oldObj *fleet.ManagementCluster) *fleet.ManagementClusterStatus { return &oldObj.Status }
+)
+
+// stampIdentifierRegex accepts 1 to 3 lowercase ASCII letters or digits.
+var stampIdentifierRegex = regexp.MustCompile(`^[a-z0-9]{1,3}$`)
+
+// validateManagementCluster runs the checks shared by create and update and
+// returns the accumulated field errors.
+//
+// newObj is dereferenced unconditionally, so it must be non-nil. oldObj is nil
+// on create and is only read through safe.Field.
+func validateManagementCluster(ctx context.Context, op operation.Operation, newObj, oldObj *fleet.ManagementCluster) field.ErrorList {
+	errs := field.ErrorList{}
+
+	// ResourceID (top-level, mirrors CosmosMetadata.ResourceID)
+	errs = append(errs, validate.RequiredPointer(ctx, op, field.NewPath("resourceId"), newObj.ResourceID, safe.Field(oldObj, toManagementClusterResourceID))...)
+	errs = append(errs, immutableByReflect(ctx, op, field.NewPath("resourceId"), newObj.ResourceID, safe.Field(oldObj, toManagementClusterResourceID))...)
+	// The parent segment's name is checked against the stamp identifier format.
+	// A resource ID without a parent skips this check entirely.
+	if newObj.ResourceID != nil && newObj.ResourceID.Parent != nil {
+		errs = append(errs, MatchesRegex(ctx, op, field.NewPath("resourceId", "parent", "name"), &newObj.ResourceID.Parent.Name, nil, stampIdentifierRegex, "must be 1-3 lowercase alphanumeric characters")...)
+	}
+
+	// Spec
+	errs = append(errs, validateManagementClusterSpec(ctx, op, field.NewPath("spec"), &newObj.Spec, safe.Field(oldObj, toManagementClusterSpec))...)
+
+	// Status
+	errs = append(errs, validateManagementClusterStatus(ctx, op, field.NewPath("status"), &newObj.Status, safe.Field(oldObj, toManagementClusterStatus))...)
+
+	return errs
+}
+
+// Accessor handed to safe.Field to project SchedulingPolicy out of the old spec.
+var (
+	toManagementClusterSpecSchedulingPolicy = func(oldObj *fleet.ManagementClusterSpec) *fleet.ManagementClusterSchedulingPolicy {
+		return &oldObj.SchedulingPolicy
+	}
+)
+
+// validateManagementClusterSpec validates the spec under fldPath. The only
+// field checked is SchedulingPolicy, which must be set and must be one of
+// fleet.ValidManagementClusterSchedulingPolicies.
+func validateManagementClusterSpec(ctx context.Context, op operation.Operation, fldPath *field.Path, newObj, oldObj *fleet.ManagementClusterSpec) field.ErrorList {
+	errs := field.ErrorList{}
+
+	// SchedulingPolicy — required, must be a valid value
+	errs = append(errs, validate.RequiredValue(ctx, op, fldPath.Child("schedulingPolicy"), &newObj.SchedulingPolicy, safe.Field(oldObj, toManagementClusterSpecSchedulingPolicy))...)
+	errs = append(errs, validate.Enum(ctx, op, fldPath.Child("schedulingPolicy"), &newObj.SchedulingPolicy, safe.Field(oldObj, toManagementClusterSpecSchedulingPolicy), fleet.ValidManagementClusterSchedulingPolicies, nil)...)
+
+	return errs
+}
+
+// Accessors handed to safe.Field to project individual fields out of the old
+// ManagementClusterStatus.
+var (
+	toManagementClusterStatusAKSResourceID           = func(oldObj *fleet.ManagementClusterStatus) *azcorearm.ResourceID { return oldObj.AKSResourceID }
+	toManagementClusterStatusPublicDNSZoneResourceID = func(oldObj *fleet.ManagementClusterStatus) *azcorearm.ResourceID {
+		return oldObj.PublicDNSZoneResourceID
+	}
+	toManagementClusterStatusHostedClustersSecretsKeyVaultURL           = func(oldObj *fleet.ManagementClusterStatus) *string { return &oldObj.HostedClustersSecretsKeyVaultURL }
+	toManagementClusterStatusHostedClustersManagedIdentitiesKeyVaultURL = func(oldObj *fleet.ManagementClusterStatus) *string {
+		return &oldObj.HostedClustersManagedIdentitiesKeyVaultURL
+	}
+	toManagementClusterStatusHostedClustersSecretsKeyVaultManagedIdentityClientID = func(oldObj *fleet.ManagementClusterStatus) *string {
+		return &oldObj.HostedClustersSecretsKeyVaultManagedIdentityClientID
+	}
+	toManagementClusterStatusClusterServiceProvisionShardID = func(oldObj *fleet.ManagementClusterStatus) *api.InternalID {
+		return oldObj.ClusterServiceProvisionShardID
+	}
+	toManagementClusterStatusMaestroConsumerName = func(oldObj *fleet.ManagementClusterStatus) *string {
+		return &oldObj.MaestroConsumerName
+	}
+	toManagementClusterStatusMaestroRESTAPIURL = func(oldObj *fleet.ManagementClusterStatus) *string {
+		return &oldObj.MaestroRESTAPIURL
+	}
+	toManagementClusterStatusMaestroGRPCTarget = func(oldObj *fleet.ManagementClusterStatus) *string {
+		return &oldObj.MaestroGRPCTarget
+	}
+)
+
+func validateManagementClusterStatus(ctx context.Context, op operation.Operation, fldPath *field.Path, newObj, oldObj *fleet.ManagementClusterStatus) field.ErrorList {
+	errs := field.ErrorList{}
+
+	// AKSResourceID — required, validated as AKS resource type, immutable
+	errs = append(errs, validate.RequiredPointer(ctx, op, fldPath.Child("aksResourceID"), newObj.AKSResourceID, safe.Field(oldObj, toManagementClusterStatusAKSResourceID))...)
+ errs = append(errs, RestrictedResourceIDWithResourceGroup(ctx, op, fldPath.Child("aksResourceID"), newObj.AKSResourceID, safe.Field(oldObj, toManagementClusterStatusAKSResourceID), "Microsoft.ContainerService/managedClusters")...) + errs = append(errs, immutableByReflect(ctx, op, fldPath.Child("aksResourceID"), newObj.AKSResourceID, safe.Field(oldObj, toManagementClusterStatusAKSResourceID))...) + + // PublicDNSZoneResourceID — required, validated as DNS zone resource type, immutable + errs = append(errs, validate.RequiredPointer(ctx, op, fldPath.Child("publicDNSZoneResourceID"), newObj.PublicDNSZoneResourceID, safe.Field(oldObj, toManagementClusterStatusPublicDNSZoneResourceID))...) + errs = append(errs, RestrictedResourceIDWithResourceGroup(ctx, op, fldPath.Child("publicDNSZoneResourceID"), newObj.PublicDNSZoneResourceID, safe.Field(oldObj, toManagementClusterStatusPublicDNSZoneResourceID), "Microsoft.Network/dnszones")...) + errs = append(errs, immutableByReflect(ctx, op, fldPath.Child("publicDNSZoneResourceID"), newObj.PublicDNSZoneResourceID, safe.Field(oldObj, toManagementClusterStatusPublicDNSZoneResourceID))...) + + // HostedClustersSecretsKeyVaultURL — required, validated as URL, immutable + errs = append(errs, validate.RequiredValue(ctx, op, fldPath.Child("hostedClustersSecretsKeyVaultURL"), &newObj.HostedClustersSecretsKeyVaultURL, safe.Field(oldObj, toManagementClusterStatusHostedClustersSecretsKeyVaultURL))...) + errs = append(errs, URL(ctx, op, fldPath.Child("hostedClustersSecretsKeyVaultURL"), &newObj.HostedClustersSecretsKeyVaultURL, safe.Field(oldObj, toManagementClusterStatusHostedClustersSecretsKeyVaultURL))...) + errs = append(errs, immutableByCompare(ctx, op, fldPath.Child("hostedClustersSecretsKeyVaultURL"), &newObj.HostedClustersSecretsKeyVaultURL, safe.Field(oldObj, toManagementClusterStatusHostedClustersSecretsKeyVaultURL))...) 
+ + // HostedClustersManagedIdentitiesKeyVaultURL — required, validated as URL, immutable + errs = append(errs, validate.RequiredValue(ctx, op, fldPath.Child("hostedClustersManagedIdentitiesKeyVaultURL"), &newObj.HostedClustersManagedIdentitiesKeyVaultURL, safe.Field(oldObj, toManagementClusterStatusHostedClustersManagedIdentitiesKeyVaultURL))...) + errs = append(errs, URL(ctx, op, fldPath.Child("hostedClustersManagedIdentitiesKeyVaultURL"), &newObj.HostedClustersManagedIdentitiesKeyVaultURL, safe.Field(oldObj, toManagementClusterStatusHostedClustersManagedIdentitiesKeyVaultURL))...) + errs = append(errs, immutableByCompare(ctx, op, fldPath.Child("hostedClustersManagedIdentitiesKeyVaultURL"), &newObj.HostedClustersManagedIdentitiesKeyVaultURL, safe.Field(oldObj, toManagementClusterStatusHostedClustersManagedIdentitiesKeyVaultURL))...) + + // HostedClustersSecretsKeyVaultManagedIdentityClientID — required, validated as UUID, immutable + errs = append(errs, validate.RequiredValue(ctx, op, fldPath.Child("hostedClustersSecretsKeyVaultManagedIdentityClientID"), &newObj.HostedClustersSecretsKeyVaultManagedIdentityClientID, safe.Field(oldObj, toManagementClusterStatusHostedClustersSecretsKeyVaultManagedIdentityClientID))...) + errs = append(errs, ValidateUUID(ctx, op, fldPath.Child("hostedClustersSecretsKeyVaultManagedIdentityClientID"), &newObj.HostedClustersSecretsKeyVaultManagedIdentityClientID, safe.Field(oldObj, toManagementClusterStatusHostedClustersSecretsKeyVaultManagedIdentityClientID))...) + errs = append(errs, immutableByCompare(ctx, op, fldPath.Child("hostedClustersSecretsKeyVaultManagedIdentityClientID"), &newObj.HostedClustersSecretsKeyVaultManagedIdentityClientID, safe.Field(oldObj, toManagementClusterStatusHostedClustersSecretsKeyVaultManagedIdentityClientID))...) 
+ + // ClusterServiceProvisionShardID — required, immutable + errs = append(errs, validate.RequiredPointer(ctx, op, fldPath.Child("clusterServiceProvisionShardID"), newObj.ClusterServiceProvisionShardID, safe.Field(oldObj, toManagementClusterStatusClusterServiceProvisionShardID))...) + errs = append(errs, immutableByReflect(ctx, op, fldPath.Child("clusterServiceProvisionShardID"), newObj.ClusterServiceProvisionShardID, safe.Field(oldObj, toManagementClusterStatusClusterServiceProvisionShardID))...) + + // MaestroConsumerName — required, immutable + errs = append(errs, validate.RequiredValue(ctx, op, fldPath.Child("maestroConsumerName"), &newObj.MaestroConsumerName, safe.Field(oldObj, toManagementClusterStatusMaestroConsumerName))...) + errs = append(errs, immutableByCompare(ctx, op, fldPath.Child("maestroConsumerName"), &newObj.MaestroConsumerName, safe.Field(oldObj, toManagementClusterStatusMaestroConsumerName))...) + + // MaestroRESTAPIURL — required, validated as URL, immutable + errs = append(errs, validate.RequiredValue(ctx, op, fldPath.Child("maestroRESTAPIURL"), &newObj.MaestroRESTAPIURL, safe.Field(oldObj, toManagementClusterStatusMaestroRESTAPIURL))...) + errs = append(errs, URL(ctx, op, fldPath.Child("maestroRESTAPIURL"), &newObj.MaestroRESTAPIURL, safe.Field(oldObj, toManagementClusterStatusMaestroRESTAPIURL))...) + errs = append(errs, immutableByCompare(ctx, op, fldPath.Child("maestroRESTAPIURL"), &newObj.MaestroRESTAPIURL, safe.Field(oldObj, toManagementClusterStatusMaestroRESTAPIURL))...) + + // MaestroGRPCTarget — required, validated as HostPort, immutable + errs = append(errs, validate.RequiredValue(ctx, op, fldPath.Child("maestroGRPCTarget"), &newObj.MaestroGRPCTarget, safe.Field(oldObj, toManagementClusterStatusMaestroGRPCTarget))...) + errs = append(errs, HostPort(ctx, op, fldPath.Child("maestroGRPCTarget"), &newObj.MaestroGRPCTarget, safe.Field(oldObj, toManagementClusterStatusMaestroGRPCTarget))...) 
+ errs = append(errs, immutableByCompare(ctx, op, fldPath.Child("maestroGRPCTarget"), &newObj.MaestroGRPCTarget, safe.Field(oldObj, toManagementClusterStatusMaestroGRPCTarget))...) + + return errs +} diff --git a/internal/validation/validate_management_cluster_test.go b/internal/validation/validate_management_cluster_test.go new file mode 100644 index 00000000000..3cfccea2dd0 --- /dev/null +++ b/internal/validation/validate_management_cluster_test.go @@ -0,0 +1,389 @@ +// Copyright 2026 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package validation
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"k8s.io/utils/ptr"
+
+	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
+
+	"github.com/Azure/ARO-HCP/internal/api"
+	"github.com/Azure/ARO-HCP/internal/api/arm"
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
+)
+
+func validManagementCluster(t *testing.T) *fleet.ManagementCluster {
+	t.Helper()
+	resourceID := api.Must(fleet.ToManagementClusterResourceID("1"))
+	return &fleet.ManagementCluster{
+		CosmosMetadata: arm.CosmosMetadata{
+			ResourceID: resourceID,
+		},
+		ResourceID: resourceID,
+		Spec: fleet.ManagementClusterSpec{
+			SchedulingPolicy: fleet.ManagementClusterSchedulingPolicySchedulable,
+		},
+		Status: fleet.ManagementClusterStatus{
+			AKSResourceID:                                        api.Must(azcorearm.ParseResourceID("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/pers-westus3-mgmt-1")),
+			PublicDNSZoneResourceID:                              api.Must(azcorearm.ParseResourceID("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg/providers/Microsoft.Network/dnszones/example.com")),
+			HostedClustersSecretsKeyVaultURL:                     "https://kv-cx-secrets.vault.azure.net",
+			HostedClustersManagedIdentitiesKeyVaultURL:           "https://kv-cx-mi.vault.azure.net",
+			HostedClustersSecretsKeyVaultManagedIdentityClientID: "12345678-1234-1234-1234-123456789012",
+			ClusterServiceProvisionShardID:                       ptr.To(api.Must(api.NewInternalID("/api/aro_hcp/v1alpha1/provision_shards/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"))),
+			MaestroConsumerName:                                  "hcp-underlay-westus3-mgmt-1",
+			MaestroRESTAPIURL:                                    "http://maestro.maestro.svc.cluster.local:8000",
+			MaestroGRPCTarget:                                    "maestro-grpc.maestro.svc.cluster.local:8090",
+		},
+	}
+}
+
+func TestValidateManagementClusterCreate(t *testing.T) {
+	t.Parallel()
+
+	type expectedError struct {
+		message   string
+		fieldPath string
+	}
+
+	tests := []struct {
+		name         string
+		modify       func(t *testing.T, mc *fleet.ManagementCluster)
+		expectErrors []expectedError
+	}{
+		{
+			name:         "valid create",
+			modify:       func(t *testing.T, mc *fleet.ManagementCluster) {},
+			expectErrors: nil,
+		},
+		// ResourceID
+		{
+			name: "missing resourceId",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.ResourceID = nil
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "resourceId", message: "Required"},
+			},
+		},
+		// Stamp identifier (resourceId.parent.name) — must be 1-3 lowercase alphanumeric
+		{
+			name: "stamp identifier with uppercase rejected",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.ResourceID = api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/ABC/managementClusters/default"))
+				mc.CosmosMetadata.ResourceID = mc.ResourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "resourceId.parent.name", message: "must be 1-3 lowercase alphanumeric characters"},
+			},
+		},
+		{
+			name: "stamp identifier too long rejected",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.ResourceID = api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/abcd/managementClusters/default"))
+				mc.CosmosMetadata.ResourceID = mc.ResourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "resourceId.parent.name", message: "must be 1-3 lowercase alphanumeric characters"},
+			},
+		},
+		{
+			name: "stamp identifier with special chars rejected",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.ResourceID = api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/a-b/managementClusters/default"))
+				mc.CosmosMetadata.ResourceID = mc.ResourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "resourceId.parent.name", message: "must be 1-3 lowercase alphanumeric characters"},
+			},
+		},
+		{
+			name: "stamp identifier single char accepted",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.ResourceID = api.Must(fleet.ToManagementClusterResourceID("a"))
+				mc.CosmosMetadata.ResourceID = mc.ResourceID
+			},
+			expectErrors: nil,
+		},
+		{
+			name: "stamp identifier three chars accepted",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.ResourceID = api.Must(fleet.ToManagementClusterResourceID("ab3"))
+				mc.CosmosMetadata.ResourceID = mc.ResourceID
+			},
+			expectErrors: nil,
+		},
+		// SchedulingPolicy
+		{
+			name: "empty schedulingPolicy rejected",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Spec.SchedulingPolicy = ""
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "spec.schedulingPolicy", message: "Required"},
+				{fieldPath: "spec.schedulingPolicy", message: "Unsupported value"},
+			},
+		},
+		{
+			name: "invalid schedulingPolicy rejected",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Spec.SchedulingPolicy = "InvalidValue"
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "spec.schedulingPolicy", message: "Unsupported value"},
+			},
+		},
+		{
+			name: "Unschedulable schedulingPolicy accepted",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Spec.SchedulingPolicy = fleet.ManagementClusterSchedulingPolicyUnschedulable
+			},
+			expectErrors: nil,
+		},
+		// Status — all fields required
+		{
+			name: "empty status rejected",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status = fleet.ManagementClusterStatus{}
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.aksResourceID", message: "Required"},
+				{fieldPath: "status.publicDNSZoneResourceID", message: "Required"},
+				{fieldPath: "status.hostedClustersSecretsKeyVaultURL", message: "Required"},
+				{fieldPath: "status.hostedClustersManagedIdentitiesKeyVaultURL", message: "Required"},
+				{fieldPath: "status.hostedClustersSecretsKeyVaultManagedIdentityClientID", message: "Required"},
+				{fieldPath: "status.clusterServiceProvisionShardID", message: "Required"},
+				{fieldPath: "status.maestroConsumerName", message: "Required"},
+				{fieldPath: "status.maestroRESTAPIURL", message: "Required"},
+				{fieldPath: "status.maestroGRPCTarget", message: "Required"},
+			},
+		},
+		{
+			name: "missing aksResourceID rejected",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.AKSResourceID = nil
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.aksResourceID", message: "Required"},
+			},
+		},
+		{
+			name: "invalid hostedClustersSecretsKeyVaultManagedIdentityClientID",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.HostedClustersSecretsKeyVaultManagedIdentityClientID = "not-a-uuid"
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.hostedClustersSecretsKeyVaultManagedIdentityClientID", message: "invalid"},
+			},
+		},
+		{
+			name: "invalid maestroGRPCTarget format rejected",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.MaestroGRPCTarget = "missing-port"
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.maestroGRPCTarget", message: "must be host:port"},
+			},
+		},
+		{
+			name: "invalid maestroGRPCTarget host rejected",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.MaestroGRPCTarget = "not_a_valid_host:8090"
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.maestroGRPCTarget", message: "invalid host"},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			mc := validManagementCluster(t)
+			tt.modify(t, mc)
+			errs := ValidateManagementClusterCreate(context.Background(), mc)
+
+			if len(tt.expectErrors) == 0 {
+				if len(errs) != 0 {
+					t.Errorf("expected no errors, got %d: %v", len(errs), errs)
+				}
+				return
+			}
+			for _, expectedErr := range tt.expectErrors {
+				found := false
+				for _, err := range errs {
+					if strings.Contains(err.Error(), expectedErr.message) && strings.Contains(err.Field, expectedErr.fieldPath) {
+						found = true
+						break
+					}
+				}
+				if !found {
+					t.Errorf("expected error containing message %q at field %q but not found in: %v", expectedErr.message, expectedErr.fieldPath, errs)
+				}
+			}
+		})
+	}
+}
+
+func TestValidateManagementClusterUpdate(t *testing.T) {
+	t.Parallel()
+
+	type expectedError struct {
+		message   string
+		fieldPath string
+	}
+
+	tests := []struct {
+		name         string
+		modify       func(t *testing.T, mc *fleet.ManagementCluster)
+		expectErrors []expectedError
+	}{
+		{
+			name:         "valid update - no changes",
+			modify:       func(t *testing.T, mc *fleet.ManagementCluster) {},
+			expectErrors: nil,
+		},
+		{
+			name: "valid update - change schedulingPolicy",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Spec.SchedulingPolicy = fleet.ManagementClusterSchedulingPolicyUnschedulable
+			},
+			expectErrors: nil,
+		},
+		// Immutability checks
+		{
+			name: "aksResourceID changed",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.AKSResourceID = api.Must(azcorearm.ParseResourceID("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/different-name"))
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.aksResourceID", message: "immutable"},
+			},
+		},
+		{
+			name: "publicDNSZoneResourceID changed",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.PublicDNSZoneResourceID = api.Must(azcorearm.ParseResourceID("/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg/providers/Microsoft.Network/dnszones/other.com"))
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.publicDNSZoneResourceID", message: "immutable"},
+			},
+		},
+		{
+			name: "hostedClustersSecretsKeyVaultURL changed",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.HostedClustersSecretsKeyVaultURL = "https://kv-other.vault.azure.net"
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.hostedClustersSecretsKeyVaultURL", message: "immutable"},
+			},
+		},
+		{
+			name: "hostedClustersManagedIdentitiesKeyVaultURL changed",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.HostedClustersManagedIdentitiesKeyVaultURL = "https://kv-other.vault.azure.net"
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.hostedClustersManagedIdentitiesKeyVaultURL", message: "immutable"},
+			},
+		},
+		{
+			name: "hostedClustersSecretsKeyVaultManagedIdentityClientID changed",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.HostedClustersSecretsKeyVaultManagedIdentityClientID = "99999999-9999-9999-9999-999999999999"
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.hostedClustersSecretsKeyVaultManagedIdentityClientID", message: "immutable"},
+			},
+		},
+		{
+			name: "clusterServiceProvisionShardID changed",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.ClusterServiceProvisionShardID = ptr.To(api.Must(api.NewInternalID("/api/aro_hcp/v1alpha1/provision_shards/11111111-2222-3333-4444-555555555555")))
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.clusterServiceProvisionShardID", message: "immutable"},
+			},
+		},
+		{
+			name: "maestroConsumerName changed",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.MaestroConsumerName = "different-consumer"
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.maestroConsumerName", message: "immutable"},
+			},
+		},
+		{
+			name: "maestroRESTAPIURL changed",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.MaestroRESTAPIURL = "http://different:8000"
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.maestroRESTAPIURL", message: "immutable"},
+			},
+		},
+		{
+			name: "maestroGRPCTarget changed",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.Status.MaestroGRPCTarget = "different:8090"
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "status.maestroGRPCTarget", message: "immutable"},
+			},
+		},
+		{
+			name: "resourceId changed",
+			modify: func(t *testing.T, mc *fleet.ManagementCluster) {
+				mc.ResourceID = api.Must(fleet.ToManagementClusterResourceID("x2"))
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "resourceId", message: "immutable"},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			oldObj := validManagementCluster(t)
+			newObj := validManagementCluster(t)
+			tt.modify(t, newObj)
+			errs := ValidateManagementClusterUpdate(context.Background(), newObj, oldObj)
+
+			if len(tt.expectErrors) == 0 {
+				if len(errs) != 0 {
+					t.Errorf("expected no errors, got %d: %v", len(errs), errs)
+				}
+				return
+			}
+			for _, expectedErr := range tt.expectErrors {
+				found := false
+				for _, err := range errs {
+					if strings.Contains(err.Error(), expectedErr.message) && strings.Contains(err.Field, expectedErr.fieldPath) {
+						found = true
+						break
+					}
+				}
+				if !found {
+					t.Errorf("expected error containing message %q at field %q but not found in: %v", expectedErr.message, expectedErr.fieldPath, errs)
+				}
+			}
+		})
+	}
+}
diff --git a/internal/validation/validate_stamp.go b/internal/validation/validate_stamp.go
new file mode 100644
index 00000000000..bed6cacf7c0
--- /dev/null
+++ b/internal/validation/validate_stamp.go
@@ -0,0 +1,48 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package validation
+
+import (
+	"context"
+
+	"k8s.io/apimachinery/pkg/util/validation/field"
+
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
+)
+
+// ValidateStampCreate validates a Stamp for creation; only its stamp identifier is checked and the context is unused.
+func ValidateStampCreate(_ context.Context, stamp *fleet.Stamp) field.ErrorList {
+	return validateStampIdentifier(stamp)
+}
+
+// ValidateStampUpdate validates newStamp for update. It applies the same stamp
+// identifier check as creation; the context and the old Stamp are ignored.
+func ValidateStampUpdate(_ context.Context, newStamp *fleet.Stamp, _ *fleet.Stamp) field.ErrorList {
+	return validateStampIdentifier(newStamp)
+}
+
+// validateStampIdentifier returns a field.Invalid error at cosmosMetadata.resourceID
+// when stamp.GetStampIdentifier() does not match stampIdentifierRegex, and nil otherwise.
+func validateStampIdentifier(stamp *fleet.Stamp) field.ErrorList {
+	id := stamp.GetStampIdentifier()
+	if stampIdentifierRegex.MatchString(id) {
+		return nil
+	}
+	return field.ErrorList{field.Invalid(
+		field.NewPath("cosmosMetadata", "resourceID"),
+		id,
+		"stamp identifier must match [0-9a-z]{1,3}",
+	)}
+}
diff --git a/internal/validation/validate_stamp_test.go b/internal/validation/validate_stamp_test.go
new file mode 100644
index 00000000000..4763ecd783f
--- /dev/null
+++ b/internal/validation/validate_stamp_test.go
@@ -0,0 +1,320 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package validation
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
+
+	"github.com/Azure/ARO-HCP/internal/api"
+	"github.com/Azure/ARO-HCP/internal/api/fleet"
+)
+
+func validStamp(t *testing.T) *fleet.Stamp {
+	t.Helper()
+	resourceID := api.Must(fleet.ToStampResourceID("1"))
+	return &fleet.Stamp{
+		CosmosMetadata: api.CosmosMetadata{
+			ResourceID: resourceID,
+		},
+		ResourceID: resourceID,
+	}
+}
+
+func TestValidateStampCreate(t *testing.T) {
+	t.Parallel()
+
+	type expectedError struct {
+		message   string
+		fieldPath string
+	}
+
+	tests := []struct {
+		name         string
+		modify       func(t *testing.T, s *fleet.Stamp)
+		expectErrors []expectedError
+	}{
+		// Valid cases
+		{
+			name:         "valid single char digit",
+			modify:       func(t *testing.T, s *fleet.Stamp) {},
+			expectErrors: nil,
+		},
+		{
+			name: "valid two chars letters",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(fleet.ToStampResourceID("ab"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: nil,
+		},
+		{
+			name: "valid three chars mixed",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(fleet.ToStampResourceID("1a2"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: nil,
+		},
+		{
+			name: "valid three chars all digits",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(fleet.ToStampResourceID("123"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: nil,
+		},
+		{
+			name: "valid three chars all letters",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(fleet.ToStampResourceID("abc"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: nil,
+		},
+		// Invalid cases
+		{
+			name: "empty stamp identifier rejected",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				s.CosmosMetadata.ResourceID = nil
+				s.ResourceID = nil
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "cosmosMetadata.resourceID", message: "stamp identifier must match"},
+			},
+		},
+		{
+			name: "four chars rejected",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/abcd"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "cosmosMetadata.resourceID", message: "stamp identifier must match"},
+			},
+		},
+		{
+			name: "uppercase rejected",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/ABC"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "cosmosMetadata.resourceID", message: "stamp identifier must match"},
+			},
+		},
+		{
+			name: "special chars rejected",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/a-b"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "cosmosMetadata.resourceID", message: "stamp identifier must match"},
+			},
+		},
+		{
+			name: "spaces rejected",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/a b"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "cosmosMetadata.resourceID", message: "stamp identifier must match"},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			s := validStamp(t)
+			tt.modify(t, s)
+			errs := ValidateStampCreate(context.Background(), s)
+
+			if len(tt.expectErrors) == 0 {
+				if len(errs) != 0 {
+					t.Errorf("expected no errors, got %d: %v", len(errs), errs)
+				}
+				return
+			}
+			for _, expectedErr := range tt.expectErrors {
+				found := false
+				for _, err := range errs {
+					if strings.Contains(err.Error(), expectedErr.message) && strings.Contains(err.Field, expectedErr.fieldPath) {
+						found = true
+						break
+					}
+				}
+				if !found {
+					t.Errorf("expected error containing message %q at field %q but not found in: %v", expectedErr.message, expectedErr.fieldPath, errs)
+				}
+			}
+		})
+	}
+}
+
+func TestValidateStampUpdate(t *testing.T) {
+	t.Parallel()
+
+	type expectedError struct {
+		message   string
+		fieldPath string
+	}
+
+	tests := []struct {
+		name         string
+		modify       func(t *testing.T, s *fleet.Stamp)
+		expectErrors []expectedError
+	}{
+		// Valid cases
+		{
+			name:         "valid update no changes",
+			modify:       func(t *testing.T, s *fleet.Stamp) {},
+			expectErrors: nil,
+		},
+		{
+			name: "valid two chars letters",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(fleet.ToStampResourceID("ab"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: nil,
+		},
+		{
+			name: "valid three chars mixed",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(fleet.ToStampResourceID("1a2"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: nil,
+		},
+		{
+			name: "valid three chars all digits",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(fleet.ToStampResourceID("123"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: nil,
+		},
+		{
+			name: "valid three chars all letters",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(fleet.ToStampResourceID("abc"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: nil,
+		},
+		// Invalid cases
+		{
+			name: "empty stamp identifier rejected",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				s.CosmosMetadata.ResourceID = nil
+				s.ResourceID = nil
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "cosmosMetadata.resourceID", message: "stamp identifier must match"},
+			},
+		},
+		{
+			name: "four chars rejected",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/abcd"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "cosmosMetadata.resourceID", message: "stamp identifier must match"},
+			},
+		},
+		{
+			name: "uppercase rejected",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/ABC"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "cosmosMetadata.resourceID", message: "stamp identifier must match"},
+			},
+		},
+		{
+			name: "special chars rejected",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/a-b"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "cosmosMetadata.resourceID", message: "stamp identifier must match"},
+			},
+		},
+		{
+			name: "spaces rejected",
+			modify: func(t *testing.T, s *fleet.Stamp) {
+				resourceID := api.Must(azcorearm.ParseResourceID("/providers/Microsoft.RedHatOpenShift/stamps/a b"))
+				s.CosmosMetadata.ResourceID = resourceID
+				s.ResourceID = resourceID
+			},
+			expectErrors: []expectedError{
+				{fieldPath: "cosmosMetadata.resourceID", message: "stamp identifier must match"},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			oldObj := validStamp(t)
+			newObj := validStamp(t)
+			tt.modify(t, newObj)
+			errs := ValidateStampUpdate(context.Background(), newObj, oldObj)
+
+			if len(tt.expectErrors) == 0 {
+				if len(errs) != 0 {
+					t.Errorf("expected no errors, got %d: %v", len(errs), errs)
+				}
+				return
+			}
+			for _, expectedErr := range tt.expectErrors {
+				found := false
+				for _, err := range errs {
+					if strings.Contains(err.Error(), expectedErr.message) && strings.Contains(err.Field, expectedErr.fieldPath) {
+						found = true
+						break
+					}
+				}
+				if !found {
+					t.Errorf("expected error containing message %q at field %q but not found in: %v", expectedErr.message, expectedErr.fieldPath, errs)
+				}
+			}
+		})
+	}
+}
diff --git a/internal/validation/validators.go b/internal/validation/validators.go
index 65971d6a7ce..ea90908a610 100644
--- a/internal/validation/validators.go
+++ b/internal/validation/validators.go
@@ -414,6 +414,61 @@ func IPv4(_ context.Context, _ operation.Operation, fldPath *field.Path, value,
 	return nil
 }
 
+// HostPort validates that value is a "host:port" pair as split by
+// net.SplitHostPort. A nil or empty value is accepted, so pair this validator
+// with a required check when the field is mandatory. The host must be non-empty
+// and either pass isIpAddress or be a DNS-1123 subdomain, and the port must be
+// a decimal number between 1 and 65535. The old value is ignored.
+func HostPort(_ context.Context, _ operation.Operation, fldPath *field.Path, value, _ *string) field.ErrorList {
+	if value == nil || len(*value) == 0 {
+		return nil
+	}
+
+	host, port, err := net.SplitHostPort(*value)
+	if err != nil {
+		return field.ErrorList{field.Invalid(fldPath, *value, fmt.Sprintf("must be host:port: %s", err))}
+	}
+	if len(host) == 0 {
+		return field.ErrorList{field.Invalid(fldPath, *value, "host must not be empty")}
+	}
+
+	var errs field.ErrorList
+
+	// net.SplitHostPort only splits the string and never inspects the port, so
+	// inputs such as "host:" and "host:abc" have to be rejected here.
+	if !isValidPortNumber(port) {
+		errs = append(errs, field.Invalid(fldPath, *value, fmt.Sprintf("invalid port %q: must be a number between 1 and 65535", port)))
+	}
+
+	if !isIpAddress(host) {
+		for _, msg := range k8svalidation.IsDNS1123Subdomain(host) {
+			errs = append(errs, field.Invalid(fldPath, *value, fmt.Sprintf("invalid host %q: %s", host, msg)))
+		}
+	}
+
+	return errs
+}
+
+// isValidPortNumber reports whether port consists solely of ASCII digits and
+// denotes a value between 1 and 65535.
+func isValidPortNumber(port string) bool {
+	if len(port) == 0 {
+		return false
+	}
+	n := 0
+	for i := 0; i < len(port); i++ {
+		c := port[i]
+		if c < '0' || c > '9' {
+			return false
+		}
+		n = n*10 + int(c-'0')
+		if n > 65535 {
+			return false // stopping here also keeps n from overflowing on very long inputs
+		}
+	}
+	return n >= 1
+}
+
 func URL(_ context.Context, _ operation.Operation, fldPath *field.Path, value, _ *string) field.ErrorList {
 	if value == nil {
 		return nil
diff --git a/internal/validation/validators_test.go b/internal/validation/validators_test.go
new file mode 100644
index 00000000000..2e20d6564a4
--- /dev/null
+++ b/internal/validation/validators_test.go
@@ -0,0 +1,119 @@
+// Copyright 2026 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package validation + +import ( + "context" + "strings" + "testing" + + "k8s.io/apimachinery/pkg/api/operation" + "k8s.io/apimachinery/pkg/util/validation/field" + "k8s.io/utils/ptr" +) + +func TestHostPort(t *testing.T) { + ctx := context.Background() + op := operation.Operation{Type: operation.Create} + fldPath := field.NewPath("target") + + tests := []struct { + name string + value *string + expectError bool + errContains string + }{ + { + name: "nil value accepted", + value: nil, + }, + { + name: "empty value accepted", + value: ptr.To(""), + }, + { + name: "valid DNS host with port", + value: ptr.To("maestro.example.com:8090"), + }, + { + name: "valid short DNS host with port", + value: ptr.To("maestro:8090"), + }, + { + name: "valid IPv4 with port", + value: ptr.To("10.0.0.1:8090"), + }, + { + name: "valid IPv6 with port", + value: ptr.To("[::1]:8090"), + }, + { + name: "missing port rejected", + value: ptr.To("maestro.example.com"), + expectError: true, + errContains: "must be host:port", + }, + { + name: "empty host rejected", + value: ptr.To(":8090"), + expectError: true, + errContains: "host must not be empty", + }, + { + name: "underscore in host rejected", + value: ptr.To("not_valid:8090"), + expectError: true, + errContains: "invalid host", + }, + { + name: "uppercase in host rejected", + value: ptr.To("NOT-VALID:8090"), + expectError: true, + errContains: "invalid host", + }, + { + name: "trailing dot in host rejected", + value: ptr.To("invalid.:8090"), + 
expectError: true, + errContains: "invalid host", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + errs := HostPort(ctx, op, fldPath, tt.value, nil) + if tt.expectError { + if len(errs) == 0 { + t.Errorf("expected error containing %q, got none", tt.errContains) + return + } + found := false + for _, e := range errs { + if strings.Contains(e.Error(), tt.errContains) { + found = true + break + } + } + if !found { + t.Errorf("expected error containing %q, got %v", tt.errContains, errs) + } + } else { + if len(errs) != 0 { + t.Errorf("expected no errors, got %v", errs) + } + } + }) + } +} diff --git a/test-integration/admin/artifacts/AdminCRUD/HCP/breakglass/01-loadClusterService-initial-state/02-provisionshard.json b/test-integration/admin/artifacts/AdminCRUD/HCP/breakglass/01-loadClusterService-initial-state/02-provisionshard.json index b3990bfcd0c..0b60486cc69 100644 --- a/test-integration/admin/artifacts/AdminCRUD/HCP/breakglass/01-loadClusterService-initial-state/02-provisionshard.json +++ b/test-integration/admin/artifacts/AdminCRUD/HCP/breakglass/01-loadClusterService-initial-state/02-provisionshard.json @@ -13,8 +13,8 @@ "kind": "CloudProvider" }, "creation_timestamp": "2026-02-02T10:21:43.084787Z", - "href": "/api/aro_hcp/v1alpha1/provision_shards/fixed-shard-value", - "id": "fixed-shard-value", + "href": "/api/aro_hcp/v1alpha1/provision_shards/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee", + "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee", "kind": "ProvisionShard", "maestro_config": { "consumer_name": "hcp-underlay-usw3gobe-mgmt-1", diff --git a/test-integration/backend/launch/metrics_test.go b/test-integration/backend/launch/metrics_test.go index 67b1d682943..3b63b9ff6ce 100644 --- a/test-integration/backend/launch/metrics_test.go +++ b/test-integration/backend/launch/metrics_test.go @@ -41,6 +41,7 @@ import ( "github.com/Azure/ARO-HCP/backend/pkg/app" "github.com/Azure/ARO-HCP/internal/api" "github.com/Azure/ARO-HCP/internal/api/arm" + 
"github.com/Azure/ARO-HCP/internal/databasetesting" "github.com/Azure/ARO-HCP/internal/ocm" "github.com/Azure/ARO-HCP/internal/utils" "github.com/Azure/ARO-HCP/test-integration/utils/integrationutils" @@ -103,6 +104,7 @@ func TestBackendExposesMetrics(t *testing.T) { LeaderElectionLock: newFakeLeaderElectionLock("metrics-test"), ResourcesDBClient: resourcesDBClient, BillingDBClient: billingDBClient, + FleetDBClient: databasetesting.NewMockFleetDBClient(), ClustersServiceClient: clusterServiceMock.MockClusterServiceClient, MetricsRegisterer: registry, MetricsGatherer: registry, diff --git a/test-integration/utils/integrationutils/cluster_service_mock.go b/test-integration/utils/integrationutils/cluster_service_mock.go index d59a5673a57..aa5e6d02161 100644 --- a/test-integration/utils/integrationutils/cluster_service_mock.go +++ b/test-integration/utils/integrationutils/cluster_service_mock.go @@ -255,6 +255,17 @@ func (s *ClusterServiceMock) setupMockClusterService(t *testing.T) { } return ret, nil }).AnyTimes() + s.MockClusterServiceClient.EXPECT().ListProvisionShards().DoAndReturn(func() ocm.ProvisionShardListIterator { + allObjs := []*csarhcpv1alpha1.ProvisionShard{} + for _, key := range sets.StringKeySet(internalIDToProvisionShard).List() { + obj, err := mergeClusterServiceInstance[csarhcpv1alpha1.ProvisionShard](internalIDToProvisionShard[key]) + if err != nil { + panic(fmt.Errorf("failed to merge provision shard id %q: %w", key, err)) + } + allObjs = append(allObjs, obj) + } + return ocm.NewSimpleProvisionShardListIterator(allObjs, nil) + }).AnyTimes() s.MockClusterServiceClient.EXPECT().GetClusterHypershiftDetails(gomock.Any(), gomock.Any()).DoAndReturn(func(ctx context.Context, id ocm.InternalID) (*cmv1.HypershiftConfig, error) { ret, err := mergeClusterServiceInstance[cmv1.HypershiftConfig](internalIDToHypershiftDetails[id.String()]) if err != nil {