From feebecc0a97dfa195481b78655f25e2259853b02 Mon Sep 17 00:00:00 2001 From: iaroslav-reflection Date: Mon, 1 Jun 2026 15:36:02 +0000 Subject: [PATCH] Fix ResourceClaim cleanup on v1beta2 clusters --- .../components/resourceclaim/resourceclaim.go | 11 +- .../components/resourceclaim/resourceclaim.go | 11 +- operator/internal/resourceclaim/reconcile.go | 44 +++++++- .../internal/resourceclaim/reconcile_test.go | 106 ++++++++++++++++++ .../internal/resourceclaim/resolve_test.go | 2 + 5 files changed, 158 insertions(+), 16 deletions(-) diff --git a/operator/internal/controller/podclique/components/resourceclaim/resourceclaim.go b/operator/internal/controller/podclique/components/resourceclaim/resourceclaim.go index 2bb362709..f89c1c458 100644 --- a/operator/internal/controller/podclique/components/resourceclaim/resourceclaim.go +++ b/operator/internal/controller/podclique/components/resourceclaim/resourceclaim.go @@ -30,7 +30,6 @@ import ( k8sutils "github.com/ai-dynamo/grove/operator/internal/utils/kubernetes" "github.com/go-logr/logr" - resourcev1 "k8s.io/api/resource/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -57,13 +56,11 @@ func New(client client.Client, scheme *runtime.Scheme) component.Operator[grovec // GetExistingResourceNames returns the names of PCLQ-level ResourceClaims // by selecting on the grove.io/podclique label that Sync stamps on each RC. func (r _resource) GetExistingResourceNames(ctx context.Context, _ logr.Logger, pclqObjMeta metav1.ObjectMeta) ([]string, error) { - objMetaList := &metav1.PartialObjectMetadataList{} - objMetaList.SetGroupVersionKind(resourcev1.SchemeGroupVersion.WithKind("ResourceClaim")) - if err := r.client.List(ctx, - objMetaList, + objMetaList, err := resourceclaim.ListResourceClaimMetadata(ctx, r.client, client.InNamespace(pclqObjMeta.Namespace), client.MatchingLabels(pclqResourceClaimLabels(pclqObjMeta)), - ); err != nil { + ) + if err != nil { return nil, groveerr.WrapError(err, errSyncPCLQLevelRC, component.OperationGetExistingResourceNames, @@ -190,7 +187,7 @@ func pclqResourceClaimLabels(pclqObjMeta metav1.ObjectMeta) map[string]string { // GC would create a deadlock since GC only fires after the PCLQ is fully deleted. func (r _resource) Delete(ctx context.Context, _ logr.Logger, pclqObjMeta metav1.ObjectMeta) error { labels := pclqResourceClaimLabels(pclqObjMeta) - if err := r.client.DeleteAllOf(ctx, &resourcev1.ResourceClaim{}, + if err := resourceclaim.DeleteResourceClaims(ctx, r.client, client.InNamespace(pclqObjMeta.Namespace), client.MatchingLabels(labels), ); err != nil { diff --git a/operator/internal/controller/podcliqueset/components/resourceclaim/resourceclaim.go b/operator/internal/controller/podcliqueset/components/resourceclaim/resourceclaim.go index 1d7525fbc..91c4790f8 100644 --- a/operator/internal/controller/podcliqueset/components/resourceclaim/resourceclaim.go +++ b/operator/internal/controller/podcliqueset/components/resourceclaim/resourceclaim.go @@ -29,7 +29,6 @@ import ( k8sutils "github.com/ai-dynamo/grove/operator/internal/utils/kubernetes" "github.com/go-logr/logr" - resourcev1 "k8s.io/api/resource/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -55,13 +54,11 @@ func New(client client.Client, scheme *runtime.Scheme) component.Operator[grovec // GetExistingResourceNames returns the names of all ResourceClaims owned by this PCS. func (r _resource) GetExistingResourceNames(ctx context.Context, logger logr.Logger, pcsObjMeta metav1.ObjectMeta) ([]string, error) { - objMetaList := &metav1.PartialObjectMetadataList{} - objMetaList.SetGroupVersionKind(resourcev1.SchemeGroupVersion.WithKind("ResourceClaim")) - if err := r.client.List(ctx, - objMetaList, + objMetaList, err := resourceclaim.ListResourceClaimMetadata(ctx, r.client, client.InNamespace(pcsObjMeta.Namespace), client.MatchingLabels(resourceclaim.ResourceClaimLabels(pcsObjMeta.Name)), - ); err != nil { + ) + if err != nil { return nil, groveerr.WrapError(err, errSyncPCSResourceClaim, component.OperationGetExistingResourceNames, @@ -163,7 +160,7 @@ func (r _resource) cleanupStaleResourceClaims(ctx context.Context, _ logr.Logger // levels: PCS, PCSG, and PCLQ). This is safe because Delete is only called during // PCS deletion when the entire hierarchy is being torn down. func (r _resource) Delete(ctx context.Context, _ logr.Logger, pcsObjMeta metav1.ObjectMeta) error { - if err := r.client.DeleteAllOf(ctx, &resourcev1.ResourceClaim{}, + if err := resourceclaim.DeleteResourceClaims(ctx, r.client, client.InNamespace(pcsObjMeta.Namespace), client.MatchingLabels(resourceclaim.ResourceClaimLabels(pcsObjMeta.Name)), ); err != nil { diff --git a/operator/internal/resourceclaim/reconcile.go b/operator/internal/resourceclaim/reconcile.go index b978889ca..3a64addbf 100644 --- a/operator/internal/resourceclaim/reconcile.go +++ b/operator/internal/resourceclaim/reconcile.go @@ -26,16 +26,21 @@ import ( "github.com/samber/lo" corev1 "k8s.io/api/core/v1" resourcev1 "k8s.io/api/resource/v1" + resourcev1beta2 "k8s.io/api/resource/v1beta2" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/selection" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) +const resourceClaimKind = "ResourceClaim" + // ResourceSharer is the common interface for all level-specific resource sharing // types (PCS, PCSG, PCLQ). It enables the reconciler to operate uniformly over // different resource sharing specs regardless of their filter type. @@ -125,7 +130,42 @@ func DeleteResourceClaim(ctx context.Context, cl client.Client, name, namespace rc := &resourcev1.ResourceClaim{ ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, } - return client.IgnoreNotFound(cl.Delete(ctx, rc)) + err := cl.Delete(ctx, rc) + if !meta.IsNoMatchError(err) { + return client.IgnoreNotFound(err) + } + + rcBeta := &resourcev1beta2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, + } + return client.IgnoreNotFound(cl.Delete(ctx, rcBeta)) +} + +// DeleteResourceClaims deletes all ResourceClaims matching the given options. +func DeleteResourceClaims(ctx context.Context, cl client.Client, opts ...client.DeleteAllOfOption) error { + err := cl.DeleteAllOf(ctx, &resourcev1.ResourceClaim{}, opts...) + if !meta.IsNoMatchError(err) { + return err + } + return cl.DeleteAllOf(ctx, &resourcev1beta2.ResourceClaim{}, opts...) +} + +// ListResourceClaimMetadata lists ResourceClaims matching the given options. +func ListResourceClaimMetadata(ctx context.Context, cl client.Client, opts ...client.ListOption) (*metav1.PartialObjectMetadataList, error) { + objMetaList := newResourceClaimMetadataList(resourcev1.SchemeGroupVersion) + err := cl.List(ctx, objMetaList, opts...) + if !meta.IsNoMatchError(err) { + return objMetaList, err + } + + objMetaList = newResourceClaimMetadataList(resourcev1beta2.SchemeGroupVersion) + return objMetaList, cl.List(ctx, objMetaList, opts...) +} + +func newResourceClaimMetadataList(groupVersion schema.GroupVersion) *metav1.PartialObjectMetadataList { + objMetaList := &metav1.PartialObjectMetadataList{} + objMetaList.SetGroupVersionKind(groupVersion.WithKind(resourceClaimKind)) + return objMetaList } // EnsureResourceClaims creates ResourceClaims for a list of ResourceSharer entries @@ -326,7 +366,7 @@ func CleanupStalePerReplicaRCs( sel = sel.Add(*notInReq) } - return cl.DeleteAllOf(ctx, &resourcev1.ResourceClaim{}, + return DeleteResourceClaims(ctx, cl, client.InNamespace(namespace), client.MatchingLabelsSelector{Selector: sel}, ) diff --git a/operator/internal/resourceclaim/reconcile_test.go b/operator/internal/resourceclaim/reconcile_test.go index d15791c73..1d6c43694 100644 --- a/operator/internal/resourceclaim/reconcile_test.go +++ b/operator/internal/resourceclaim/reconcile_test.go @@ -26,10 +26,14 @@ import ( "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" resourcev1 "k8s.io/api/resource/v1" + resourcev1beta2 "k8s.io/api/resource/v1beta2" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" ) // --- RCName --- @@ -490,6 +494,101 @@ func TestDeleteResourceClaim(t *testing.T) { err := DeleteResourceClaim(context.Background(), cl, "nonexistent", "default") require.NoError(t, err) }) + + t.Run("falls back to v1beta2 when v1 API is not served", func(t *testing.T) { + var calls []string + cl := fake.NewClientBuilder().WithScheme(scheme).WithInterceptorFuncs(interceptor.Funcs{ + Delete: func(_ context.Context, _ client.WithWatch, obj client.Object, _ ...client.DeleteOption) error { + switch obj.(type) { + case *resourcev1.ResourceClaim: + calls = append(calls, resourcev1.SchemeGroupVersion.String()) + return resourceClaimNoKindMatch(resourcev1.SchemeGroupVersion.Version) + case *resourcev1beta2.ResourceClaim: + calls = append(calls, resourcev1beta2.SchemeGroupVersion.String()) + return nil + default: + return fmt.Errorf("unexpected delete object type %T", obj) + } + }, + }).Build() + + err := DeleteResourceClaim(context.Background(), cl, "my-rc", "default") + require.NoError(t, err) + assert.Equal(t, []string{ + resourcev1.SchemeGroupVersion.String(), + resourcev1beta2.SchemeGroupVersion.String(), + }, calls) + }) +} + +func TestDeleteResourceClaims(t *testing.T) { + scheme := newTestScheme() + + t.Run("falls back to v1beta2 when v1 API is not served", func(t *testing.T) { + var calls []string + cl := fake.NewClientBuilder().WithScheme(scheme).WithInterceptorFuncs(interceptor.Funcs{ + DeleteAllOf: func(_ context.Context, _ client.WithWatch, obj client.Object, _ ...client.DeleteAllOfOption) error { + switch obj.(type) { + case *resourcev1.ResourceClaim: + calls = append(calls, resourcev1.SchemeGroupVersion.String()) + return resourceClaimNoKindMatch(resourcev1.SchemeGroupVersion.Version) + case *resourcev1beta2.ResourceClaim: + calls = append(calls, resourcev1beta2.SchemeGroupVersion.String()) + return nil + default: + return fmt.Errorf("unexpected delete-all object type %T", obj) + } + }, + }).Build() + + err := DeleteResourceClaims(context.Background(), cl, + client.InNamespace("default"), + client.MatchingLabels(ResourceClaimLabels("my-pcs")), + ) + require.NoError(t, err) + assert.Equal(t, []string{ + resourcev1.SchemeGroupVersion.String(), + resourcev1beta2.SchemeGroupVersion.String(), + }, calls) + }) +} + +func TestListResourceClaimMetadata(t *testing.T) { + scheme := newTestScheme() + + t.Run("falls back to v1beta2 when v1 API is not served", func(t *testing.T) { + var calls []string + cl := fake.NewClientBuilder().WithScheme(scheme).WithInterceptorFuncs(interceptor.Funcs{ + List: func(_ context.Context, _ client.WithWatch, list client.ObjectList, _ ...client.ListOption) error { + gvk := list.GetObjectKind().GroupVersionKind() + calls = append(calls, gvk.GroupVersion().String()) + if gvk.Version == resourcev1.SchemeGroupVersion.Version { + return resourceClaimNoKindMatch(resourcev1.SchemeGroupVersion.Version) + } + if gvk.Version != resourcev1beta2.SchemeGroupVersion.Version { + return fmt.Errorf("unexpected list version %q", gvk.Version) + } + + objMetaList, ok := list.(*metav1.PartialObjectMetadataList) + if !ok { + return fmt.Errorf("unexpected list object type %T", list) + } + objMetaList.Items = []metav1.PartialObjectMetadata{ + {ObjectMeta: metav1.ObjectMeta{Name: "my-rc", Namespace: "default"}}, + } + return nil + }, + }).Build() + + objMetaList, err := ListResourceClaimMetadata(context.Background(), cl, client.InNamespace("default")) + require.NoError(t, err) + require.Len(t, objMetaList.Items, 1) + assert.Equal(t, "my-rc", objMetaList.Items[0].Name) + assert.Equal(t, []string{ + resourcev1.SchemeGroupVersion.String(), + resourcev1beta2.SchemeGroupVersion.String(), + }, calls) + }) } // --- CleanupStalePerReplicaRCs --- @@ -559,3 +658,10 @@ func TestCleanupStalePerReplicaRCs(t *testing.T) { // Ensure ptr.To works for tests that need int pointer var _ = ptr.To(0) + +func resourceClaimNoKindMatch(version string) error { + return &meta.NoKindMatchError{ + GroupKind: resourcev1.SchemeGroupVersion.WithKind(resourceClaimKind).GroupKind(), + SearchedVersions: []string{version}, + } +} diff --git a/operator/internal/resourceclaim/resolve_test.go b/operator/internal/resourceclaim/resolve_test.go index 1712e54f5..b5aa1b8f7 100644 --- a/operator/internal/resourceclaim/resolve_test.go +++ b/operator/internal/resourceclaim/resolve_test.go @@ -23,6 +23,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" resourcev1 "k8s.io/api/resource/v1" + resourcev1beta2 "k8s.io/api/resource/v1beta2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -32,6 +33,7 @@ func newTestScheme() *runtime.Scheme { scheme := runtime.NewScheme() _ = grovecorev1alpha1.AddToScheme(scheme) _ = resourcev1.AddToScheme(scheme) + _ = resourcev1beta2.AddToScheme(scheme) return scheme }