From 4e3bb71618f1fac47a41ba749b31f47216f3cdcf Mon Sep 17 00:00:00 2001 From: Ruinan Liu Date: Fri, 13 Feb 2026 20:21:25 -0800 Subject: [PATCH 1/2] Initial commit for cross reconciler refernce deletion group --- internal/resource/cache.go | 8 +-- internal/resource/cache_test.go | 108 ++++++++++++++++++++++++++++++++ internal/resource/tree.go | 18 ++++-- internal/resource/tree_test.go | 20 +++--- 4 files changed, 133 insertions(+), 21 deletions(-) diff --git a/internal/resource/cache.go b/internal/resource/cache.go index 81138d58..fd76b1f7 100644 --- a/internal/resource/cache.go +++ b/internal/resource/cache.go @@ -109,7 +109,7 @@ func (c *Cache) Fill(ctx context.Context, comp *apiv1.Composition, synUUID strin logger.Error(err, "invalid resource - cannot load into cache", "resourceSliceName", slice.Name, "resourceIndex", i) return } - + isGhostResource := false if c.ResourceFilter != nil { matches, err := c.evaluateResourceFilter(ctx, comp, res) if err != nil { @@ -117,12 +117,10 @@ func (c *Cache) Fill(ctx context.Context, comp *apiv1.Composition, synUUID strin resourceFilterErrors.Inc() continue } - if !matches { - continue - } + isGhostResource = !matches } - builder.Add(res) + builder.Add(res, isGhostResource) } } tree := builder.Build() diff --git a/internal/resource/cache_test.go b/internal/resource/cache_test.go index 7526ffd8..a8b3bdf2 100644 --- a/internal/resource/cache_test.go +++ b/internal/resource/cache_test.go @@ -413,6 +413,114 @@ func TestCacheResourceFilterAlwaysTrue(t *testing.T) { assert.True(t, found) } +// TestCacheGhostResourceCrossReconcilerDeletionOrder simulates two eno-reconcilers (A and B) +// sharing the same composition. Reconciler A manages a CRD-like resource (deletion-group -1) +// and reconciler B manages a Deployment-like resource (deletion-group 0). +// The test proves that B's resource is blocked until A's resource is confirmed deleted, +// even though A's resource is a ghost (filtered out) in B's cache. 
+func TestCacheGhostResourceCrossReconcilerDeletionOrder(t *testing.T) { + ctx := context.Background() + + // Shared composition with DeletionTimestamp set to activate deletion-group logic + now := metav1.Now() + comp := &apiv1.Composition{} + comp.Name = "test-comp" + comp.Namespace = "test-ns" + comp.DeletionTimestamp = &now + + // Two resources in the same slice: + // - "crd-resource" with deletion-group -1, label role=crd (managed by reconciler A) + // - "deployment-resource" with deletion-group 0, label role=deployment (managed by reconciler B) + slices := []apiv1.ResourceSlice{{ + ObjectMeta: metav1.ObjectMeta{Name: "slice-1", Namespace: "test-ns"}, + Spec: apiv1.ResourceSliceSpec{ + Resources: []apiv1.Manifest{ + {Manifest: `{"apiVersion": "v1", "kind": "ConfigMap", "metadata": {"name": "crd-resource", "namespace": "default", "labels": {"role": "crd"}, "annotations": {"eno.azure.io/deletion-group": "-1"}}}`}, + {Manifest: `{"apiVersion": "v1", "kind": "ConfigMap", "metadata": {"name": "deployment-resource", "namespace": "default", "labels": {"role": "deployment"}, "annotations": {"eno.azure.io/deletion-group": "0"}}}`}, + }, + }, + }} + + const synUUID = "test-syn" + crdRef := Ref{Name: "crd-resource", Namespace: "default", Kind: "ConfigMap"} + deployRef := Ref{Name: "deployment-resource", Namespace: "default", Kind: "ConfigMap"} + + // --- Reconciler A: manages role=crd resources --- + filterA, err := enocel.Parse("self.metadata.labels.role == 'crd'") + require.NoError(t, err) + var cacheA Cache + queueA := workqueue.NewTypedRateLimitingQueue(workqueue.DefaultTypedControllerRateLimiter[Request]()) + cacheA.SetQueue(queueA) + cacheA.ResourceFilter = filterA + + cacheA.Fill(ctx, comp, synUUID, slices) + cacheA.Visit(ctx, comp, synUUID, slices) + dumpQueue(queueA) // drain + + // Reconciler A can see the crd-resource (it's its own) + res, visible, found := cacheA.Get(ctx, synUUID, crdRef) + assert.NotNil(t, res, "reconciler A should find its own resource") + 
assert.True(t, found) + assert.True(t, visible, "crd-resource (deletion-group -1) has no dependencies, should be visible") + + // Reconciler A cannot see deployment-resource (it's a ghost) + res, visible, found = cacheA.Get(ctx, synUUID, deployRef) + assert.Nil(t, res) + assert.False(t, found, "deployment-resource is a ghost in reconciler A's cache") + assert.False(t, visible) + + // --- Reconciler B: manages role=deployment resources --- + filterB, err := enocel.Parse("self.metadata.labels.role == 'deployment'") + require.NoError(t, err) + var cacheB Cache + queueB := workqueue.NewTypedRateLimitingQueue(workqueue.DefaultTypedControllerRateLimiter[Request]()) + cacheB.SetQueue(queueB) + cacheB.ResourceFilter = filterB + + cacheB.Fill(ctx, comp, synUUID, slices) + cacheB.Visit(ctx, comp, synUUID, slices) + dumpQueue(queueB) // drain + + // Reconciler B cannot see crd-resource (it's a ghost) + res, visible, found = cacheB.Get(ctx, synUUID, crdRef) + assert.Nil(t, res) + assert.False(t, found, "crd-resource is a ghost in reconciler B's cache") + assert.False(t, visible) + + // Reconciler B can find deployment-resource but it should NOT be visible yet + // because it depends on crd-resource (deletion-group -1 < 0) which hasn't been deleted + res, visible, found = cacheB.Get(ctx, synUUID, deployRef) + assert.NotNil(t, res, "reconciler B should find its own resource") + assert.True(t, found) + assert.False(t, visible, "deployment-resource should be blocked by ghost crd-resource in deletion-group -1") + + // --- Simulate reconciler A deleting the crd-resource --- + // Reconciler A writes Deleted: true to the resource slice status for the crd-resource (index 0) + slicesWithStatus := []apiv1.ResourceSlice{{ + ObjectMeta: metav1.ObjectMeta{Name: "slice-1", Namespace: "test-ns"}, + Spec: slices[0].Spec, + Status: apiv1.ResourceSliceStatus{ + Resources: []apiv1.ResourceState{ + {Deleted: true}, // crd-resource at index 0 is deleted + {}, // deployment-resource at index 1 is 
not yet deleted + }, + }, + }} + + // Reconciler B's informer picks up the status change and calls Visit + cacheB.Visit(ctx, comp, synUUID, slicesWithStatus) + enqueuedB := dumpQueue(queueB) + + // The deployment-resource should have been enqueued because its dependency was cleared + assert.Contains(t, enqueuedB, deployRef.String(), "deployment-resource should be enqueued after ghost dependency is cleared") + + // Now deployment-resource should be visible — the ghost crd-resource dependency is satisfied + res, visible, found = cacheB.Get(ctx, synUUID, deployRef) + assert.NotNil(t, res) + assert.True(t, found) + assert.True(t, visible, "deployment-resource should now be visible after crd-resource ghost is deleted") +} + func dumpQueue(q workqueue.TypedRateLimitingInterface[Request]) (slice []string) { for { if q.Len() == 0 { diff --git a/internal/resource/tree.go b/internal/resource/tree.go index c5d78170..9dfc5dd6 100644 --- a/internal/resource/tree.go +++ b/internal/resource/tree.go @@ -10,8 +10,11 @@ import ( ) type indexedResource struct { - Resource *Resource - Seen bool + Resource *Resource + Seen bool + // If GhostResource true it means it is being managed by other eno-reconciler, but only participate in dependency graph to ensure + // that the ordering (readiness-group/deletion-group works as expected) + GhostResource bool PendingDependencies map[Ref]struct{} Dependents map[Ref]*indexedResource } @@ -54,7 +57,7 @@ func (b *treeBuilder) init() { } } -func (b *treeBuilder) Add(resource *Resource) { +func (b *treeBuilder) Add(resource *Resource, ghostResource bool) { b.init() // Handle conflicting refs deterministically @@ -65,6 +68,7 @@ func (b *treeBuilder) Add(resource *Resource) { // Index the resource into the builder idx := &indexedResource{ Resource: resource, + GhostResource: ghostResource, PendingDependencies: map[Ref]struct{}{}, Dependents: map[Ref]*indexedResource{}, } @@ -136,7 +140,7 @@ type tree struct { // Get returns the resource and determines 
if it's visible based on the state of its dependencies. func (t *tree) Get(key Ref) (res *Resource, visible bool, found bool) { idx, ok := t.byRef[key] - if !ok { + if !ok || idx.GhostResource { // don't need to act on ghost resource as it is managed by other eno-reconciler return nil, false, false } //TODO: debug logging on what we're blocked on might help future issues. @@ -152,8 +156,10 @@ func (t *tree) UpdateState(ref ManifestRef, state *apiv1.ResourceState, enqueue // Requeue self when the state has changed lastKnown := idx.Resource.latestKnownState.Swap(state) - if (!idx.Seen && lastKnown == nil) || !lastKnown.Equal(state) { - enqueue(idx.Resource.Ref) + if !idx.GhostResource { + if (!idx.Seen && lastKnown == nil) || !lastKnown.Equal(state) { + enqueue(idx.Resource.Ref) + } } idx.Seen = true diff --git a/internal/resource/tree_test.go b/internal/resource/tree_test.go index c54bbad9..f48b6352 100644 --- a/internal/resource/tree_test.go +++ b/internal/resource/tree_test.go @@ -165,7 +165,7 @@ func TestTreeBuilderSanity(t *testing.T) { t.Run(tc.Name, func(t *testing.T) { var b treeBuilder for _, r := range tc.Resources { - b.Add(r) + b.Add(r, false) } tree := b.Build() @@ -199,22 +199,22 @@ func TestTreeVisibility(t *testing.T) { Ref: newTestRef("test-resource-4"), readinessGroup: 4, ManifestRef: ManifestRef{Index: 4}, - }) + }, false) b.Add(&Resource{ Ref: newTestRef("test-resource-1"), readinessGroup: 1, ManifestRef: ManifestRef{Index: 1}, - }) + }, false) b.Add(&Resource{ Ref: newTestRef("test-resource-3"), readinessGroup: 3, ManifestRef: ManifestRef{Index: 3}, - }) + }, false) b.Add(&Resource{ Ref: newTestRef("test-resource-2"), readinessGroup: 2, ManifestRef: ManifestRef{Index: 2}, - }) + }, false) names := []string{"test-resource-1", "test-resource-2", "test-resource-3", "test-resource-4"} tree := b.Build() @@ -284,21 +284,21 @@ func TestTreeDeletion(t *testing.T) { ManifestRef: ManifestRef{Index: 1}, parsed: &unstructured.Unstructured{}, 
compositionDeleted: true, - }) + }, false) b.Add(&Resource{ Ref: newTestRef("test-resource-3"), readinessGroup: 3, ManifestRef: ManifestRef{Index: 3}, parsed: &unstructured.Unstructured{}, compositionDeleted: true, - }) + }, false) b.Add(&Resource{ Ref: newTestRef("test-resource-2"), readinessGroup: 2, ManifestRef: ManifestRef{Index: 2}, parsed: &unstructured.Unstructured{}, compositionDeleted: true, - }) + }, false) tree := b.Build() // All resources are seen, but only one is ready @@ -331,11 +331,11 @@ func TestTreeRefConflicts(t *testing.T) { b.Add(&Resource{ Ref: newTestRef("test-resource"), manifestHash: []byte("b"), - }) + }, false) b.Add(&Resource{ Ref: newTestRef("test-resource"), manifestHash: []byte("a"), - }) + }, false) tree := b.Build() res, visible, found := tree.Get(newTestRef("test-resource")) From c68ede864cf956677442d41d310afa95f1d58f98 Mon Sep 17 00:00:00 2001 From: Ruinan Liu Date: Sun, 15 Feb 2026 13:59:56 -0800 Subject: [PATCH 2/2] update --- .../controllers/reconciliation/controller.go | 38 ++++++++- .../reconciliation/controller_test.go | 81 +++++++++++++++++++ 2 files changed, 115 insertions(+), 4 deletions(-) diff --git a/internal/controllers/reconciliation/controller.go b/internal/controllers/reconciliation/controller.go index 7959c4ae..59e29a4d 100644 --- a/internal/controllers/reconciliation/controller.go +++ b/internal/controllers/reconciliation/controller.go @@ -165,10 +165,7 @@ func (c *Controller) Reconcile(ctx context.Context, req resource.Request) (ctrl. return ctrl.Result{Requeue: true}, nil } - deleted := current == nil || - (current.GetDeletionTimestamp() != nil && !snap.ForegroundDeletion) || - (snap.Deleted() && (snap.Orphan || snap.Disable || failingOpen)) // orphaning should be reflected on the status. 
- + deleted := shouldMarkResourceDeleted(snap, current, failingOpen) c.writeBuffer.PatchStatusAsync(ctx, &resource.ManifestRef, patchResourceState(deleted, ready)) return c.requeue(logger, snap, ready) } @@ -177,6 +174,39 @@ func (c *Controller) shouldFailOpen(resource *resource.Resource) bool { return (resource.FailOpen == nil && c.failOpen) || (resource.FailOpen != nil && *resource.FailOpen) } +// shouldMarkResourceDeleted returns true when the resource should be treated +// as deleted for status-reporting purposes. +func shouldMarkResourceDeleted(snap *resource.Snapshot, current *unstructured.Unstructured, failingOpen bool) bool { + // Resource does not exist - marking as deleted for sure. + if current == nil { + return true + } + + // Resource is being deleted by the kube-apiserver (deletionTimestamp set). + // For foreground deletion we must wait until the object is fully gone, + // so we only report deleted here for non-foreground cases. + if current.GetDeletionTimestamp() != nil && !snap.ForegroundDeletion { + return true + } + + // Orphan & Disabled should always be reflected on the status. + // FailOpen should also be reflected — *unless* we are doing a foreground + // deletion that is still in progress, because marking a resource deleted + // prematurely would break the foreground ordering guarantee. + if snap.Deleted() { + if snap.Orphan || snap.Disable { + return true + } + + if failingOpen { + // Do not prematurely report deleted during an active foreground deletion. 
+ pendingForegroundDeletion := snap.ForegroundDeletion && current.GetDeletionTimestamp() != nil + return !pendingForegroundDeletion + } + } + return false +} + func (c *Controller) reconcileResource(ctx context.Context, comp *apiv1.Composition, prev *resource.Resource, resource *resource.Resource) (snap *resource.Snapshot, current *unstructured.Unstructured, ready *metav1.Time, modified bool, err error) { logger := logr.FromContextOrDiscard(ctx) diff --git a/internal/controllers/reconciliation/controller_test.go b/internal/controllers/reconciliation/controller_test.go index b8653211..8aae8f36 100644 --- a/internal/controllers/reconciliation/controller_test.go +++ b/internal/controllers/reconciliation/controller_test.go @@ -15,6 +15,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" @@ -249,3 +250,83 @@ func TestRequeueDoesNotPanic(t *testing.T) { }). 
 		Evaluate(t)
 }
+
+func TestShouldMarkResourceDeleted(t *testing.T) {
+	now := metav1.Now()
+	existing := &unstructured.Unstructured{}
+	deleting := &unstructured.Unstructured{}
+	deleting.SetDeletionTimestamp(&now)
+
+	tests := []struct {
+		name       string
+		snap       *resource.Snapshot
+		current    *unstructured.Unstructured
+		failOpen   bool
+		wantDelete bool
+	}{
+		{
+			name:       "current is nil — resource does not exist",
+			snap:       &resource.Snapshot{Resource: &resource.Resource{}},
+			current:    nil,
+			wantDelete: true,
+		},
+		{
+			name:       "deletionTimestamp set, non-foreground",
+			snap:       &resource.Snapshot{Resource: &resource.Resource{}},
+			current:    deleting,
+			wantDelete: true,
+		},
+		{
+			name:       "deletionTimestamp set, foreground — wait for full removal",
+			snap:       &resource.Snapshot{Resource: &resource.Resource{}, ForegroundDeletion: true},
+			current:    deleting,
+			wantDelete: false,
+		},
+		{
+			name:       "deleted + orphan — always report deleted",
+			snap:       &resource.Snapshot{Resource: &resource.Resource{}, Disable: true, Orphan: true},
+			current:    existing,
+			wantDelete: true,
+		},
+		{
+			name:       "deleted + disable — always report deleted",
+			snap:       &resource.Snapshot{Resource: &resource.Resource{}, Disable: true},
+			current:    existing,
+			wantDelete: true,
+		},
+		{
+			name:       "deleted + failOpen, no foreground — report deleted",
+			snap:       &resource.Snapshot{Resource: &resource.Resource{}, Disable: true},
+			current:    existing,
+			failOpen:   true,
+			wantDelete: true,
+		},
+		{
+			name:       "deleted + disable + foreground + deletionTimestamp — disable takes precedence, still reported deleted",
+			snap:       &resource.Snapshot{Resource: &resource.Resource{}, Disable: true, ForegroundDeletion: true},
+			current:    deleting,
+			failOpen:   true,
+			wantDelete: true, // Disable short-circuits before the failOpen/foreground guard is evaluated. NOTE(review): the failOpen+foreground "do not prematurely report deleted" branch is NOT exercised by this table — it needs snap.Deleted() true without Orphan/Disable; add such a case if constructible.
+		},
+		{
+			name:       "not deleted, resource exists — no deletion",
+			snap:       &resource.Snapshot{Resource: &resource.Resource{}},
+			current:    existing,
+			wantDelete: false,
+		},
+		{
+			name:       "not deleted, resource exists, failOpen — no deletion",
+			snap:       &resource.Snapshot{Resource: &resource.Resource{}},
+			current:    existing,
+			failOpen:   true,
+			wantDelete: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := shouldMarkResourceDeleted(tt.snap, tt.current, tt.failOpen)
+			assert.Equal(t, tt.wantDelete, got)
+		})
+	}
+}