From 13093b26afc7cf080b9ab85f4937f70e9babda84 Mon Sep 17 00:00:00 2001 From: Tiger Kaovilai Date: Mon, 11 Aug 2025 12:19:12 -0500 Subject: [PATCH 1/4] Call WaitGroup.Done() once only when PVB changes to final status the first time to avoid panic Prevents calling WaitGroup.Done() multiple times for the same PodVolumeBackup by tracking processed PVBs. This avoids "negative WaitGroup counter" panic errors when the handler receives multiple update events with the PVB already in final status. Based on upstream commit 2eb97fa8b187f9ed0aeb49f216565eddf93a0b08 Adapted for OADP 1.4 without pvbIndexer infrastructure Signed-off-by: Tiger Kaovilai --- pkg/podvolume/backupper.go | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/pkg/podvolume/backupper.go b/pkg/podvolume/backupper.go index 8d06cb2225..1cc3819726 100644 --- a/pkg/podvolume/backupper.go +++ b/pkg/podvolume/backupper.go @@ -58,6 +58,7 @@ type backupper struct { handlerRegistration cache.ResourceEventHandlerRegistration wg sync.WaitGroup result []*velerov1api.PodVolumeBackup + processedPVBs sync.Map // tracks PVBs that have already been processed to avoid calling Done() multiple times } type skippedPVC struct { @@ -109,14 +110,15 @@ func newBackupper( backup *velerov1api.Backup, ) *backupper { b := &backupper{ - ctx: ctx, - repoLocker: repoLocker, - repoEnsurer: repoEnsurer, - crClient: crClient, - uploaderType: uploaderType, - pvbInformer: pvbInformer, - wg: sync.WaitGroup{}, - result: []*velerov1api.PodVolumeBackup{}, + ctx: ctx, + repoLocker: repoLocker, + repoEnsurer: repoEnsurer, + crClient: crClient, + uploaderType: uploaderType, + pvbInformer: pvbInformer, + wg: sync.WaitGroup{}, + result: []*velerov1api.PodVolumeBackup{}, + processedPVBs: sync.Map{}, } b.handlerRegistration, _ = pvbInformer.AddEventHandler( @@ -133,8 +135,16 @@ func newBackupper( return } - b.result = append(b.result, pvb) - b.wg.Done() + // Generate a unique key for this PVB + pvbKey := pvb.Namespace + "/" + pvb.Name + + // Check if we've already processed this PVB in final status + // This prevents calling WaitGroup.Done() multiple times for the same PVB + // which could happen if the handler receives multiple update events with the PVB already in final status + if _, alreadyProcessed := b.processedPVBs.LoadOrStore(pvbKey, true); !alreadyProcessed { + b.result = append(b.result, pvb) + b.wg.Done() + } }, }, ) From ce7031e0d360fb4a3d632fde2151116b25764554 Mon Sep 17 00:00:00 2001 From: Tiger Kaovilai Date: Mon, 11 Aug 2025 12:36:38 -0500 Subject: [PATCH 2/4] Update Go version from 1.22.5 to 1.23.0 Updates Go version across all Dockerfiles and CI workflows to align with upstream commit 874388bd3e34bbcfe955f624b8e3579d959132fc. This update was missed in previous CVE-related PRs. Updated files: - Dockerfile (2 occurrences) - hack/build-image/Dockerfile - .github/workflows/pr-ci-check.yml - .github/workflows/push.yml - .github/workflows/e2e-test-kind.yaml (2 occurrences) - .github/workflows/crds-verify-kind.yaml - Tiltfile Signed-off-by: Tiger Kaovilai Co-authored-by: Claude --- .github/workflows/crds-verify-kind.yaml | 2 +- .github/workflows/e2e-test-kind.yaml | 4 ++-- .github/workflows/pr-ci-check.yml | 2 +- .github/workflows/push.yml | 2 +- Dockerfile | 4 ++-- Tiltfile | 2 +- hack/build-image/Dockerfile | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/crds-verify-kind.yaml b/.github/workflows/crds-verify-kind.yaml index 13b6fbbd26..15260aa2bf 100644 --- a/.github/workflows/crds-verify-kind.yaml +++ b/.github/workflows/crds-verify-kind.yaml @@ -14,7 +14,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - go-version: '1.22.5' + go-version: '1.23.0' id: go # Look for a CLI that's made for this PR - name: Fetch built CLI diff --git a/.github/workflows/e2e-test-kind.yaml b/.github/workflows/e2e-test-kind.yaml index f71f2c5630..fd4e17a24f 100644 --- a/.github/workflows/e2e-test-kind.yaml +++ b/.github/workflows/e2e-test-kind.yaml @@ -14,7 +14,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - go-version: '1.22.5' + go-version: '1.23.0' id: go # Look for a CLI that's made for this PR - name: Fetch built CLI @@ -82,7 +82,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - go-version: '1.22.5' + go-version: '1.23.0' id: go - name: Check out the code uses: actions/checkout@v4 diff --git a/.github/workflows/pr-ci-check.yml b/.github/workflows/pr-ci-check.yml index bbf9fb1dd3..2e7259d446 100644 --- a/.github/workflows/pr-ci-check.yml +++ b/.github/workflows/pr-ci-check.yml @@ -10,7 +10,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - go-version: '1.22.5' + go-version: '1.23.0' id: go - name: Check out the code uses: actions/checkout@v4 diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 9a19477384..287e580de4 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -18,7 +18,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - go-version: '1.22.5' + go-version: '1.23.0' id: go - uses: actions/checkout@v4 diff --git a/Dockerfile b/Dockerfile index 7c5905caeb..b52d06fb56 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. # Velero binary build section -FROM --platform=$BUILDPLATFORM golang:1.22.5-bookworm as velero-builder +FROM --platform=$BUILDPLATFORM golang:1.23.0-bookworm as velero-builder ARG GOPROXY ARG BIN @@ -47,7 +47,7 @@ RUN mkdir -p /output/usr/bin && \ go clean -modcache -cache # Restic binary build section -FROM --platform=$BUILDPLATFORM golang:1.22.5-bookworm as restic-builder +FROM --platform=$BUILDPLATFORM golang:1.23.0-bookworm as restic-builder ARG BIN ARG TARGETOS diff --git a/Tiltfile b/Tiltfile index 6be5057f08..c2fc9dd389 100644 --- a/Tiltfile +++ b/Tiltfile @@ -52,7 +52,7 @@ git_sha = str(local("git rev-parse HEAD", quiet = True, echo_off = True)).strip( tilt_helper_dockerfile_header = """ # Tilt image -FROM golang:1.22.5 as tilt-helper +FROM golang:1.23.0 as tilt-helper # Support live reloading with Tilt RUN wget --output-document /restart.sh --quiet https://raw.githubusercontent.com/windmilleng/rerun-process-wrapper/master/restart.sh && \ diff --git a/hack/build-image/Dockerfile b/hack/build-image/Dockerfile index 7521f72450..650d0e4e72 100644 --- a/hack/build-image/Dockerfile +++ b/hack/build-image/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM --platform=$TARGETPLATFORM golang:1.22.5-bookworm +FROM --platform=$TARGETPLATFORM golang:1.23.0-bookworm ARG GOPROXY From 15ce3f8a16e9f6f614fc5ad4e17009068276ba05 Mon Sep 17 00:00:00 2001 From: Tiger Kaovilai Date: Thu, 14 Aug 2025 09:41:12 -0500 Subject: [PATCH 3/4] Fix WaitGroup handling to avoid panic by calling Done() only when PVB status changes to final for the first time Signed-off-by: Tiger Kaovilai --- pkg/podvolume/backupper.go | 44 ++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/pkg/podvolume/backupper.go b/pkg/podvolume/backupper.go index 1cc3819726..a4c80b9144 100644 --- a/pkg/podvolume/backupper.go +++ b/pkg/podvolume/backupper.go @@ -58,7 +58,6 @@ type backupper struct { handlerRegistration cache.ResourceEventHandlerRegistration wg sync.WaitGroup result []*velerov1api.PodVolumeBackup - processedPVBs sync.Map // tracks PVBs that have already been processed to avoid calling Done() multiple times } type skippedPVC struct { @@ -110,21 +109,20 @@ func newBackupper( backup *velerov1api.Backup, ) *backupper { b := &backupper{ - ctx: ctx, - repoLocker: repoLocker, - repoEnsurer: repoEnsurer, - crClient: crClient, - uploaderType: uploaderType, - pvbInformer: pvbInformer, - wg: sync.WaitGroup{}, - result: []*velerov1api.PodVolumeBackup{}, - processedPVBs: sync.Map{}, + ctx: ctx, + repoLocker: repoLocker, + repoEnsurer: repoEnsurer, + crClient: crClient, + uploaderType: uploaderType, + pvbInformer: pvbInformer, + wg: sync.WaitGroup{}, + result: []*velerov1api.PodVolumeBackup{}, } b.handlerRegistration, _ = pvbInformer.AddEventHandler( cache.ResourceEventHandlerFuncs{ - UpdateFunc: func(_, obj interface{}) { - pvb := obj.(*velerov1api.PodVolumeBackup) + UpdateFunc: func(oldObj, newObj interface{}) { + pvb := newObj.(*velerov1api.PodVolumeBackup) if pvb.GetLabels()[velerov1api.BackupUIDLabel] != string(backup.UID) { return @@ -135,14 +133,22 @@ func newBackupper( return } - // Generate a unique key for this PVB - pvbKey := pvb.Namespace + "/" + pvb.Name + // Check if the previous state was already in a final status + statusChangedToFinal := true + if oldPvb, ok := oldObj.(*velerov1api.PodVolumeBackup); ok { + // If the PVB was already in a final status, no need to call WaitGroup.Done() + if oldPvb.Status.Phase == velerov1api.PodVolumeBackupPhaseCompleted || + oldPvb.Status.Phase == velerov1api.PodVolumeBackupPhaseFailed { + statusChangedToFinal = false + } + } + + b.result = append(b.result, pvb) - // Check if we've already processed this PVB in final status - // This prevents calling WaitGroup.Done() multiple times for the same PVB - // which could happen if the handler receives multiple update events with the PVB already in final status - if _, alreadyProcessed := b.processedPVBs.LoadOrStore(pvbKey, true); !alreadyProcessed { - b.result = append(b.result, pvb) + // Call WaitGroup.Done() once only when the PVB changes to final status the first time. + // This avoids the cases where the handler gets multiple update events whose PVBs are all in final status + // which causes panic with "negative WaitGroup counter" error + if statusChangedToFinal { b.wg.Done() } }, From bdb22478606fe994cabe0c16f30f81c6e7dbbbb9 Mon Sep 17 00:00:00 2001 From: Tiger Kaovilai Date: Thu, 14 Aug 2025 09:41:57 -0500 Subject: [PATCH 4/4] Add unit test for WaitGroup logic in Backupper event handler Signed-off-by: Tiger Kaovilai --- pkg/podvolume/backupper.go | 2 +- pkg/podvolume/backupper_test.go | 140 ++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 1 deletion(-) diff --git a/pkg/podvolume/backupper.go b/pkg/podvolume/backupper.go index a4c80b9144..10524dcb43 100644 --- a/pkg/podvolume/backupper.go +++ b/pkg/podvolume/backupper.go @@ -144,7 +144,7 @@ func newBackupper( } b.result = append(b.result, pvb) - + // Call WaitGroup.Done() once only when the PVB changes to final status the first time. // This avoids the cases where the handler gets multiple update events whose PVBs are all in final status // which causes panic with "negative WaitGroup counter" error diff --git a/pkg/podvolume/backupper_test.go b/pkg/podvolume/backupper_test.go index ade7274df8..4d76abeedf 100644 --- a/pkg/podvolume/backupper_test.go +++ b/pkg/podvolume/backupper_test.go @@ -678,3 +678,143 @@ func TestPVCBackupSummary(t *testing.T) { assert.Empty(t, pbs.Skipped) assert.Len(t, pbs.Backedup, 2) } + +func TestBackupperEventHandlerWaitGroupLogic(t *testing.T) { + scheme := runtime.NewScheme() + velerov1api.AddToScheme(scheme) + corev1api.AddToScheme(scheme) + + tests := []struct { + name string + oldPVBStatus *velerov1api.PodVolumeBackupStatus + newPVBStatus *velerov1api.PodVolumeBackupStatus + expectedDoneCalls int + description string + }{ + { + name: "first transition to completed status", + oldPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseInProgress}, + newPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseCompleted}, + expectedDoneCalls: 1, + description: "WaitGroup.Done() should be called once when transitioning to completed", + }, + { + name: "first transition to failed status", + oldPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseInProgress}, + newPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseFailed}, + expectedDoneCalls: 1, + description: "WaitGroup.Done() should be called once when transitioning to failed", + }, + { + name: "multiple updates with completed status", + oldPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseCompleted}, + newPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseCompleted}, + expectedDoneCalls: 0, + description: "WaitGroup.Done() should NOT be called when PVB is already completed", + }, + { + name: "multiple updates with failed status", + oldPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseFailed}, + newPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseFailed}, + expectedDoneCalls: 0, + description: "WaitGroup.Done() should NOT be called when PVB is already failed", + }, + { + name: "transition from new to completed", + oldPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseNew}, + newPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseCompleted}, + expectedDoneCalls: 1, + description: "WaitGroup.Done() should be called when transitioning from new to completed", + }, + { + name: "transition from new to failed", + oldPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseNew}, + newPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseFailed}, + expectedDoneCalls: 1, + description: "WaitGroup.Done() should be called when transitioning from new to failed", + }, + { + name: "no transition for non-final status", + oldPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseNew}, + newPVBStatus: &velerov1api.PodVolumeBackupStatus{Phase: velerov1api.PodVolumeBackupPhaseInProgress}, + expectedDoneCalls: 0, + description: "WaitGroup.Done() should NOT be called for non-final status transitions", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // Create backup with UID for label matching + backup := &velerov1api.Backup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-backup", + Namespace: velerov1api.DefaultNamespace, + UID: "test-backup-uid", + }, + } + + // Create old and new PVB objects + oldPVB := &velerov1api.PodVolumeBackup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pvb", + Namespace: velerov1api.DefaultNamespace, + Labels: map[string]string{ + velerov1api.BackupUIDLabel: string(backup.UID), + }, + }, + Status: *test.oldPVBStatus, + } + + newPVB := &velerov1api.PodVolumeBackup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pvb", + Namespace: velerov1api.DefaultNamespace, + Labels: map[string]string{ + velerov1api.BackupUIDLabel: string(backup.UID), + }, + }, + Status: *test.newPVBStatus, + } + + // Track Done() calls using a counter + doneCallCount := 0 + + // Create a custom UpdateFunc that mirrors the actual implementation + updateFunc := func(oldObj, newObj interface{}) { + pvb := newObj.(*velerov1api.PodVolumeBackup) + + if pvb.GetLabels()[velerov1api.BackupUIDLabel] != string(backup.UID) { + return + } + + if pvb.Status.Phase != velerov1api.PodVolumeBackupPhaseCompleted && + pvb.Status.Phase != velerov1api.PodVolumeBackupPhaseFailed { + return + } + + // Check if the previous state was already in a final status + statusChangedToFinal := true + if oldPvb, ok := oldObj.(*velerov1api.PodVolumeBackup); ok { + // If the PVB was already in a final status, no need to call WaitGroup.Done() + if oldPvb.Status.Phase == velerov1api.PodVolumeBackupPhaseCompleted || + oldPvb.Status.Phase == velerov1api.PodVolumeBackupPhaseFailed { + statusChangedToFinal = false + } + } + + // Call WaitGroup.Done() once only when the PVB changes to final status the first time. + // This avoids the cases where the handler gets multiple update events whose PVBs are all in final status + // which causes panic with "negative WaitGroup counter" error + if statusChangedToFinal { + doneCallCount++ + } + } + + // Test the UpdateFunc directly + updateFunc(oldPVB, newPVB) + + // Verify the expected number of Done() calls + assert.Equal(t, test.expectedDoneCalls, doneCallCount, test.description) + }) + } +}