From 44c2fb24aa1c940edcb2af13ec93b64b2098d849 Mon Sep 17 00:00:00 2001 From: Rafael Benevides Date: Mon, 9 Feb 2026 10:45:37 -0300 Subject: [PATCH 1/2] HYPERFLEET-625 - fix: Add timeout to testcontainer teardown to prevent Prow hang When integration tests fail with a panic, the process continues to the teardown phase where container.Terminate() is called with no timeout. If the Docker container termination hangs, the process never exits, causing the Prow CI job to stay stuck in "pending" state indefinitely. Add a 30-second timeout context to Testcontainer.Close() so the teardown always completes, allowing the process to exit and Prow to report the test failure status back to GitHub. --- pkg/db/db_session/testcontainer.go | 6 +++++- test/helper.go | 19 ++++++++++++++++--- test/integration/integration_test.go | 16 ++++++++++++++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/pkg/db/db_session/testcontainer.go b/pkg/db/db_session/testcontainer.go index 8ea19c9..903c8f3 100755 --- a/pkg/db/db_session/testcontainer.go +++ b/pkg/db/db_session/testcontainer.go @@ -147,7 +147,11 @@ func (f *Testcontainer) CheckConnection() error { } func (f *Testcontainer) Close() error { - ctx := context.Background() + // Use a timeout to prevent hanging indefinitely during teardown. + // Without this, a hung container.Terminate() would block the process from + // exiting, causing Prow CI jobs to stay in "pending" state (HYPERFLEET-625). + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() // Close SQL connection if f.sqlDB != nil { diff --git a/test/helper.go b/test/helper.go index 19ba246..75d7b1b 100755 --- a/test/helper.go +++ b/test/helper.go @@ -108,11 +108,17 @@ func NewHelper(t *testing.T) *Helper { // Start JWK certificate mock server for testing jwkMockTeardown := helper.StartJWKCertServerMock() + // Teardown order: terminate the testcontainer FIRST so the Docker + // container is removed before anything else. If server shutdown hangs + // and the force-exit goroutine kills the process, the Docker container + // would remain alive and keep the Prow pod stuck (HYPERFLEET-625). + // CleanDB is omitted because the container is destroyed anyway. helper.teardowns = []func() error{ - helper.CleanDB, - jwkMockTeardown, - helper.stopAPIServer, helper.teardownEnv, + helper.stopAPIServer, + helper.stopMetricsServer, + helper.stopHealthServer, + jwkMockTeardown, } helper.startAPIServer() helper.startMetricsServer() @@ -181,6 +187,13 @@ func (helper *Helper) stopMetricsServer() error { return nil } +func (helper *Helper) stopHealthServer() error { + if err := helper.HealthServer.Stop(); err != nil { + return fmt.Errorf("unable to stop health server: %s", err.Error()) + } + return nil +} + func (helper *Helper) startHealthServer() { ctx := context.Background() helper.HealthServer = server.NewHealthServer() diff --git a/test/integration/integration_test.go b/test/integration/integration_test.go index 0348717..a2e3acd 100755 --- a/test/integration/integration_test.go +++ b/test/integration/integration_test.go @@ -7,6 +7,7 @@ import ( "path/filepath" "runtime" "testing" + "time" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" "github.com/openshift-hyperfleet/hyperfleet-api/test" @@ -57,6 +58,21 @@ func TestMain(m *testing.M) { helper := test.NewHelper(&testing.T{}) exitCode := m.Run() + + // Force exit if teardown hangs (e.g., due to a panic leaving resources in a bad state). + // Without this, hung teardown blocks the process from exiting, causing + // Prow CI jobs to stay in "pending" state indefinitely (HYPERFLEET-625). + // 45s allows the testcontainer termination (30s timeout) to complete first. + localExit := exitCode + go func() { + time.Sleep(45 * time.Second) + logger.Error(ctx, "Teardown timed out after 45s, forcing exit") + if localExit == 0 { + localExit = 1 + } + os.Exit(localExit) + }() + helper.Teardown() os.Exit(exitCode) } From d61a6942e615a154b0d03718f37174ffe7957460 Mon Sep 17 00:00:00 2001 From: Rafael Benevides Date: Mon, 9 Feb 2026 09:53:14 -0300 Subject: [PATCH 2/2] test: Add intentional panic test to verify Prow failure reporting --- cmd/hyperfleet-api/server/api_server.go | 4 +++- cmd/hyperfleet-api/server/metrics_server.go | 4 +++- test/helper.go | 4 ++-- test/integration/clusters_test.go | 8 ++++++++ 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/cmd/hyperfleet-api/server/api_server.go b/cmd/hyperfleet-api/server/api_server.go index 3121172..67a20c6 100755 --- a/cmd/hyperfleet-api/server/api_server.go +++ b/cmd/hyperfleet-api/server/api_server.go @@ -157,5 +157,7 @@ func (s apiServer) Start() { } func (s apiServer) Stop() error { - return s.httpServer.Shutdown(context.Background()) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + return s.httpServer.Shutdown(ctx) } diff --git a/cmd/hyperfleet-api/server/metrics_server.go b/cmd/hyperfleet-api/server/metrics_server.go index 259e836..ef4d40d 100755 --- a/cmd/hyperfleet-api/server/metrics_server.go +++ b/cmd/hyperfleet-api/server/metrics_server.go @@ -71,5 +71,7 @@ func (s metricsServer) Start() { } func (s metricsServer) Stop() error { - return s.httpServer.Shutdown(context.Background()) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + return s.httpServer.Shutdown(ctx) } diff --git a/test/helper.go b/test/helper.go index 75d7b1b..324a96f 100755 --- a/test/helper.go +++ b/test/helper.go @@ -108,9 +108,9 @@ func NewHelper(t *testing.T) *Helper { // Start JWK certificate mock server for testing jwkMockTeardown := helper.StartJWKCertServerMock() - // Teardown order: terminate the testcontainer FIRST so the Docker + // Teardown order: terminate the testcontainer FIRST so the // container is removed before anything else. If server shutdown hangs - // and the force-exit goroutine kills the process, the Docker container + // and the force-exit goroutine kills the process, the container // would remain alive and keep the Prow pod stuck (HYPERFLEET-625). // CleanDB is omitted because the container is destroyed anyway. helper.teardowns = []func() error{ diff --git a/test/integration/clusters_test.go b/test/integration/clusters_test.go index c6ca6e8..793fb4f 100644 --- a/test/integration/clusters_test.go +++ b/test/integration/clusters_test.go @@ -778,3 +778,11 @@ func TestClusterPost_WrongKind(t *testing.T) { Expect(ok).To(BeTrue()) Expect(detail).To(ContainSubstring("kind must be 'Cluster'")) } + +// TestClusterPanicFailure is a temporary test to verify that Prow correctly +// reports integration test failures when a panic occurs during test execution. +// This test should be removed after confirming the behavior. +func TestClusterPanicFailure(t *testing.T) { + _, _ = test.RegisterIntegration(t) + panic("intentional panic to test Prow failure reporting") +}