From b7c4270a8ed717226fc5962566ef5b5d0338ebf9 Mon Sep 17 00:00:00 2001 From: Rafael Benevides Date: Mon, 9 Feb 2026 10:45:37 -0300 Subject: [PATCH] HYPERFLEET-625 - fix: Add timeout to testcontainer teardown to prevent Prow hang When integration tests fail with a panic, the process continues to the teardown phase where container.Terminate() is called with no timeout. If the Docker container termination hangs, the process never exits, causing the Prow CI job to stay stuck in "pending" state indefinitely. Add a 30-second timeout context to Testcontainer.Close() so the teardown always completes, allowing the process to exit and Prow to report the test failure status back to GitHub. Also bound the API and metrics server shutdowns to 10 seconds, run the testcontainer teardown first, stop the metrics and health servers during teardown, and force the test process to exit if teardown is still running after 45 seconds. --- cmd/hyperfleet-api/server/api_server.go | 4 +++- cmd/hyperfleet-api/server/metrics_server.go | 4 +++- pkg/db/db_session/testcontainer.go | 6 +++++- test/helper.go | 19 ++++++++++++++++--- test/integration/integration_test.go | 16 ++++++++++++++++ 5 files changed, 43 insertions(+), 6 deletions(-) diff --git a/cmd/hyperfleet-api/server/api_server.go b/cmd/hyperfleet-api/server/api_server.go index 3121172..67a20c6 100755 --- a/cmd/hyperfleet-api/server/api_server.go +++ b/cmd/hyperfleet-api/server/api_server.go @@ -157,5 +157,7 @@ func (s apiServer) Start() { } func (s apiServer) Stop() error { - return s.httpServer.Shutdown(context.Background()) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + return s.httpServer.Shutdown(ctx) } diff --git a/cmd/hyperfleet-api/server/metrics_server.go b/cmd/hyperfleet-api/server/metrics_server.go index 259e836..ef4d40d 100755 --- a/cmd/hyperfleet-api/server/metrics_server.go +++ b/cmd/hyperfleet-api/server/metrics_server.go @@ -71,5 +71,7 @@ func (s metricsServer) Start() { } func (s metricsServer) Stop() error { - return s.httpServer.Shutdown(context.Background()) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + return s.httpServer.Shutdown(ctx) } diff --git 
a/pkg/db/db_session/testcontainer.go b/pkg/db/db_session/testcontainer.go index 8ea19c9..903c8f3 100755 --- a/pkg/db/db_session/testcontainer.go +++ b/pkg/db/db_session/testcontainer.go @@ -147,7 +147,11 @@ func (f *Testcontainer) CheckConnection() error { } func (f *Testcontainer) Close() error { - ctx := context.Background() + // Use a timeout to prevent hanging indefinitely during teardown. + // Without this, a hung container.Terminate() would block the process from + // exiting, causing Prow CI jobs to stay in "pending" state (HYPERFLEET-625). + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() // Close SQL connection if f.sqlDB != nil { diff --git a/test/helper.go b/test/helper.go index 19ba246..324a96f 100755 --- a/test/helper.go +++ b/test/helper.go @@ -108,11 +108,17 @@ func NewHelper(t *testing.T) *Helper { // Start JWK certificate mock server for testing jwkMockTeardown := helper.StartJWKCertServerMock() + // Teardown order: terminate the testcontainer FIRST so the + // container is removed before anything else. Otherwise, if a server + // shutdown hung and the force-exit goroutine killed the process, the + // container would remain alive and keep the Prow pod stuck (HYPERFLEET-625). + // CleanDB is omitted because the container is destroyed anyway. 
helper.teardowns = []func() error{ - helper.CleanDB, - jwkMockTeardown, - helper.stopAPIServer, helper.teardownEnv, + helper.stopAPIServer, + helper.stopMetricsServer, + helper.stopHealthServer, + jwkMockTeardown, } helper.startAPIServer() helper.startMetricsServer() @@ -181,6 +187,13 @@ func (helper *Helper) stopMetricsServer() error { return nil } +func (helper *Helper) stopHealthServer() error { + if err := helper.HealthServer.Stop(); err != nil { + return fmt.Errorf("unable to stop health server: %s", err.Error()) + } + return nil +} + func (helper *Helper) startHealthServer() { ctx := context.Background() helper.HealthServer = server.NewHealthServer() diff --git a/test/integration/integration_test.go b/test/integration/integration_test.go index 0348717..a2e3acd 100755 --- a/test/integration/integration_test.go +++ b/test/integration/integration_test.go @@ -7,6 +7,7 @@ import ( "path/filepath" "runtime" "testing" + "time" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" "github.com/openshift-hyperfleet/hyperfleet-api/test" @@ -57,6 +58,21 @@ func TestMain(m *testing.M) { helper := test.NewHelper(&testing.T{}) exitCode := m.Run() + + // Force exit if teardown hangs (e.g., due to a panic leaving resources in a bad state). + // Without this, a hung teardown blocks the process from exiting, causing + // Prow CI jobs to stay in "pending" state indefinitely (HYPERFLEET-625). + // 45s allows the testcontainer termination (30s timeout) to complete first. + localExit := exitCode + go func() { + time.Sleep(45 * time.Second) + logger.Error(ctx, "Teardown timed out after 45s, forcing exit") + if localExit == 0 { + localExit = 1 + } + os.Exit(localExit) + }() + helper.Teardown() os.Exit(exitCode) }