Bound server shutdown and test teardown with timeouts so that a hung
teardown cannot keep the process alive (HYPERFLEET-625).

  * cmd/hyperfleet-api/server/api_server.go, metrics_server.go:
    Stop() calls httpServer.Shutdown with a 10s context timeout instead of
    context.Background().
  * pkg/db/db_session/testcontainer.go: Close() runs under a 30s context
    timeout instead of context.Background().
  * test/helper.go: teardown order becomes teardownEnv, stopAPIServer,
    stopMetricsServer, stopHealthServer, jwkMockTeardown; CleanDB is dropped.
    Adds stopHealthServer, which wraps the Stop() error with %w so callers
    can still inspect the underlying error.
  * test/integration/integration_test.go: TestMain starts a goroutine that
    logs and calls os.Exit (with a non-zero code) 45s after m.Run() returns
    if helper.Teardown() has not finished by then.

NOTE(review): line breaks and tab indentation were reconstructed from a
whitespace-collapsed copy of this patch. The nesting depth of the NewHelper
context lines and the position of blank context lines are assumed; confirm
against the tree (or apply with --ignore-whitespace) before use. The
post-image blob hash on the test/helper.go index line predates the %w change.

NOTE(review): the api_server.go, metrics_server.go and testcontainer.go hunks
use the time package without adding an import; presumably those files already
import it. Confirm.

diff --git a/cmd/hyperfleet-api/server/api_server.go b/cmd/hyperfleet-api/server/api_server.go
index 3121172..67a20c6 100755
--- a/cmd/hyperfleet-api/server/api_server.go
+++ b/cmd/hyperfleet-api/server/api_server.go
@@ -157,5 +157,7 @@ func (s apiServer) Start() {
 }
 
 func (s apiServer) Stop() error {
-	return s.httpServer.Shutdown(context.Background())
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	return s.httpServer.Shutdown(ctx)
 }
diff --git a/cmd/hyperfleet-api/server/metrics_server.go b/cmd/hyperfleet-api/server/metrics_server.go
index 259e836..ef4d40d 100755
--- a/cmd/hyperfleet-api/server/metrics_server.go
+++ b/cmd/hyperfleet-api/server/metrics_server.go
@@ -71,5 +71,7 @@ func (s metricsServer) Start() {
 }
 
 func (s metricsServer) Stop() error {
-	return s.httpServer.Shutdown(context.Background())
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	return s.httpServer.Shutdown(ctx)
 }
diff --git a/pkg/db/db_session/testcontainer.go b/pkg/db/db_session/testcontainer.go
index 8ea19c9..903c8f3 100755
--- a/pkg/db/db_session/testcontainer.go
+++ b/pkg/db/db_session/testcontainer.go
@@ -147,7 +147,11 @@ func (f *Testcontainer) CheckConnection() error {
 }
 
 func (f *Testcontainer) Close() error {
-	ctx := context.Background()
+	// Use a timeout to prevent hanging indefinitely during teardown.
+	// Without this, a hung container.Terminate() would block the process from
+	// exiting, causing Prow CI jobs to stay in "pending" state (HYPERFLEET-625).
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
 
 	// Close SQL connection
 	if f.sqlDB != nil {
diff --git a/test/helper.go b/test/helper.go
index 19ba246..324a96f 100755
--- a/test/helper.go
+++ b/test/helper.go
@@ -108,11 +108,17 @@ func NewHelper(t *testing.T) *Helper {
 		// Start JWK certificate mock server for testing
 		jwkMockTeardown := helper.StartJWKCertServerMock()
 
+		// Teardown order: terminate the testcontainer FIRST so the
+		// container is removed before anything else. If server shutdown hangs
+		// and the force-exit goroutine kills the process, the container
+		// would remain alive and keep the Prow pod stuck (HYPERFLEET-625).
+		// CleanDB is omitted because the container is destroyed anyway.
 		helper.teardowns = []func() error{
-			helper.CleanDB,
-			jwkMockTeardown,
-			helper.stopAPIServer,
 			helper.teardownEnv,
+			helper.stopAPIServer,
+			helper.stopMetricsServer,
+			helper.stopHealthServer,
+			jwkMockTeardown,
 		}
 		helper.startAPIServer()
 		helper.startMetricsServer()
@@ -181,6 +187,13 @@ func (helper *Helper) stopMetricsServer() error {
 	return nil
 }
 
+func (helper *Helper) stopHealthServer() error {
+	if err := helper.HealthServer.Stop(); err != nil {
+		return fmt.Errorf("unable to stop health server: %w", err)
+	}
+	return nil
+}
+
 func (helper *Helper) startHealthServer() {
 	ctx := context.Background()
 	helper.HealthServer = server.NewHealthServer()
diff --git a/test/integration/integration_test.go b/test/integration/integration_test.go
index 0348717..a2e3acd 100755
--- a/test/integration/integration_test.go
+++ b/test/integration/integration_test.go
@@ -7,6 +7,7 @@ import (
 	"path/filepath"
 	"runtime"
 	"testing"
+	"time"
 
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
 	"github.com/openshift-hyperfleet/hyperfleet-api/test"
@@ -57,6 +58,21 @@ func TestMain(m *testing.M) {
 
 	helper := test.NewHelper(&testing.T{})
 	exitCode := m.Run()
+
+	// Force exit if teardown hangs (e.g., due to a panic leaving resources in a bad state).
+	// Without this, hung teardown blocks the process from exiting, causing
+	// Prow CI jobs to stay in "pending" state indefinitely (HYPERFLEET-625).
+	// 45s allows the testcontainer termination (30s timeout) to complete first.
+	localExit := exitCode
+	go func() {
+		time.Sleep(45 * time.Second)
+		logger.Error(ctx, "Teardown timed out after 45s, forcing exit")
+		if localExit == 0 {
+			localExit = 1
+		}
+		os.Exit(localExit)
+	}()
+
 	helper.Teardown()
 	os.Exit(exitCode)
 }