diff --git a/.github/workflows/nightly-e2e.yml b/.github/workflows/nightly-e2e.yml index e0828676c5..5295589b90 100644 --- a/.github/workflows/nightly-e2e.yml +++ b/.github/workflows/nightly-e2e.yml @@ -18,25 +18,31 @@ on: jobs: compatibility-test: runs-on: ubuntu-latest - timeout-minutes: 200 + # Each consumer version runs as an independent matrix job. A single scenario + # normally completes in well under 30 minutes, so this timeout makes a hung + # scenario (e.g. a consumer chain that never starts) fail fast instead of + # blocking for hours. + timeout-minutes: 30 + strategy: + # Don't cancel sibling versions if one version is incompatible/hangs. + fail-fast: false + matrix: + # Consumer versions tested against the latest provider. + # For new versions to be tested add/remove entries here. + consumer-version: [latest] steps: - uses: actions/setup-go@v5 with: go-version: "1.25.9" - uses: actions/checkout@v4 with: - fetch-depth: 0 # get all history for all branches and tags + fetch-depth: 0 # get all history for all branches and tags (needed to build tagged versions) - name: Checkout LFS objects run: git lfs checkout - - name: Setup Go - uses: actions/setup-go@v5 - with: - go-version: "1.25.9" # The Go version to download (if necessary) and use. - name: E2E compatibility test - # Run compatibility tests for different consumer (-cv) and provider (-pv) versions. - # Combination of all provider versions with consumer versions are tested. - # For new versions to be tested add/modify -pc/-cv parameters. - run: go run ./tests/e2e/... --tc compatibility -pv latest -cv latest -cv v5.2.0 -cv v6.3.0 + # Run compatibility tests for the latest provider against a single consumer + # version per matrix job. + run: go run ./tests/e2e/... --tc compatibility -pv latest -cv ${{ matrix.consumer-version }} happy-path-test: runs-on: ubuntu-latest timeout-minutes: 20 @@ -377,6 +383,7 @@ jobs: nightly-test-fail: needs: + - compatibility-test - happy-path-test - changeover-test - democracy-reward-test diff --git a/tests/e2e/actions.go b/tests/e2e/actions.go index c5d715b159..6c452b1224 100644 --- a/tests/e2e/actions.go +++ b/tests/e2e/actions.go @@ -35,6 +35,12 @@ const ( V620 = "v6.2.0" ) +// startChainTimeout bounds how long StartChain waits for a chain to come up and +// signal `done`. A healthy chain starts in well under a minute; this generous +// cap exists so that a chain which never produces blocks (e.g. an incompatible +// consumer version) fails fast instead of hanging until the CI job timeout. +const startChainTimeout = 5 * time.Minute + // Note: to get error response reported back from this command '--gas auto' needs to be set. var gas = "auto" @@ -199,17 +205,39 @@ func (tr *Chain) StartChain( scanner := bufio.NewScanner(cmdReader) - for scanner.Scan() { - out := scanner.Text() - if verbose { - fmt.Println("startChain: " + out) + // Wait for the chain-start script to emit the `done` sentinel, but bound the + // wait: start-chain.sh polls for the chain to produce blocks in an unbounded + // loop, so a chain that never starts would otherwise block here until the CI + // job timeout. Fail fast instead. + scanDone := make(chan error, 1) + go func() { + for scanner.Scan() { + out := scanner.Text() + if verbose { + fmt.Println("startChain: " + out) + } + if out == done { + scanDone <- nil + return + } } - if out == done { - break + if err := scanner.Err(); err != nil { + scanDone <- err + return } - } - if err := scanner.Err(); err != nil { - log.Fatal(err) + // The script exited (stdout closed) before signaling done, which means + // the chain failed to start rather than just being slow. + scanDone <- fmt.Errorf("chain %s start script exited before signaling done", action.Chain) + }() + + select { + case err := <-scanDone: + if err != nil { + log.Fatal(err) + } + case <-time.After(startChainTimeout): + _ = cmd.Process.Kill() + log.Fatalf("timed out after %s waiting for chain %s to start", startChainTimeout, action.Chain) } tr.addChainToRelayer(AddChainToRelayerAction{ diff --git a/tests/e2e/v5/actions.go b/tests/e2e/v5/actions.go index b5ae710243..1d8c06e474 100644 --- a/tests/e2e/v5/actions.go +++ b/tests/e2e/v5/actions.go @@ -23,6 +23,11 @@ import ( const ( done = "done!!!!!!!!" + // scanTimeout bounds how long we wait for a child script to emit the `done` + // sentinel, so a chain/command that never completes fails fast instead of + // hanging until the CI job timeout. + scanTimeout = 5 * time.Minute + VLatest = "latest" V400 = "v4.0.0" V330 = "v3.3.0" @@ -136,17 +141,33 @@ func (tr *Chain) StartChain( scanner := bufio.NewScanner(cmdReader) - for scanner.Scan() { - out := scanner.Text() - if verbose { - fmt.Println("startChain: " + out) + scanDone := make(chan error, 1) + go func() { + for scanner.Scan() { + out := scanner.Text() + if verbose { + fmt.Println("startChain: " + out) + } + if out == done { + scanDone <- nil + return + } } - if out == done { - break + if err := scanner.Err(); err != nil { + scanDone <- err + return } - } - if err := scanner.Err(); err != nil { - log.Fatal(err) + scanDone <- fmt.Errorf("chain %s start script exited before signaling done", action.Chain) + }() + + select { + case err := <-scanDone: + if err != nil { + log.Fatal(err) + } + case <-time.After(scanTimeout): + _ = cmd.Process.Kill() + log.Fatalf("timed out after %s waiting for chain %s to start", scanTimeout, action.Chain) } tr.addChainToRelayer(AddChainToRelayerAction{ @@ -820,17 +841,33 @@ func (tr Chain) AssignConsumerPubKey(action e2e.AssignConsumerPubKeyAction, verb scanner := bufio.NewScanner(cmdReader) - for scanner.Scan() { - out := scanner.Text() - if verbose { - fmt.Println("assign key - reconfigure: " + out) + scanDone := make(chan error, 1) + go func() { + for scanner.Scan() { + out := scanner.Text() + if verbose { + fmt.Println("assign key - reconfigure: " + out) + } + if out == done { + scanDone <- nil + return + } } - if out == done { - break + if err := scanner.Err(); err != nil { + scanDone <- err + return } - } - if err := scanner.Err(); err != nil { - log.Fatal(err) + scanDone <- fmt.Errorf("reconfigure node for %s exited before signaling done", action.Chain) + }() + + select { + case err := <-scanDone: + if err != nil { + log.Fatal(err) + } + case <-time.After(scanTimeout): + _ = configureNodeCmd.Process.Kill() + log.Fatalf("timed out after %s waiting to reconfigure node for chain %s", scanTimeout, action.Chain) } // TODO: @MSalopek refactor this so test config is not changed at runtime