Skip to content

Commit 9bc57a2

Browse files
committed
Stabilize MPI test timing
Synchronize ranks before timed sections so scheduler skew and barrier waits are not counted as task runtime, preventing rare timeout flakes like these:

```
[ RUN ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_3_3
unknown file: error: C++ exception with description "
Task execute time need to be: time < 1 secs. Original time in secs: 1.21769
" thrown in the test body.
[ OK ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_3_3 (1224 ms)
[ FAILED ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_3_3, where GetParam() = (64-byte object <20-AA 75-60 F6-7F 00-00 C0-6C 6E-60 F6-7F 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 88-77 B8-48 FD-01 00-00>, "nesterov_a_test_task_processes_3_mpi_enabled", (3, "3")) (1225 ms)
[ RUN ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_7_7
job aborted:
[ranks] message
[0] terminated
[1] application aborted
aborting MPI_COMM_WORLD (comm=0x44000000), error 1, comm rank 1
[2] terminated
---- error analysis -----
[1] on runnervmqq1k9
D:\a\parallel_programming_course\parallel_programming_course\install\bin\ppc_func_tests aborted the job.
abort code 1
---- error analysis -----
[ PROCESS 1 ]
[ PROCESS 1 ]
Traceback (most recent call last):
File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 308, in <module>
_execute(args_dict, env_copy)
File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 283, in _execute
runner.run_processes(args_dict["additional_mpi_args"])
File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 247, in run_processes
self.__run_exec(
File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 122, in __run_exec
raise Exception(f"Subprocess return {result.returncode}.")
Exception: Subprocess return 1.
Error: Process completed with exit code 1.
```
1 parent 626e4be commit 9bc57a2

5 files changed

Lines changed: 28 additions & 6 deletions

File tree

modules/runners/src/runners.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,22 +82,22 @@ int RunAllTests() {
8282
}
8383

8484
void SyncGTestSeed() {
85-
unsigned int seed = 0;
8685
int rank = -1;
8786
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
88-
if (rank == 0) {
87+
int seed = ::testing::GTEST_FLAG(random_seed);
88+
if (rank == 0 && seed == 0) {
8989
try {
90-
seed = std::random_device{}();
90+
seed = static_cast<int>((std::random_device{}() % 99999U) + 1U);
9191
} catch (...) {
9292
seed = 0;
9393
}
9494
if (seed == 0) {
9595
const auto now = static_cast<std::uint64_t>(std::chrono::steady_clock::now().time_since_epoch().count());
96-
seed = static_cast<unsigned int>(((now & 0x7fffffffULL) | 1ULL));
96+
seed = static_cast<int>((now % 99999ULL) + 1ULL);
9797
}
9898
}
99-
MPI_Bcast(&seed, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
100-
::testing::GTEST_FLAG(random_seed) = static_cast<int>(seed);
99+
MPI_Bcast(&seed, 1, MPI_INT, 0, MPI_COMM_WORLD);
100+
::testing::GTEST_FLAG(random_seed) = seed;
101101
}
102102

103103
void SyncGTestFilter() {

modules/util/include/func_test_util.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ class BaseRunFuncTests : public ::testing::TestWithParam<FuncTestParam<InType, O
103103

104104
void ValidateTask() {
105105
EXPECT_TRUE(task_->Validation());
106+
SynchronizeMpiRanks();
106107
EXPECT_TRUE(task_->PreProcessing());
107108
}
108109

modules/util/include/perf_test_util.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ class BaseRunPerfTests : public ::testing::TestWithParam<PerfTestParam<InType, O
8585
task_ = task_getter(GetTestInputData());
8686
ppc::performance::Perf perf(task_);
8787
ppc::performance::PerfAttr perf_attr;
88+
SynchronizeMpiRanks();
8889
SetPerfAttributes(perf_attr);
8990

9091
if (mode == ppc::performance::PerfResults::TypeOfRunning::kPipeline) {

modules/util/include/util.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ int GetNumThreads();
7575
int GetNumProc();
7676
double GetTaskMaxTime();
7777
double GetPerfMaxTime();
78+
void SynchronizeMpiRanks();
7879

7980
template <typename T>
8081
std::string GetNamespace() {

modules/util/src/util.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include "util/include/util.hpp"
22

3+
#include <mpi.h>
4+
35
#include <algorithm>
46
#include <array>
57
#include <filesystem>
@@ -65,3 +67,20 @@ bool ppc::util::IsUnderMpirun() {
6567
return static_cast<bool>(mpi_env.has_value());
6668
});
6769
}
70+
71+
void ppc::util::SynchronizeMpiRanks() {
72+
int initialized = 0;
73+
if (MPI_Initialized(&initialized) != MPI_SUCCESS || initialized == 0) {
74+
return;
75+
}
76+
77+
int finalized = 0;
78+
if (MPI_Finalized(&finalized) != MPI_SUCCESS || finalized != 0) {
79+
return;
80+
}
81+
82+
const int barrier_res = MPI_Barrier(MPI_COMM_WORLD);
83+
if (barrier_res != MPI_SUCCESS) {
84+
MPI_Abort(MPI_COMM_WORLD, barrier_res);
85+
}
86+
}

0 commit comments

Comments
 (0)