From fe1c0fa795b7150a36e9b4611dcea7da07c9b9a4 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 26 May 2026 16:12:34 +0200 Subject: [PATCH] Increase test timeout and rerun flaky tests (#662) --- ci/run_python.sh | 36 +++++++++++++++---- .../distributed_ucxx/tests/test_worker.py | 6 +++- .../_lib_async/tests/test_multiple_nodes.py | 12 +++++-- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/ci/run_python.sh b/ci/run_python.sh index 6e6c0581f..def2aac6a 100755 --- a/ci/run_python.sh +++ b/ci/run_python.sh @@ -72,12 +72,36 @@ run_py_benchmark() { echo -e "\e[1;33mSLOW BENCHMARK: it may seem like a deadlock but will eventually complete.\e[0m" fi - UCX_KEEPALIVE_INTERVAL=1ms \ - UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} \ - UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} \ - python "${TIMEOUT_TOOL_PATH}" --enable-python $((2*60)) \ - python -m ucxx.benchmarks.send_recv --backend "${BACKEND}" \ - -o cupy --reuse-alloc -n 8MiB --n-buffers "$N_BUFFERS" --progress-mode "${PROGRESS_MODE}" ${ASYNCIO_WAIT} + MAX_ATTEMPTS=3 + LAST_STATUS=0 + + set +e + for attempt in $(seq 1 "${MAX_ATTEMPTS}"); do + echo "Attempt ${attempt}/${MAX_ATTEMPTS} to run Python benchmark" + + UCX_KEEPALIVE_INTERVAL=1ms \ + UCXPY_ENABLE_DELAYED_SUBMISSION=${ENABLE_DELAYED_SUBMISSION} \ + UCXPY_ENABLE_PYTHON_FUTURE=${ENABLE_PYTHON_FUTURE} \ + python "${TIMEOUT_TOOL_PATH}" --enable-python $((2*60)) \ + python -m ucxx.benchmarks.send_recv --backend "${BACKEND}" \ + -o cupy --reuse-alloc -n 8MiB --n-buffers "$N_BUFFERS" --progress-mode "${PROGRESS_MODE}" ${ASYNCIO_WAIT} + + LAST_STATUS=$? + if [ "${LAST_STATUS}" -eq 0 ]; then + break + fi + + if [ "${attempt}" -lt "${MAX_ATTEMPTS}" ]; then + echo "Python benchmark failed with status ${LAST_STATUS}; retrying" + sleep 1 + fi + done + set -e + + if [ "${LAST_STATUS}" -ne 0 ]; then + echo "Failure running Python benchmark after ${MAX_ATTEMPTS} attempts" + exit "${LAST_STATUS}" + fi } log_message "Python Core Tests" diff --git a/python/distributed-ucxx/distributed_ucxx/tests/test_worker.py b/python/distributed-ucxx/distributed_ucxx/tests/test_worker.py index b7f71e8cf..0d25c2118 100644 --- a/python/distributed-ucxx/distributed_ucxx/tests/test_worker.py +++ b/python/distributed-ucxx/distributed_ucxx/tests/test_worker.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: BSD-3-Clause import pytest @@ -12,6 +12,10 @@ @pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) @pytest.mark.parametrize("Worker", [Worker, Nanny]) +@pytest.mark.flaky( + reruns=3, + only_rerun="Trying to reset UCX but not all Endpoints and/or Listeners are closed", +) @gen_test() async def test_protocol_from_scheduler_address(ucxx_loop, protocol, Worker): async with Scheduler(protocol=protocol, dashboard_address=":0") as s: diff --git a/python/ucxx/ucxx/_lib_async/tests/test_multiple_nodes.py b/python/ucxx/ucxx/_lib_async/tests/test_multiple_nodes.py index 1043a1a3f..a9d6de358 100644 --- a/python/ucxx/ucxx/_lib_async/tests/test_multiple_nodes.py +++ b/python/ucxx/ucxx/_lib_async/tests/test_multiple_nodes.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: BSD-3-Clause import asyncio @@ -40,7 +40,15 @@ async def client_node(port): @pytest.mark.asyncio @pytest.mark.parametrize("num_servers", [1, 2, 4]) -@pytest.mark.parametrize("num_clients", [1, 10, 50, 100]) +@pytest.mark.parametrize( + "num_clients", + [ + 1, + 10, + pytest.param(50, marks=pytest.mark.asyncio_timeout(90)), + pytest.param(100, marks=pytest.mark.asyncio_timeout(90)), + ], +) async def test_many_servers_many_clients(num_servers, num_clients): somaxconn = get_somaxconn()