diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index d09e221e5..b10925efc 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -10,6 +10,7 @@ dependencies: - c-compiler - cloudpickle - cmake>=3.26.4,!=3.30.0 +- cuda-core>=0.3.2 - cuda-cudart-dev - cuda-nvcc - cuda-version=12.9 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 535497a32..4d383e938 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -10,6 +10,7 @@ dependencies: - c-compiler - cloudpickle - cmake>=3.26.4,!=3.30.0 +- cuda-core>=0.3.2 - cuda-cudart-dev - cuda-nvcc - cuda-version=12.9 diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index b7e6b4774..bcf7605b6 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -10,6 +10,7 @@ dependencies: - c-compiler - cloudpickle - cmake>=3.26.4,!=3.30.0 +- cuda-core>=0.3.2 - cuda-cudart-dev - cuda-nvcc - cuda-version=13.1 diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index ea64db655..49de3b6e2 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -10,6 +10,7 @@ dependencies: - c-compiler - cloudpickle - cmake>=3.26.4,!=3.30.0 +- cuda-core>=0.3.2 - cuda-cudart-dev - cuda-nvcc - cuda-version=13.1 diff --git a/conda/recipes/ucxx/recipe.yaml b/conda/recipes/ucxx/recipe.yaml index 441a1812f..bb5b16466 100644 --- a/conda/recipes/ucxx/recipe.yaml +++ b/conda/recipes/ucxx/recipe.yaml @@ -290,8 +290,7 @@ outputs: - ${{ pin_subpackage("libucxx", exact=True) }} - cuda-cudart-dev run: - - numba >=0.60.0,<0.62.0 - - numba-cuda >=0.22.1 + - cuda-core >=0.3.2 - numpy >=1.23,<3.0 # 'nvidia-ml-py' provides the 'pynvml' module - nvidia-ml-py>=12 @@ -431,8 +430,7 @@ outputs: - setuptools>=77.0.0 - wheel run: - - numba >=0.60.0,<0.62.0 - - numba-cuda >=0.22.1 + - cuda-core >=0.3.2 - python - pyyaml >=6 - rapids-dask-dependency ${{ rapids_version }} diff --git a/dependencies.yaml b/dependencies.yaml index 022f2cc81..289d8b00f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -297,9 +297,31 @@ dependencies: - &numpy numpy>=1.23,<3.0 # 'nvidia-ml-py' provides the 'pynvml' module - nvidia-ml-py>=12 + - cuda-core>=0.3.2 + run_python_distributed_ucxx: + common: + - output_types: [conda, requirements, pyproject] + packages: + - rapids-dask-dependency==26.4.*,>=0.0.0a0 + - pyyaml>=6 + - cuda-core>=0.3.2 + test_cpp: + common: + - output_types: conda + packages: + - *cmake_ver + test_python_ucxx: + common: + - output_types: [conda, requirements, pyproject] + packages: + - cloudpickle + - pytest<9.0.0 + - pytest-asyncio>=1.0.0 + - pytest-rerunfailures!=16.0.0 # See https://github.com/pytest-dev/pytest-rerunfailures/issues/302 + - rapids-dask-dependency==26.4.*,>=0.0.0a0 - output_types: [conda] packages: - - &numba_cuda numba-cuda>=0.22.1 + - &numba_cuda_test numba-cuda>=0.22.1 specific: - output_types: [requirements, pyproject] matrices: @@ -307,25 +329,26 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - &numba_cuda_cu12 numba-cuda[cu12]>=0.22.1 + - &numba_cuda_cu12_test numba-cuda[cu12]>=0.22.1 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - &numba_cuda_cu13 numba-cuda[cu13]>=0.22.1 + - &numba_cuda_cu13_test numba-cuda[cu13]>=0.22.1 # fallback to numba-cuda with no extra CUDA packages if 'cuda_suffixed' isn't true - matrix: packages: - - *numba_cuda - run_python_distributed_ucxx: + - *numba_cuda_test + test_python_distributed_ucxx: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==26.4.*,>=0.0.0a0 - - pyyaml>=6 + - *numpy + - pytest<9.0.0 + - pytest-rerunfailures!=16.0.0 # See https://github.com/pytest-dev/pytest-rerunfailures/issues/302 - output_types: [conda] packages: - - *numba_cuda + - *numba_cuda_test specific: - output_types: [requirements, pyproject] matrices: @@ -333,37 +356,15 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - *numba_cuda_cu12 + - *numba_cuda_cu12_test - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - *numba_cuda_cu13 - # fallback to numba-cuda with no extra CUDA packages if 'cuda_suffixed' isn't true + - *numba_cuda_cu13_test - matrix: packages: - - *numba_cuda - test_cpp: - common: - - output_types: conda - packages: - - *cmake_ver - test_python_ucxx: - common: - - output_types: [conda, requirements, pyproject] - packages: - - cloudpickle - - pytest<9.0.0 - - pytest-asyncio>=1.0.0 - - pytest-rerunfailures!=16.0.0 # See https://github.com/pytest-dev/pytest-rerunfailures/issues/302 - - rapids-dask-dependency==26.4.*,>=0.0.0a0 - test_python_distributed_ucxx: - common: - - output_types: [conda, requirements, pyproject] - packages: - - *numpy - - pytest<9.0.0 - - pytest-rerunfailures!=16.0.0 # See https://github.com/pytest-dev/pytest-rerunfailures/issues/302 + - *numba_cuda_test depends_on_cupy: common: - output_types: conda diff --git a/python/distributed-ucxx/distributed_ucxx/ucxx.py b/python/distributed-ucxx/distributed_ucxx/ucxx.py index 56eaf96ae..ff5585184 100644 --- a/python/distributed-ucxx/distributed_ucxx/ucxx.py +++ b/python/distributed-ucxx/distributed_ucxx/ucxx.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: BSD-3-Clause """ @@ -94,13 +94,12 @@ class CudaStream(Enum): def synchronize_stream(stream: CudaStream = CudaStream.Default): - import numba.cuda + from ucxx._cuda_context import synchronize_default_stream if stream == CudaStream.Default: - numba_stream = numba.cuda.default_stream() + synchronize_default_stream() else: raise ValueError("Unsupported stream") - numba_stream.synchronize() class gc_disabled: @@ -246,11 +245,11 @@ def init_once(): or ("cuda" in ucx_tls and "^cuda" not in ucx_tls) ): try: - import numba.cuda - except ImportError: + from ucxx._cuda_context import ensure_cuda_context + except ImportError as e: raise ImportError( - "CUDA support with UCX requires Numba for context management" - ) + "CUDA support with UCX requires cuda-core for context management." + ) from e cuda_visible_device = get_device_index_and_uuid( os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0] @@ -261,7 +260,7 @@ def init_once(): pre_existing_cuda_context.device_info, os.getpid() ) - numba.cuda.current_context() + ensure_cuda_context(0) cuda_context_created = has_cuda_context() if ( @@ -291,7 +290,8 @@ def init_once(): pool_size_str = get_rmm_config("pool-size") - # Find the function, `cuda_array()`, to use when allocating new CUDA arrays + # Find the function, `cuda_array()`, to use when allocating new CUDA arrays. + # RMM is required for CUDA array allocation at runtime (numba is only for tests). try: import rmm @@ -304,22 +304,9 @@ def device_array(n): pool_allocator=True, managed_memory=False, initial_pool_size=pool_size ) except ImportError: - try: - import numba.cuda - - def numba_device_array(n): - a = numba.cuda.device_array((n,), dtype="u1") - weakref.finalize(a, numba.cuda.current_context) - return a - - device_array = numba_device_array - except ImportError: - - def device_array(n): - raise RuntimeError( - "In order to send/recv CUDA arrays, Numba or RMM is required" - ) + def device_array(n): + raise RuntimeError("In order to send/recv CUDA arrays, RMM is required.") if pool_size_str is not None: logger.warning( diff --git a/python/distributed-ucxx/pyproject.toml b/python/distributed-ucxx/pyproject.toml index db13c0660..e0de7a322 100644 --- a/python/distributed-ucxx/pyproject.toml +++ b/python/distributed-ucxx/pyproject.toml @@ -20,7 +20,7 @@ license = "BSD-3-Clause" license-files = ["LICENSE"] requires-python = ">=3.11" dependencies = [ - "numba-cuda>=0.22.1", + "cuda-core>=0.3.2", "pyyaml>=6", "rapids-dask-dependency==26.4.*,>=0.0.0a0", "ucxx==0.49.*,>=0.0.0a0", @@ -46,6 +46,7 @@ docs = [ test = [ "cudf==26.4.*,>=0.0.0a0", "cupy-cuda13x>=13.6.0", + "numba-cuda>=0.22.1", "numpy>=1.23,<3.0", "pytest-rerunfailures!=16.0.0", "pytest<9.0.0", diff --git a/python/ucxx/examples/basic.py b/python/ucxx/examples/basic.py index 291ccc17c..5db46c205 100644 --- a/python/ucxx/examples/basic.py +++ b/python/ucxx/examples/basic.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: BSD-3-Clause import argparse @@ -12,9 +12,9 @@ def _create_cuda_context(): - import numba.cuda + from ucxx._cuda_context import ensure_cuda_context - numba.cuda.current_context() + ensure_cuda_context(0) async def _progress_coroutine(worker): diff --git a/python/ucxx/pyproject.toml b/python/ucxx/pyproject.toml index e16b7d0c5..a95642d97 100644 --- a/python/ucxx/pyproject.toml +++ b/python/ucxx/pyproject.toml @@ -19,8 +19,8 @@ authors = [ license = "BSD-3-Clause" requires-python = ">=3.11" dependencies = [ + "cuda-core>=0.3.2", "libucxx==0.49.*,>=0.0.0a0", - "numba-cuda>=0.22.1", "numpy>=1.23,<3.0", "nvidia-ml-py>=12", "rmm==26.4.*,>=0.0.0a0", @@ -44,6 +44,7 @@ test = [ "cloudpickle", "cudf==26.4.*,>=0.0.0a0", "cupy-cuda13x>=13.6.0", + "numba-cuda>=0.22.1", "pytest-asyncio>=1.0.0", "pytest-rerunfailures!=16.0.0", "pytest<9.0.0", diff --git a/python/ucxx/ucxx/_cuda_context.py b/python/ucxx/ucxx/_cuda_context.py new file mode 100644 index 000000000..a4d32f518 --- /dev/null +++ b/python/ucxx/ucxx/_cuda_context.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: BSD-3-Clause + +"""CUDA context management using cuda.core. + +Provides helpers to ensure a CUDA context is created and to synchronize +the default stream. +""" + + +def _get_device_class(): + """Get the Device class from cuda.core.""" + try: + from cuda.core import Device + + return Device + except ImportError: + try: + from cuda.core.experimental import Device + + return Device + except ImportError as e: + raise ImportError( + "CUDA context management requires cuda-core (cuda-core>=0.3.2)." + ) from e + + +def ensure_cuda_context(device_id: int = 0) -> None: + """Ensure a CUDA context exists for the given device and set it as current. + + Parameters + ---------- + device_id : int, optional + The CUDA device index (default: 0). + """ + Device = _get_device_class() + Device(device_id).set_current() + + +def synchronize_default_stream(device_id: int = 0) -> None: + """Synchronize the default CUDA stream of the current device. + + Required when coordinating with UCX CUDA transfers (e.g. before send/recv + of CUDA buffers). + + Parameters + ---------- + device_id : int, optional + The CUDA device index (default: 0). + """ + Device = _get_device_class() + device = Device(device_id) + device.set_current() + device.sync() diff --git a/python/ucxx/ucxx/_lib_async/continuous_ucx_progress.py b/python/ucxx/ucxx/_lib_async/continuous_ucx_progress.py index 1ba0509ca..58740d8ec 100644 --- a/python/ucxx/ucxx/_lib_async/continuous_ucx_progress.py +++ b/python/ucxx/ucxx/_lib_async/continuous_ucx_progress.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: BSD-3-Clause @@ -70,9 +70,9 @@ def __eq__(self, other): def _create_context(): - import numba.cuda + from ucxx._cuda_context import ensure_cuda_context - numba.cuda.current_context() + ensure_cuda_context(0) class ThreadMode(ProgressTask): diff --git a/python/ucxx/ucxx/benchmarks/backends/ucxx_core.py b/python/ucxx/ucxx/benchmarks/backends/ucxx_core.py index ee27a8491..d06b0c526 100644 --- a/python/ucxx/ucxx/benchmarks/backends/ucxx_core.py +++ b/python/ucxx/ucxx/benchmarks/backends/ucxx_core.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: BSD-3-Clause from argparse import Namespace @@ -17,9 +17,9 @@ def _create_cuda_context(device): - import numba.cuda + from ucxx._cuda_context import ensure_cuda_context - numba.cuda.current_context(0) + ensure_cuda_context(device) def _transfer_wireup(ep, server): diff --git a/python/ucxx/ucxx/benchmarks/send_recv.py b/python/ucxx/ucxx/benchmarks/send_recv.py index f47ef732c..b919b3b94 100644 --- a/python/ucxx/ucxx/benchmarks/send_recv.py +++ b/python/ucxx/ucxx/benchmarks/send_recv.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: BSD-3-Clause import argparse @@ -54,10 +54,10 @@ def _get_backend_implementation(backend): def _set_cuda_device(object_type, device): if object_type in ["cupy", "rmm"]: - import numba.cuda + from ucxx._cuda_context import ensure_cuda_context os.environ["CUDA_VISIBLE_DEVICES"] = str(device) - numba.cuda.current_context() + ensure_cuda_context(0) def server(queue, args): diff --git a/python/ucxx/ucxx/benchmarks/utils.py b/python/ucxx/ucxx/benchmarks/utils.py index bfc7b0244..5ff72b9a0 100644 --- a/python/ucxx/ucxx/benchmarks/utils.py +++ b/python/ucxx/ucxx/benchmarks/utils.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: BSD-3-Clause import asyncio @@ -19,12 +19,12 @@ def _ensure_cuda_device(devs, rank): - import numba.cuda + from ucxx._cuda_context import ensure_cuda_context dev_id = devs[rank % len(devs)] os.environ["CUDA_VISIBLE_DEVICES"] = str(dev_id) logger.debug(f"{dev_id=}, {rank=}") - numba.cuda.current_context() + ensure_cuda_context(0) def get_allocator(