From a0f25af532becd4f449964e6d8870182b91fd9b0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 15 Aug 2025 13:52:17 -0700 Subject: [PATCH 01/20] initial --- numba_cuda/numba/cuda/cudadrv/driver.py | 36 +++++++++++++++++++------ numba_cuda/numba/cuda/dispatcher.py | 7 ++++- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 7470cb8b5..58b045acd 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -3508,10 +3508,15 @@ def host_to_device(dst, src, size, stream=0): varargs = [] if stream: - assert isinstance(stream, Stream) + from cuda.core import experimental + + assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemcpyHtoDAsync if USE_NV_BINDING: - handle = stream.handle.value + if isinstance(stream, experimental.Stream): + handle = int(stream.handle) + else: + handle = stream.handle.value else: handle = stream.handle varargs.append(handle) @@ -3530,10 +3535,15 @@ def device_to_host(dst, src, size, stream=0): varargs = [] if stream: - assert isinstance(stream, Stream) + from cuda.core import experimental + + assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemcpyDtoHAsync if USE_NV_BINDING: - handle = stream.handle.value + if isinstance(stream, experimental.Stream): + handle = int(stream.handle) + else: + handle = stream.handle.value else: handle = stream.handle varargs.append(handle) @@ -3552,10 +3562,15 @@ def device_to_device(dst, src, size, stream=0): varargs = [] if stream: - assert isinstance(stream, Stream) + from cuda.core import experimental # Ensure experimental is imported + + assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemcpyDtoDAsync if USE_NV_BINDING: - handle = stream.handle.value + if isinstance(stream, experimental.Stream): + handle = int(stream.handle) + else: + handle = stream.handle.value else: handle = stream.handle varargs.append(handle) @@ -3577,10 +3592,15 @@ def device_memset(dst, val, size, stream=0): varargs = [] if stream: - assert isinstance(stream, Stream) + from cuda.core import experimental # Ensure experimental is imported + + assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemsetD8Async if USE_NV_BINDING: - handle = stream.handle.value + if isinstance(stream, experimental.Stream): + handle = int(stream.handle) + else: + handle = stream.handle.value else: handle = stream.handle varargs.append(handle) diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index 78d777227..b56b0f8a2 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -467,7 +467,12 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0): self._prepare_args(t, v, stream, retr, kernelargs) if driver.USE_NV_BINDING: - stream_handle = stream and stream.handle.value or 0 + from cuda.core import experimental + + if isinstance(stream, experimental.Stream): + stream_handle = int(stream.handle) + else: + stream_handle = stream and stream.handle.value or 0 else: zero_stream = None stream_handle = stream and stream.handle or zero_stream From 5322eef438c2b6cff5a1367577b30062941f4a86 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sat, 16 Aug 2025 06:11:06 -0700 Subject: [PATCH 02/20] tests --- .../cuda/tests/cudadrv/test_cuda_driver.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index 4da2548f8..dc179577b 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -9,6 +9,8 @@ from numba.cuda.cudadrv import devices, drvapi, driver as _driver from numba.cuda.testing import unittest, CUDATestCase from numba.cuda.testing import skip_on_cudasim +from cuda.core import experimental +import contextlib ptx1 = """ @@ -151,6 +153,44 @@ def test_cuda_driver_stream_operations(self): for i, v in enumerate(array): self.assertEqual(i, v) + def test_cuda_core_stream_operations(self): + module = self.context.create_module_ptx(self.ptx) + function = module.get_function("_Z10helloworldPi") + array = (c_int * 100)() + dev = experimental.Device() + dev.set_current() + stream = dev.create_stream() + + @contextlib.contextmanager + def auto_synchronize(stream): + try: + yield stream + finally: + stream.sync() + + with auto_synchronize(stream): + memory = self.context.memalloc(sizeof(array)) + host_to_device(memory, array, sizeof(array), stream=stream) + + ptr = memory.device_ctypes_pointer + + launch_kernel( + function.handle, # Kernel + 1, + 1, + 1, # gx, gy, gz + 100, + 1, + 1, # bx, by, bz + 0, # dynamic shared mem + stream.handle, # stream + [ptr], + ) + + device_to_host(array, memory, sizeof(array), stream=stream) + for i, v in enumerate(array): + self.assertEqual(i, v) + def test_cuda_driver_default_stream(self): # Test properties of the default stream ds = self.context.get_default_stream() From 251f4e9dfb08d0bd00f5ce8a7b437297d08691a0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sat, 16 Aug 2025 07:10:33 -0700 Subject: [PATCH 03/20] refactor --- numba_cuda/numba/cuda/cudadrv/driver.py | 101 +++++++++++------------- 1 file changed, 46 insertions(+), 55 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 58b045acd..47163f289 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -11,6 +11,7 @@ """ +from cuda.core import experimental import sys import os import ctypes @@ -3499,31 +3500,33 @@ def device_memory_depends(devmem, *objs): depset.extend(objs) +def _stream_handle(stream): + from cuda.core import experimental + + if USE_NV_BINDING: + if isinstance(stream, experimental.Stream): + return int(stream.handle) + return stream.handle.value + return stream.handle + + def host_to_device(dst, src, size, stream=0): """ NOTE: The underlying data pointer from the host data buffer is used and it should not be changed until the operation which can be asynchronous completes. """ - varargs = [] + fn = driver.cuMemcpyHtoD + args = (device_pointer(dst), host_pointer(src, readonly=True), size) if stream: from cuda.core import experimental assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemcpyHtoDAsync - if USE_NV_BINDING: - if isinstance(stream, experimental.Stream): - handle = int(stream.handle) - else: - handle = stream.handle.value - else: - handle = stream.handle - varargs.append(handle) - else: - fn = driver.cuMemcpyHtoD + args += (_stream_handle(stream),) - fn(device_pointer(dst), host_pointer(src, readonly=True), size, *varargs) + fn(*args) def device_to_host(dst, src, size, stream=0): @@ -3532,82 +3535,60 @@ def device_to_host(dst, src, size, stream=0): it should not be changed until the operation which can be asynchronous completes. """ - varargs = [] + fn = driver.cuMemcpyDtoH + args = (host_pointer(dst), device_pointer(src), size) if stream: from cuda.core import experimental assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemcpyDtoHAsync - if USE_NV_BINDING: - if isinstance(stream, experimental.Stream): - handle = int(stream.handle) - else: - handle = stream.handle.value - else: - handle = stream.handle - varargs.append(handle) - else: - fn = driver.cuMemcpyDtoH + args += (_stream_handle(stream),) - fn(host_pointer(dst), device_pointer(src), size, *varargs) + fn(*args) def device_to_device(dst, src, size, stream=0): """ - NOTE: The underlying data pointer from the host data buffer is used and + NOTE: The underlying data pointer from the device buffer is used and it should not be changed until the operation which can be asynchronous completes. """ - varargs = [] + fn = driver.cuMemcpyDtoD + args = (device_pointer(dst), device_pointer(src), size) if stream: - from cuda.core import experimental # Ensure experimental is imported + from cuda.core import experimental assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemcpyDtoDAsync - if USE_NV_BINDING: - if isinstance(stream, experimental.Stream): - handle = int(stream.handle) - else: - handle = stream.handle.value - else: - handle = stream.handle - varargs.append(handle) - else: - fn = driver.cuMemcpyDtoD + args += (_stream_handle(stream),) - fn(device_pointer(dst), device_pointer(src), size, *varargs) + fn(*args) def device_memset(dst, val, size, stream=0): - """Memset on the device. - If stream is not zero, asynchronous mode is used. + """ + Memset on the device. + If stream is 0, the call is synchronous. + If stream is a Stream object, asynchronous mode is used. dst: device memory val: byte value to be written - size: number of byte to be written - stream: a CUDA stream + size: number of bytes to be written + stream: 0 (synchronous) or a CUDA stream """ - varargs = [] + fn = driver.cuMemsetD8 + args = (device_pointer(dst), val, size) if stream: - from cuda.core import experimental # Ensure experimental is imported + from cuda.core import experimental assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemsetD8Async - if USE_NV_BINDING: - if isinstance(stream, experimental.Stream): - handle = int(stream.handle) - else: - handle = stream.handle.value - else: - handle = stream.handle - varargs.append(handle) - else: - fn = driver.cuMemsetD8 + args += (_stream_handle(stream),) - fn(device_pointer(dst), val, size, *varargs) + fn(*args) def profile_start(): @@ -3668,3 +3649,13 @@ def inspect_obj_content(objpath: str): code_types.add(match.group(1)) return code_types + + +def _stream_handle(stream): + if isinstance(stream, experimental.Stream): + return int(stream.handle) + else: + if USE_NV_BINDING: + return stream.handle.value + else: + return stream.handle From 505cd4da1b6c9f8d3415a0a46d86f2b52511d7c2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 18 Aug 2025 04:38:25 -0700 Subject: [PATCH 04/20] small changes --- numba_cuda/numba/cuda/cudadrv/driver.py | 12 ++---------- numba_cuda/numba/cuda/dispatcher.py | 11 +---------- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 47163f289..bb95b905a 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -3501,8 +3501,6 @@ def device_memory_depends(devmem, *objs): def _stream_handle(stream): - from cuda.core import experimental - if USE_NV_BINDING: if isinstance(stream, experimental.Stream): return int(stream.handle) @@ -3520,8 +3518,6 @@ def host_to_device(dst, src, size, stream=0): args = (device_pointer(dst), host_pointer(src, readonly=True), size) if stream: - from cuda.core import experimental - assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemcpyHtoDAsync args += (_stream_handle(stream),) @@ -3539,8 +3535,6 @@ def device_to_host(dst, src, size, stream=0): args = (host_pointer(dst), device_pointer(src), size) if stream: - from cuda.core import experimental - assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemcpyDtoHAsync args += (_stream_handle(stream),) @@ -3558,8 +3552,6 @@ def device_to_device(dst, src, size, stream=0): args = (device_pointer(dst), device_pointer(src), size) if stream: - from cuda.core import experimental - assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemcpyDtoDAsync args += (_stream_handle(stream),) @@ -3582,8 +3574,6 @@ def device_memset(dst, val, size, stream=0): args = (device_pointer(dst), val, size) if stream: - from cuda.core import experimental - assert isinstance(stream, (Stream, experimental.Stream)) fn = driver.cuMemsetD8Async args += (_stream_handle(stream),) @@ -3652,6 +3642,8 @@ def inspect_obj_content(objpath: str): def _stream_handle(stream): + if stream == 0: + return stream if isinstance(stream, experimental.Stream): return int(stream.handle) else: diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index b56b0f8a2..2392f960c 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -466,16 +466,7 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0): for t, v in zip(self.argument_types, args): self._prepare_args(t, v, stream, retr, kernelargs) - if driver.USE_NV_BINDING: - from cuda.core import experimental - - if isinstance(stream, experimental.Stream): - stream_handle = int(stream.handle) - else: - stream_handle = stream and stream.handle.value or 0 - else: - zero_stream = None - stream_handle = stream and stream.handle or zero_stream + stream_handle = driver._stream_handle(stream) # Invoke kernel driver.launch_kernel( From b861723bebad7c23797d9f477c06d1ab056b4d4c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 18 Aug 2025 05:21:10 -0700 Subject: [PATCH 05/20] __cuda_stream__ --- numba_cuda/numba/cuda/cudadrv/driver.py | 28 ++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index bb95b905a..fece87ba7 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2338,6 +2338,11 @@ def __int__(self): # The default stream's handle.value is 0, which gives `None` return self.handle.value or drvapi.CU_STREAM_DEFAULT + def __cuda_stream__(self): + if not self.handle.value: + return (0, drvapi.CU_STREAM_DEFAULT) + return 0, self.handle.value if USE_NV_BINDING else self.handle + def __repr__(self): default_streams = { drvapi.CU_STREAM_DEFAULT: "", @@ -3500,14 +3505,6 @@ def device_memory_depends(devmem, *objs): depset.extend(objs) -def _stream_handle(stream): - if USE_NV_BINDING: - if isinstance(stream, experimental.Stream): - return int(stream.handle) - return stream.handle.value - return stream.handle - - def host_to_device(dst, src, size, stream=0): """ NOTE: The underlying data pointer from the host data buffer is used and @@ -3644,10 +3641,13 @@ def inspect_obj_content(objpath: str): def _stream_handle(stream): if stream == 0: return stream - if isinstance(stream, experimental.Stream): - return int(stream.handle) - else: - if USE_NV_BINDING: - return stream.handle.value + elif hasattr(stream, "__cuda_stream__"): + _, ptr = stream.__cuda_stream__() + if isinstance(stream, experimental.Stream): + return int(ptr) else: - return stream.handle + return ptr + else: + raise TypeError( + "Expected a Stream object or 0, got %s" % type(stream).__name__ + ) From 2181748191a9c77f2ce93dd2c88fd1fddecca281 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 20 Aug 2025 07:41:18 -0700 Subject: [PATCH 06/20] accomodate ctypes bindings --- numba_cuda/numba/cuda/cudadrv/driver.py | 45 +++++++++++++------------ 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index b828e02b2..e73a854d2 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -11,7 +11,6 @@ """ -from cuda.core import experimental import sys import os import ctypes @@ -59,11 +58,11 @@ if USE_NV_BINDING: from cuda.bindings import driver as binding - from cuda.core.experimental import ( - Linker, - LinkerOptions, - ObjectCode, - ) + + from cuda.core import experimental + # Linker, + # LinkerOptions, + # ObjectCode, # There is no definition of the default stream in the Nvidia bindings (nor # is there at the C/C++ level), so we define it here so we don't need to @@ -1508,7 +1507,7 @@ def create_module_ptx(self, ptx): if isinstance(ptx, str): ptx = ptx.encode("utf8") if USE_NV_BINDING: - image = ObjectCode.from_ptx(ptx) + image = experimental.ObjectCode.from_ptx(ptx) else: image = c_char_p(ptx) return self.create_module_image(image) @@ -2981,7 +2980,7 @@ def __init__( self.lto = lto self.additional_flags = additional_flags - self.options = LinkerOptions( + self.options = experimental.LinkerOptions( max_register_count=self.max_registers, lineinfo=lineinfo, arch=arch, @@ -3008,7 +3007,7 @@ def error_log(self): raise RuntimeError("Link not yet complete.") def add_ptx(self, ptx, name=""): - obj = ObjectCode.from_ptx(ptx, name=name) + obj = experimental.ObjectCode.from_ptx(ptx, name=name) self._object_codes.append(obj) def add_cu(self, cu, name=""): @@ -3024,23 +3023,23 @@ def add_cu(self, cu, name=""): self._object_codes.append(obj) def add_cubin(self, cubin, name=""): - obj = ObjectCode.from_cubin(cubin, name=name) + obj = experimental.ObjectCode.from_cubin(cubin, name=name) self._object_codes.append(obj) def add_ltoir(self, ltoir, name=""): - obj = ObjectCode.from_ltoir(ltoir, name=name) + obj = experimental.ObjectCode.from_ltoir(ltoir, name=name) self._object_codes.append(obj) def add_fatbin(self, fatbin, name=""): - obj = ObjectCode.from_fatbin(fatbin, name=name) + obj = experimental.ObjectCode.from_fatbin(fatbin, name=name) self._object_codes.append(obj) def add_object(self, obj, name=""): - obj = ObjectCode.from_object(obj, name=name) + obj = experimental.ObjectCode.from_object(obj, name=name) self._object_codes.append(obj) def add_library(self, lib, name=""): - obj = ObjectCode.from_library(lib, name=name) + obj = experimental.ObjectCode.from_library(lib, name=name) self._object_codes.append(obj) def add_file(self, path, kind): @@ -3074,7 +3073,7 @@ def add_data(self, data, kind, name): fn(data, name) def get_linked_ptx(self): - options = LinkerOptions( + options = experimental.LinkerOptions( max_register_count=self.max_registers, lineinfo=self.lineinfo, arch=self.arch, @@ -3082,7 +3081,7 @@ def get_linked_ptx(self): ptx=True, ) - self.linker = Linker(*self._object_codes, options=options) + self.linker = experimental.Linker(*self._object_codes, options=options) result = self.linker.link("ptx") self.close() @@ -3095,7 +3094,9 @@ def close(self): self.linker.close() def complete(self): - self.linker = Linker(*self._object_codes, options=self.options) + self.linker = experimental.Linker( + *self._object_codes, options=self.options + ) result = self.linker.link("cubin") self.close() self._complete = True @@ -3634,11 +3635,13 @@ def _stream_handle(stream): return stream elif hasattr(stream, "__cuda_stream__"): _, ptr = stream.__cuda_stream__() - if isinstance(stream, experimental.Stream): + if isinstance(ptr, binding.CUstream): return int(ptr) else: return ptr else: - raise TypeError( - "Expected a Stream object or 0, got %s" % type(stream).__name__ - ) + allowed = (Stream, experimental.Stream) if USE_NV_BINDING else Stream + if not isinstance(stream, allowed): + raise TypeError( + "Expected a Stream object or 0, got %s" % type(stream).__name__ + ) From 46863d3f0cf5004023656dea83681c33f546ecdf Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 20 Aug 2025 07:41:55 -0700 Subject: [PATCH 07/20] clean --- numba_cuda/numba/cuda/cudadrv/driver.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index e73a854d2..8eeab7d08 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -58,11 +58,7 @@ if USE_NV_BINDING: from cuda.bindings import driver as binding - from cuda.core import experimental - # Linker, - # LinkerOptions, - # ObjectCode, # There is no definition of the default stream in the Nvidia bindings (nor # is there at the C/C++ level), so we define it here so we don't need to From 2082063ad65d783be26a1b77dead544dc220a3b6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 20 Aug 2025 08:11:07 -0700 Subject: [PATCH 08/20] more pacifying ctypes bindings --- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index dc179577b..2a52890e0 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -9,9 +9,11 @@ from numba.cuda.cudadrv import devices, drvapi, driver as _driver from numba.cuda.testing import unittest, CUDATestCase from numba.cuda.testing import skip_on_cudasim -from cuda.core import experimental import contextlib +from numba import config +if config.USE_NV_BINDING: + from cuda.core import experimental ptx1 = """ .version 1.4 @@ -153,6 +155,7 @@ def test_cuda_driver_stream_operations(self): for i, v in enumerate(array): self.assertEqual(i, v) + @unittest.skipIf(not config.USE_NV_BINDING, "NV binding not enabled") def test_cuda_core_stream_operations(self): module = self.context.create_module_ptx(self.ptx) function = module.get_function("_Z10helloworldPi") From ec5841c694afa8e89d8d6cab90fb4c3434f63723 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 20 Aug 2025 08:58:19 -0700 Subject: [PATCH 09/20] fix --- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index 2a52890e0..5f04beeec 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -10,9 +10,8 @@ from numba.cuda.testing import unittest, CUDATestCase from numba.cuda.testing import skip_on_cudasim import contextlib -from numba import config -if config.USE_NV_BINDING: +if _driver.USE_NV_BINDING: from cuda.core import experimental ptx1 = """ @@ -155,7 +154,7 @@ def test_cuda_driver_stream_operations(self): for i, v in enumerate(array): self.assertEqual(i, v) - @unittest.skipIf(not config.USE_NV_BINDING, "NV binding not enabled") + @unittest.skipIf(not _driver.USE_NV_BINDING, "NV binding not enabled") def test_cuda_core_stream_operations(self): module = self.context.create_module_ptx(self.ptx) function = module.get_function("_Z10helloworldPi") From 4fcf9d1426fd8d84ae902bc5b929adce0d163c7d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 25 Aug 2025 08:08:42 -0700 Subject: [PATCH 10/20] renaming --- numba_cuda/numba/cuda/cudadrv/driver.py | 41 ++++++++++--------- .../cuda/tests/cudadrv/test_cuda_driver.py | 4 +- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 3060b6cb6..a22a6f234 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -61,7 +61,12 @@ if USE_NV_BINDING: from cuda.bindings import driver as binding - from cuda.core import experimental + from cuda.core.experimental import ( + Linker, + LinkerOptions, + ObjectCode, + Stream as ExperimentalStream, + ) # There is no definition of the default stream in the Nvidia bindings (nor # is there at the C/C++ level), so we define it here so we don't need to @@ -1500,7 +1505,7 @@ def create_module_ptx(self, ptx): if isinstance(ptx, str): ptx = ptx.encode("utf8") if USE_NV_BINDING: - image = experimental.ObjectCode.from_ptx(ptx) + image = ObjectCode.from_ptx(ptx) else: image = c_char_p(ptx) return self.create_module_image(image) @@ -2973,7 +2978,7 @@ def __init__( self.lto = lto self.additional_flags = additional_flags - self.options = experimental.LinkerOptions( + self.options = LinkerOptions( max_register_count=self.max_registers, lineinfo=lineinfo, arch=arch, @@ -3000,7 +3005,7 @@ def error_log(self): raise RuntimeError("Link not yet complete.") def add_ptx(self, ptx, name=""): - obj = experimental.ObjectCode.from_ptx(ptx, name=name) + obj = ObjectCode.from_ptx(ptx, name=name) self._object_codes.append(obj) def add_cu(self, cu, name=""): @@ -3016,23 +3021,23 @@ def add_cu(self, cu, name=""): self._object_codes.append(obj) def add_cubin(self, cubin, name=""): - obj = experimental.ObjectCode.from_cubin(cubin, name=name) + obj = ObjectCode.from_cubin(cubin, name=name) self._object_codes.append(obj) def add_ltoir(self, ltoir, name=""): - obj = experimental.ObjectCode.from_ltoir(ltoir, name=name) + obj = ObjectCode.from_ltoir(ltoir, name=name) self._object_codes.append(obj) def add_fatbin(self, fatbin, name=""): - obj = experimental.ObjectCode.from_fatbin(fatbin, name=name) + obj = ObjectCode.from_fatbin(fatbin, name=name) self._object_codes.append(obj) def add_object(self, obj, name=""): - obj = experimental.ObjectCode.from_object(obj, name=name) + obj = ObjectCode.from_object(obj, name=name) self._object_codes.append(obj) def add_library(self, lib, name=""): - obj = experimental.ObjectCode.from_library(lib, name=name) + obj = ObjectCode.from_library(lib, name=name) self._object_codes.append(obj) def add_file(self, path, kind): @@ -3066,7 +3071,7 @@ def add_data(self, data, kind, name): fn(data, name) def get_linked_ptx(self): - options = experimental.LinkerOptions( + options = LinkerOptions( max_register_count=self.max_registers, lineinfo=self.lineinfo, arch=self.arch, @@ -3074,7 +3079,7 @@ def get_linked_ptx(self): ptx=True, ) - self.linker = experimental.Linker(*self._object_codes, options=options) + self.linker = Linker(*self._object_codes, options=options) result = self.linker.link("ptx") self.close() @@ -3087,9 +3092,7 @@ def close(self): self.linker.close() def complete(self): - self.linker = experimental.Linker( - *self._object_codes, options=self.options - ) + self.linker = Linker(*self._object_codes, options=self.options) result = self.linker.link("cubin") self.close() self._complete = True @@ -3405,7 +3408,7 @@ def host_to_device(dst, src, size, stream=0): args = (device_pointer(dst), host_pointer(src, readonly=True), size) if stream: - assert isinstance(stream, (Stream, experimental.Stream)) + assert isinstance(stream, (Stream, ExperimentalStream)) fn = driver.cuMemcpyHtoDAsync args += (_stream_handle(stream),) @@ -3422,7 +3425,7 @@ def device_to_host(dst, src, size, stream=0): args = (host_pointer(dst), device_pointer(src), size) if stream: - assert isinstance(stream, (Stream, experimental.Stream)) + assert isinstance(stream, (Stream, ExperimentalStream)) fn = driver.cuMemcpyDtoHAsync args += (_stream_handle(stream),) @@ -3439,7 +3442,7 @@ def device_to_device(dst, src, size, stream=0): args = (device_pointer(dst), device_pointer(src), size) if stream: - assert isinstance(stream, (Stream, experimental.Stream)) + assert isinstance(stream, (Stream, ExperimentalStream)) fn = driver.cuMemcpyDtoDAsync args += (_stream_handle(stream),) @@ -3461,7 +3464,7 @@ def device_memset(dst, val, size, stream=0): args = (device_pointer(dst), val, size) if stream: - assert isinstance(stream, (Stream, experimental.Stream)) + assert isinstance(stream, (Stream, ExperimentalStream)) fn = driver.cuMemsetD8Async args += (_stream_handle(stream),) @@ -3538,7 +3541,7 @@ def _stream_handle(stream): else: return ptr else: - allowed = (Stream, experimental.Stream) if USE_NV_BINDING else Stream + allowed = (Stream, ExperimentalStream) if USE_NV_BINDING else Stream if not isinstance(stream, allowed): raise TypeError( "Expected a Stream object or 0, got %s" % type(stream).__name__ diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index c925f03ab..65f90d328 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -15,7 +15,7 @@ import contextlib if _driver.USE_NV_BINDING: - from cuda.core import experimental + from cuda.core.experimental import Device ptx1 = """ .version 1.4 @@ -162,7 +162,7 @@ def test_cuda_core_stream_operations(self): module = self.context.create_module_ptx(self.ptx) function = module.get_function("_Z10helloworldPi") array = (c_int * 100)() - dev = experimental.Device() + dev = Device() dev.set_current() stream = dev.create_stream() From 220c2e3e20be1738467e317bbc455c344c44c538 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 25 Aug 2025 08:56:49 -0700 Subject: [PATCH 11/20] address reviews --- numba_cuda/numba/cuda/cudadrv/driver.py | 9 +++++++- .../cuda/tests/cudadrv/test_cuda_driver.py | 21 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index a22a6f234..6c88566a0 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -61,6 +61,7 @@ if USE_NV_BINDING: from cuda.bindings import driver as binding + from cuda.bindings.utils import get_cuda_native_handle from cuda.core.experimental import ( Linker, LinkerOptions, @@ -3532,12 +3533,18 @@ def inspect_obj_content(objpath: str): def _stream_handle(stream): + """ + Obtain the appropriate handle for various types of + acceptable stream objects. Acceptable types are + int (0 for default stream), Stream, ExperimentalStream + """ + if stream == 0: return stream elif hasattr(stream, "__cuda_stream__"): _, ptr = stream.__cuda_stream__() if isinstance(ptr, binding.CUstream): - return int(ptr) + return get_cuda_native_handle(ptr) else: return ptr else: diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index 65f90d328..a31b1cb55 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -9,6 +9,7 @@ driver, launch_kernel, ) +from numba import cuda from numba.cuda.cudadrv import devices, drvapi, driver as _driver from numba.cuda.testing import unittest, CUDATestCase from numba.cuda.testing import skip_on_cudasim @@ -196,6 +197,26 @@ def auto_synchronize(stream): for i, v in enumerate(array): self.assertEqual(i, v) + @unittest.skipIf(not _driver.USE_NV_BINDING, "NV binding not enabled") + def test_cuda_core_stream_launch_user_facing(self): + @cuda.jit + def kernel(a): + idx = cuda.grid(1) + if idx < len(a): + a[idx] = idx + + ary = cuda.to_device([0] * 100) + + dev = Device() + dev.set_current() + stream = dev.create_stream() + + kernel[1, 100, stream](ary) + + result = ary.copy_to_host() + for i, v in enumerate(result): + self.assertEqual(i, v) + def test_cuda_driver_default_stream(self): # Test properties of the default stream ds = self.context.get_default_stream() From f3b07c0174e004cf9d1bdc820c04a9fa8b97c25e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 25 Aug 2025 10:57:00 -0500 Subject: [PATCH 12/20] Update numba_cuda/numba/cuda/cudadrv/driver.py Co-authored-by: Keith Kraus --- numba_cuda/numba/cuda/cudadrv/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 6c88566a0..6af08c0f6 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2339,7 +2339,7 @@ def __int__(self): def __cuda_stream__(self): if not self.handle.value: return (0, drvapi.CU_STREAM_DEFAULT) - return 0, self.handle.value if USE_NV_BINDING else self.handle + return (0, self.handle.value if USE_NV_BINDING else self.handle) def __repr__(self): default_streams = { From 20440ab80f61a9d2ec661840c017a17260589587 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 7 Oct 2025 12:26:00 -0700 Subject: [PATCH 13/20] address some reviews --- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index a31b1cb55..a0cbdae1a 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -205,15 +205,16 @@ def kernel(a): if idx < len(a): a[idx] = idx - ary = cuda.to_device([0] * 100) - dev = Device() dev.set_current() stream = dev.create_stream() + ary = cuda.to_device([0] * 100, stream=stream) + stream.sync() + kernel[1, 100, stream](ary) - result = ary.copy_to_host() + result = ary.copy_to_host(stream=stream) for i, v in enumerate(result): self.assertEqual(i, v) From f0ff9d5b723657ea6dd9feae34ce20d062b8313f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 13 Oct 2025 04:51:17 -0700 Subject: [PATCH 14/20] fix ctypes tests --- numba_cuda/numba/cuda/cudadrv/driver.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 79d1e51d2..423df2cdc 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -3382,7 +3382,6 @@ def host_to_device(dst, src, size, stream=0): args = (device_pointer(dst), host_pointer(src, readonly=True), size) if stream: - assert isinstance(stream, (Stream, ExperimentalStream)) fn = driver.cuMemcpyHtoDAsync args += (_stream_handle(stream),) @@ -3399,7 +3398,6 @@ def device_to_host(dst, src, size, stream=0): args = (host_pointer(dst), device_pointer(src), size) if stream: - assert isinstance(stream, (Stream, ExperimentalStream)) fn = driver.cuMemcpyDtoHAsync args += (_stream_handle(stream),) @@ -3416,7 +3414,6 @@ def device_to_device(dst, src, size, stream=0): args = (device_pointer(dst), device_pointer(src), size) if stream: - assert isinstance(stream, (Stream, ExperimentalStream)) fn = driver.cuMemcpyDtoDAsync args += (_stream_handle(stream),) @@ -3438,7 +3435,6 @@ def device_memset(dst, val, size, stream=0): args = (device_pointer(dst), val, size) if stream: - assert isinstance(stream, (Stream, ExperimentalStream)) fn = driver.cuMemsetD8Async args += (_stream_handle(stream),) @@ -3531,15 +3527,19 @@ def _stream_handle(stream): if stream == 0: return stream + if USE_NV_BINDING: + allowed = (Stream, ExperimentalStream) + else: + allowed = (Stream,) + if not isinstance(stream, allowed): + raise TypeError( + "Expected a Stream object or 0, got %s" % type(stream).__name__ + ) elif hasattr(stream, "__cuda_stream__"): _, ptr = stream.__cuda_stream__() - if isinstance(ptr, binding.CUstream): + if USE_NV_BINDING and isinstance(ptr, binding.CUstream): return get_cuda_native_handle(ptr) else: return ptr else: - allowed = (Stream, ExperimentalStream) if USE_NV_BINDING else Stream - if not isinstance(stream, allowed): - raise TypeError( - "Expected a Stream object or 0, got %s" % type(stream).__name__ - ) + raise TypeError("Invalid Stream") From 1b59b5ce1703a0df97a8fe6d5e1ead749576ab10 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 13 Oct 2025 04:53:39 -0700 Subject: [PATCH 15/20] addressing old comments --- numba_cuda/numba/cuda/cudadrv/driver.py | 3 ++- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 423df2cdc..5d394d623 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -3536,7 +3536,8 @@ def _stream_handle(stream): "Expected a Stream object or 0, got %s" % type(stream).__name__ ) elif hasattr(stream, "__cuda_stream__"): - _, ptr = stream.__cuda_stream__() + ver, ptr = stream.__cuda_stream__() + assert ver == 0 if USE_NV_BINDING and isinstance(ptr, binding.CUstream): return get_cuda_native_handle(ptr) else: diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index a0cbdae1a..1e53abefa 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -213,6 +213,7 @@ def kernel(a): stream.sync() kernel[1, 100, stream](ary) + stream.sync() result = ary.copy_to_host(stream=stream) for i, v in enumerate(result): From d1ad5770771ca4f48cc0d3190e520d73f1532aa3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 13:25:47 -0700 Subject: [PATCH 16/20] small fix --- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index 6032d129c..06b1d2a5d 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -16,8 +16,7 @@ from numba.cuda.testing import skip_on_cudasim import contextlib -if _driver.USE_NV_BINDING: - from cuda.core.experimental import Device +from cuda.core.experimental import Device ptx1 = """ .version 1.4 From b7b56eb98a8647f96964b20bd3134b3989f163d1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 15 Oct 2025 14:30:28 -0700 Subject: [PATCH 17/20] small fix --- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index 06b1d2a5d..d15dca6bd 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -156,7 +156,6 @@ def test_cuda_driver_stream_operations(self): for i, v in enumerate(array): self.assertEqual(i, v) - @unittest.skipIf(not _driver.USE_NV_BINDING, "NV binding not enabled") def test_cuda_core_stream_operations(self): module = self.context.create_module_ptx(self.ptx) function = module.get_function("_Z10helloworldPi") @@ -195,7 +194,6 @@ def auto_synchronize(stream): for i, v in enumerate(array): self.assertEqual(i, v) - @unittest.skipIf(not _driver.USE_NV_BINDING, "NV binding not enabled") def test_cuda_core_stream_launch_user_facing(self): @cuda.jit def kernel(a): From 324a48a31649f7872e02c6523d7a6e54323b1efa Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 27 Oct 2025 10:05:25 -0700 Subject: [PATCH 18/20] USE_NV_BINDING --- numba_cuda/numba/cuda/cudadrv/driver.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 594d4e993..f0a90a1d8 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -69,9 +69,6 @@ Stream as ExperimentalStream, ) -# For backwards compatibility: indicate that the NVIDIA CUDA Python bindings are -# in use. Older code checks this flag to branch on binding-specific behavior. -USE_NV_BINDING = True # There is no definition of the default stream in the Nvidia bindings (nor # is there at the C/C++ level), so we define it here so we don't need to @@ -2076,7 +2073,7 @@ def __int__(self): def __cuda_stream__(self): if not self.handle.value: return (0, drvapi.CU_STREAM_DEFAULT) - return (0, self.handle.value if USE_NV_BINDING else self.handle) + return (0, self.handle.value) def __repr__(self): default_streams = { From 7df62cecf5a1a7e517c10e777bda2dfa52d5ffe9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 27 Oct 2025 13:23:51 -0700 Subject: [PATCH 19/20] events --- numba_cuda/numba/cuda/cudadrv/driver.py | 4 ++-- numba_cuda/numba/cuda/tests/cudadrv/test_events.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index f0a90a1d8..ca8287d06 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -2221,7 +2221,7 @@ def record(self, stream=0): queued in the stream at the time of the call to ``record()`` has been completed. """ - hstream = stream.handle.value if stream else binding.CUstream(0) + hstream = _stream_handle(stream) handle = self.handle.value driver.cuEventRecord(handle, hstream) @@ -2236,7 +2236,7 @@ def wait(self, stream=0): """ All future works submitted to stream will wait util the event completes. """ - hstream = stream.handle.value if stream else binding.CUstream(0) + hstream = _stream_handle(stream) handle = self.handle.value flags = 0 driver.cuStreamWaitEvent(hstream, handle, flags) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_events.py b/numba_cuda/numba/cuda/tests/cudadrv/test_events.py index ade58f0d7..95b7a18e1 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_events.py @@ -4,6 +4,7 @@ import numpy as np from numba import cuda from numba.cuda.testing import unittest, CUDATestCase +from cuda.core.experimental import Device class TestCudaEvent(CUDATestCase): @@ -22,8 +23,17 @@ def test_event_elapsed(self): evtstart.elapsed_time(evtend) def test_event_elapsed_stream(self): - N = 32 stream = cuda.stream() + self.event_elapsed_inner(stream) + + def test_event_elapsed_cuda_core_stream(self): + dev = Device() + dev.set_current() + stream = dev.create_stream() + self.event_elapsed_inner(stream) + + def event_elapsed_inner(self, stream): + N = 32 dary = cuda.device_array(N, dtype=np.double) evtstart = cuda.event() evtend = cuda.event() From f8594664929f383bdb3d24cf73b4b9f65cc86c74 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 27 Oct 2025 14:03:51 -0700 Subject: [PATCH 20/20] skip event tests on sim --- numba_cuda/numba/cuda/tests/cudadrv/test_events.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_events.py b/numba_cuda/numba/cuda/tests/cudadrv/test_events.py index 95b7a18e1..20ceebb97 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_events.py @@ -5,6 +5,7 @@ from numba import cuda from numba.cuda.testing import unittest, CUDATestCase from cuda.core.experimental import Device +from numba.cuda.testing import skip_on_cudasim class TestCudaEvent(CUDATestCase): @@ -26,6 +27,7 @@ def test_event_elapsed_stream(self): stream = cuda.stream() self.event_elapsed_inner(stream) + @skip_on_cudasim("Testing cuda.core events requires driver") def test_event_elapsed_cuda_core_stream(self): dev = Device() dev.set_current()