diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_device_array_manipulation.py b/numba_cuda/numba/cuda/tests/cudadrv/test_device_array_manipulation.py new file mode 100644 index 000000000..995be923e --- /dev/null +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_device_array_manipulation.py @@ -0,0 +1,193 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +""" +DeviceArray behavior and memory management tests. + +These tests validate DeviceArray shape, memory lifetime, and device memory +operations. Inspired by CPU array tests, these verify CUDA-specific device +array behaviors and constraints. +""" + +import numpy as np +import pytest + +from numba import cuda +from numba.cuda.testing import CUDATestCase, skip_on_cudasim + + +@cuda.jit +def set_val_kernel(a, v): + i = cuda.grid(1) + if i < a.size: + a[i] = v + + +class TestDeviceArrayManipulation(CUDATestCase): + """Tests for DeviceArray memory behavior and shape semantics.""" + + def test_memory_lifetime(self): + """Verify device array data persists correctly across multiple + kernel launches.""" + n = 32 + arr = cuda.device_array(n, dtype=np.float32) + + threads = 16 + blocks = (n + threads - 1) // threads + set_val_kernel[blocks, threads](arr, 1.5) + + # First kernel wrote 1.5 + host = arr.copy_to_host() + np.testing.assert_allclose(host, np.full(n, 1.5)) + + # Second kernel overwrites with 2.5 on the same array + set_val_kernel[blocks, threads](arr, 2.5) + host = arr.copy_to_host() + np.testing.assert_allclose(host, np.full(n, 2.5)) + + @skip_on_cudasim("Simulator does not model device pointers") + def test_memory_pointer_stability(self): + """Verify the device pointer is unchanged after kernel launches, + ensuring the same allocation is reused.""" + n = 32 + arr = cuda.device_array(n, dtype=np.float32) + ptr_before = arr.gpu_data.device_ctypes_pointer.value + + threads = 16 + blocks = (n + threads - 1) // threads + set_val_kernel[blocks, threads](arr, 1.5) + ptr_after_first = arr.gpu_data.device_ctypes_pointer.value + self.assertEqual(ptr_before, ptr_after_first) + + set_val_kernel[blocks, threads](arr, 2.5) + ptr_after_second = arr.gpu_data.device_ctypes_pointer.value + self.assertEqual(ptr_before, ptr_after_second) + + host = arr.copy_to_host() + np.testing.assert_allclose(host, np.full(n, 2.5)) + + def test_device_array_shape(self): + """Verify DeviceArray shape attribute is correct.""" + shape = (10, 20) + arr = cuda.device_array(shape, dtype=np.float32) + self.assertEqual(arr.shape, shape) + + def test_device_array_dtype(self): + """Verify DeviceArray dtype attribute is correct.""" + dtype = np.int32 + arr = cuda.device_array(10, dtype=dtype) + self.assertEqual(arr.dtype, dtype) + + def test_device_array_size(self): + """Verify DeviceArray size attribute is correct.""" + shape = (5, 8) + arr = cuda.device_array(shape, dtype=np.float32) + self.assertEqual(arr.size, 40) + + def test_device_array_ndim(self): + """Verify DeviceArray ndim attribute is correct.""" + arr1d = cuda.device_array(10, dtype=np.float32) + arr2d = cuda.device_array((5, 8), dtype=np.float32) + arr3d = cuda.device_array((2, 3, 4), dtype=np.float32) + self.assertEqual(arr1d.ndim, 1) + self.assertEqual(arr2d.ndim, 2) + self.assertEqual(arr3d.ndim, 3) + + def test_device_array_like(self): + """Verify device_array_like creates array with matching shape and dtype.""" + host = np.arange(20, dtype=np.int32).reshape(4, 5) + dev = cuda.device_array_like(host) + self.assertEqual(dev.shape, host.shape) + self.assertEqual(dev.dtype, host.dtype) + + def test_to_device_copy_to_host_roundtrip(self): + """Verify data integrity in to_device -> copy_to_host roundtrip.""" + host_orig = np.arange(100, dtype=np.float32) + dev = cuda.to_device(host_orig) + host_copy = dev.copy_to_host() + np.testing.assert_array_equal(host_orig, host_copy) + + def test_copy_to_device_existing_array(self): + """Verify copy_to_device into pre-allocated device array.""" + host = np.arange(50, dtype=np.int32) + dev = cuda.device_array(50, dtype=np.int32) + dev.copy_to_device(host) + result = dev.copy_to_host() + np.testing.assert_array_equal(host, result) + + def test_device_to_device_copy(self): + """Verify device-to-device memory copy.""" + n = 32 + host = np.arange(n, dtype=np.float32) + dev1 = cuda.to_device(host) + dev2 = cuda.device_array(n, dtype=np.float32) + dev2.copy_to_device(dev1) + result = dev2.copy_to_host() + np.testing.assert_array_equal(host, result) + + def test_multidimensional_shape_consistency(self): + """Verify shape consistency for multidimensional arrays.""" + shapes = [(10,), (5, 8), (3, 4, 5), (2, 3, 4, 5)] + for shape in shapes: + arr = cuda.device_array(shape, dtype=np.float32) + self.assertEqual(arr.shape, shape) + self.assertEqual(arr.ndim, len(shape)) + self.assertEqual(arr.size, np.prod(shape)) + + def test_reshape_device_array(self): + """Verify reshape on a DeviceArray preserves data and changes shape.""" + host = np.arange(32, dtype=np.int32) + arr = cuda.to_device(host) + reshaped = arr.reshape((4, 8)) + self.assertEqual(reshaped.shape, (4, 8)) + np.testing.assert_array_equal(reshaped.copy_to_host(), host.reshape((4, 8))) + + def test_view_device_array(self): + """Verify view on a DeviceArray reinterprets dtype and preserves data.""" + host = np.arange(16, dtype=np.int32) + arr = cuda.to_device(host) + viewed = arr.view(np.float32) + self.assertEqual(viewed.dtype, np.float32) + self.assertEqual(viewed.shape, (16,)) + np.testing.assert_array_equal( + viewed.copy_to_host().view(np.int32), host + ) + + def test_ravel_device_array(self): + """Verify ravel on a DeviceArray returns a 1D array with correct data.""" + host = np.arange(32, dtype=np.int32).reshape(4, 8) + arr = cuda.to_device(host) + raveled = arr.ravel() + self.assertEqual(raveled.ndim, 1) + self.assertEqual(raveled.shape, (32,)) + np.testing.assert_array_equal(raveled.copy_to_host(), host.ravel()) + + def test_advanced_slicing(self): + """Verify step-slicing on a device array returns the correct elements.""" + host = np.arange(10, dtype=np.int32) + arr = cuda.to_device(host) + result = arr[::2] + np.testing.assert_array_equal(result.copy_to_host(), host[::2]) + + def test_multidimensional_slicing(self): + """Verify multidimensional slicing on a device array returns the correct elements.""" + host = np.arange(16, dtype=np.int32).reshape(4, 4) + arr = cuda.to_device(host) + result = arr[:, ::-1] + np.testing.assert_array_equal(result.copy_to_host(), host[:, ::-1]) + + @pytest.mark.xfail(reason="Boolean indexing is not supported on CUDA device arrays") + def test_boolean_indexing(self): + """Boolean mask indexing is not supported on device arrays.""" + host = np.arange(10, dtype=np.int32) + arr = cuda.to_device(host) + mask = np.array([True, False] * 5) + arr[mask] + + @pytest.mark.xfail(reason="Fancy (integer array) indexing is not supported on CUDA device arrays") + def test_fancy_indexing(self): + """Fancy indexing with integer arrays is not supported on device arrays.""" + host = np.arange(10, dtype=np.int32) + arr = cuda.to_device(host) + idx = np.array([1, 3, 5]) + arr[idx] \ No newline at end of file diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_manipulation.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_manipulation.py new file mode 100644 index 000000000..d5f1c9e98 --- /dev/null +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_manipulation.py @@ -0,0 +1,173 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +""" +CUDA array manipulation tests using kernels and Python CUDA semantics. + +These tests validate array operations, indexing, and element-wise operations +in CUDA kernels. They are inspired by CPU array manipulation test suites and +adapted to verify CUDA device semantics. +""" + +import numpy as np + +from numba import cuda +from numba.cuda.testing import CUDATestCase + + +@cuda.jit +def fill_kernel(arr, value): + i = cuda.grid(1) + if i < arr.size: + arr[i] = value + + +@cuda.jit +def add_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] + b[i] + + +@cuda.jit +def set_index(a): + a[3] = 42 + + +@cuda.jit +def cast_int_to_float(src, dst): + i = cuda.grid(1) + if i < src.size: + dst[i] = src[i] + + +@cuda.jit +def cast_float_to_int(src, dst): + i = cuda.grid(1) + if i < src.size: + dst[i] = int(src[i]) + + +@cuda.jit +def add_mixed(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = a[i] + b[i] + + +class TestArrayManipulation(CUDATestCase): + """Tests for array manipulation operations in CUDA kernels.""" + + def test_fill_basic(self): + """Basic elementwise fill operation""" + n = 128 + arr = cuda.device_array(n, dtype=np.float32) + + threads = 64 + blocks = (n + threads - 1) // threads + fill_kernel[blocks, threads](arr, 4.25) + + result = arr.copy_to_host() + np.testing.assert_array_equal(result, np.full(n, 4.25)) + + def test_elementwise_add(self): + """Elementwise addition in a CUDA kernel.""" + n = 256 + a = np.arange(n, dtype=np.float32) + b = np.ones(n, dtype=np.float32) + + da = cuda.to_device(a) + db = cuda.to_device(b) + dout = cuda.device_array_like(a) + + threads = 64 + blocks = (n + threads - 1) // threads + add_kernel[blocks, threads](da, db, dout) + + result = dout.copy_to_host() + np.testing.assert_allclose(result, a + b) + + def test_integer_getitem_setitem(self): + """Direct integer indexing in a kernel.""" + arr = cuda.device_array(10, dtype=np.int32) + set_index[1, 1](arr) + host = arr.copy_to_host() + self.assertEqual(host[3], 42) + + def test_multidimensional_indexing(self): + """2D indexing with a grid-based kernel.""" + shape = (8, 8) + host = np.zeros(shape, dtype=np.int32) + dev = cuda.to_device(host) + + @cuda.jit + def write_2d(a): + x, y = cuda.grid(2) + if x < a.shape[0] and y < a.shape[1]: + a[x, y] = x * a.shape[1] + y + + threads = (4, 4) + blocks = ((shape[0] + threads[0] - 1) // threads[0], + (shape[1] + threads[1] - 1) // threads[1]) + + write_2d[blocks, threads](dev) + result = dev.copy_to_host() + self.assertEqual(result[2, 3], 2 * shape[1] + 3) + + def test_dtype_transitions(self): + """Type casting and mixed-dtype operations in CUDA kernels.""" + n = 10 + a = np.arange(n, dtype=np.int32) + b = np.arange(n, dtype=np.float32) + da = cuda.to_device(a) + db = cuda.to_device(b) + dout1 = cuda.device_array(n, dtype=np.float32) + dout2 = cuda.device_array(n, dtype=np.int32) + + threads = 32 + blocks = (n + threads - 1) // threads + cast_int_to_float[blocks, threads](da, dout1) + cast_float_to_int[blocks, threads](db, dout2) + + np.testing.assert_allclose(dout1.copy_to_host(), a.astype(np.float32)) + np.testing.assert_array_equal(dout2.copy_to_host(), b.astype(np.int32)) + + # Mixed dtype elementwise op + dout3 = cuda.device_array(n, dtype=np.float32) + add_mixed[blocks, threads](da, db, dout3) + np.testing.assert_allclose(dout3.copy_to_host(), a + b) + + def test_shape_semantics_in_kernel(self): + """Shape attributes are accessible inside a kernel.""" + shape = (5, 7) + arr = cuda.device_array(shape, dtype=np.int32) + + @cuda.jit + def check_shape(a, out): + if cuda.threadIdx.x == 0 and cuda.blockIdx.x == 0: + out[0] = a.shape[0] + out[1] = a.shape[1] + + out = cuda.device_array(2, dtype=np.int32) + check_shape[1, 1](arr, out) + host = out.copy_to_host() + self.assertEqual(tuple(host), shape) + + def test_grid_stride_correctness(self): + """Grid-stride loop pattern.""" + n = 10 + arr = cuda.device_array(n, dtype=np.int32) + + @cuda.jit + def grid_stride_kernel(a): + for i in range(cuda.grid(1), a.size, cuda.gridsize(1)): + a[i] = i + + # Intentionally over-provisioned: more threads than elements + # to verify the grid-stride loop handles the wrap-around. + threads = 32 + blocks = 2 + grid_stride_kernel[blocks, threads](arr) + host = arr.copy_to_host() + np.testing.assert_array_equal(host, np.arange(n)) +