NVIDIA · CodersAcademy006 · Mar 2, 2026 · gmarkall · Feb 4, 2026 · CodersAcademy006
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_device_array_manipulation.py b/numba_cuda/numba/cuda/tests/cudadrv/test_device_array_manipulation.py
@@ -0,0 +1,193 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+DeviceArray behavior and memory management tests.
+
+These tests validate DeviceArray shape, memory lifetime, and device memory
+operations. Inspired by CPU array tests, these verify CUDA-specific device
+array behaviors and constraints.
+"""
+
+import numpy as np
+import pytest
+
+from numba import cuda
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+
+
+@cuda.jit
+def set_val_kernel(a, v):
+    i = cuda.grid(1)
+    if i < a.size:
+        a[i] = v
+
+
+class TestDeviceArrayManipulation(CUDATestCase):
+    """Tests for DeviceArray memory behavior and shape semantics."""
+
+    def test_memory_lifetime(self):
+        """Verify device array data persists correctly across multiple
+        kernel launches."""
+        n = 32
+        arr = cuda.device_array(n, dtype=np.float32)
+
+        threads = 16
+        blocks = (n + threads - 1) // threads
+        set_val_kernel[blocks, threads](arr, 1.5)
+
+        # First kernel wrote 1.5
+        host = arr.copy_to_host()
+        np.testing.assert_allclose(host, np.full(n, 1.5))
+
+        # Second kernel overwrites with 2.5 on the same array
+        set_val_kernel[blocks, threads](arr, 2.5)
+        host = arr.copy_to_host()
+        np.testing.assert_allclose(host, np.full(n, 2.5))
+
+    @skip_on_cudasim("Simulator does not model device pointers")
+    def test_memory_pointer_stability(self):
+        """Verify the device pointer is unchanged after kernel launches,
+        ensuring the same allocation is reused."""
+        n = 32
+        arr = cuda.device_array(n, dtype=np.float32)
+        ptr_before = arr.gpu_data.device_ctypes_pointer.value
+
+        threads = 16
+        blocks = (n + threads - 1) // threads
+        set_val_kernel[blocks, threads](arr, 1.5)
+        ptr_after_first = arr.gpu_data.device_ctypes_pointer.value
+        self.assertEqual(ptr_before, ptr_after_first)
+
+        set_val_kernel[blocks, threads](arr, 2.5)
+        ptr_after_second = arr.gpu_data.device_ctypes_pointer.value
+        self.assertEqual(ptr_before, ptr_after_second)
+
+        host = arr.copy_to_host()
+        np.testing.assert_allclose(host, np.full(n, 2.5))
+
+    def test_device_array_shape(self):
+        """Verify DeviceArray shape attribute is correct."""
+        shape = (10, 20)
+        arr = cuda.device_array(shape, dtype=np.float32)
+        self.assertEqual(arr.shape, shape)
+
+    def test_device_array_dtype(self):
+        """Verify DeviceArray dtype attribute is correct."""
+        dtype = np.int32
+        arr = cuda.device_array(10, dtype=dtype)
+        self.assertEqual(arr.dtype, dtype)
+
+    def test_device_array_size(self):
+        """Verify DeviceArray size attribute is correct."""
+        shape = (5, 8)
+        arr = cuda.device_array(shape, dtype=np.float32)
+        self.assertEqual(arr.size, 40)
+
+    def test_device_array_ndim(self):
+        """Verify DeviceArray ndim attribute is correct."""
+        arr1d = cuda.device_array(10, dtype=np.float32)
+        arr2d = cuda.device_array((5, 8), dtype=np.float32)
+        arr3d = cuda.device_array((2, 3, 4), dtype=np.float32)
+        self.assertEqual(arr1d.ndim, 1)
+        self.assertEqual(arr2d.ndim, 2)
+        self.assertEqual(arr3d.ndim, 3)
+
+    def test_device_array_like(self):
+        """Verify device_array_like creates array with matching shape and dtype."""
+        host = np.arange(20, dtype=np.int32).reshape(4, 5)
+        dev = cuda.device_array_like(host)
+        self.assertEqual(dev.shape, host.shape)
+        self.assertEqual(dev.dtype, host.dtype)
+
+    def test_to_device_copy_to_host_roundtrip(self):
+        """Verify data integrity in to_device -> copy_to_host roundtrip."""
+        host_orig = np.arange(100, dtype=np.float32)
+        dev = cuda.to_device(host_orig)
+        host_copy = dev.copy_to_host()
+        np.testing.assert_array_equal(host_orig, host_copy)
+
+    def test_copy_to_device_existing_array(self):
+        """Verify copy_to_device into pre-allocated device array."""
+        host = np.arange(50, dtype=np.int32)
+        dev = cuda.device_array(50, dtype=np.int32)
+        dev.copy_to_device(host)
+        result = dev.copy_to_host()
+        np.testing.assert_array_equal(host, result)
+
+    def test_device_to_device_copy(self):
+        """Verify device-to-device memory copy."""
+        n = 32
+        host = np.arange(n, dtype=np.float32)
+        dev1 = cuda.to_device(host)
+        dev2 = cuda.device_array(n, dtype=np.float32)
+        dev2.copy_to_device(dev1)
+        result = dev2.copy_to_host()
+        np.testing.assert_array_equal(host, result)
+
+    def test_multidimensional_shape_consistency(self):
+        """Verify shape consistency for multidimensional arrays."""
+        shapes = [(10,), (5, 8), (3, 4, 5), (2, 3, 4, 5)]
+        for shape in shapes:
+            arr = cuda.device_array(shape, dtype=np.float32)
+            self.assertEqual(arr.shape, shape)
+            self.assertEqual(arr.ndim, len(shape))
+            self.assertEqual(arr.size, np.prod(shape))
+
+    def test_reshape_device_array(self):
+        """Verify reshape on a DeviceArray preserves data and changes shape."""
+        host = np.arange(32, dtype=np.int32)
+        arr = cuda.to_device(host)
+        reshaped = arr.reshape((4, 8))
+        self.assertEqual(reshaped.shape, (4, 8))
+        np.testing.assert_array_equal(reshaped.copy_to_host(), host.reshape((4, 8)))
+
+    def test_view_device_array(self):
+        """Verify view on a DeviceArray reinterprets dtype and preserves data."""
+        host = np.arange(16, dtype=np.int32)
+        arr = cuda.to_device(host)
+        viewed = arr.view(np.float32)
+        self.assertEqual(viewed.dtype, np.float32)
+        self.assertEqual(viewed.shape, (16,))
+        np.testing.assert_array_equal(
+            viewed.copy_to_host().view(np.int32), host
+        )
+
+    def test_ravel_device_array(self):
+        """Verify ravel on a DeviceArray returns a 1D array with correct data."""
+        host = np.arange(32, dtype=np.int32).reshape(4, 8)
+        arr = cuda.to_device(host)
+        raveled = arr.ravel()
+        self.assertEqual(raveled.ndim, 1)
+        self.assertEqual(raveled.shape, (32,))
+        np.testing.assert_array_equal(raveled.copy_to_host(), host.ravel())
+
+    def test_advanced_slicing(self):
+        """Verify step-slicing on a device array returns the correct elements."""
+        host = np.arange(10, dtype=np.int32)
+        arr = cuda.to_device(host)
+        result = arr[::2]
+        np.testing.assert_array_equal(result.copy_to_host(), host[::2])
+
+    def test_multidimensional_slicing(self):
+        """Verify multidimensional slicing on a device array returns the correct elements."""
+        host = np.arange(16, dtype=np.int32).reshape(4, 4)
+        arr = cuda.to_device(host)
+        result = arr[:, ::-1]
+        np.testing.assert_array_equal(result.copy_to_host(), host[:, ::-1])
+
+    @pytest.mark.xfail(reason="Boolean indexing is not supported on CUDA device arrays")
+    def test_boolean_indexing(self):
+        """Boolean mask indexing is not supported on device arrays."""
+        host = np.arange(10, dtype=np.int32)
+        arr = cuda.to_device(host)
+        mask = np.array([True, False] * 5)
+        arr[mask]
+
+    @pytest.mark.xfail(reason="Fancy (integer array) indexing is not supported on CUDA device arrays")
+    def test_fancy_indexing(self):
+        """Fancy indexing with integer arrays is not supported on device arrays."""
+        host = np.arange(10, dtype=np.int32)
+        arr = cuda.to_device(host)
+        idx = np.array([1, 3, 5])
+        arr[idx]
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_manipulation.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_manipulation.py
@@ -0,0 +1,173 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+CUDA array manipulation tests using kernels and Python CUDA semantics.
+
+These tests validate array operations, indexing, and element-wise operations
+in CUDA kernels. They are inspired by CPU array manipulation test suites and
+adapted to verify CUDA device semantics.
+"""
+
+import numpy as np
+
+from numba import cuda
+from numba.cuda.testing import CUDATestCase
+
+
+@cuda.jit
+def fill_kernel(arr, value):
+    i = cuda.grid(1)
+    if i < arr.size:
+        arr[i] = value
+
+
+@cuda.jit
+def add_kernel(a, b, out):
+    i = cuda.grid(1)
+    if i < out.size:
+        out[i] = a[i] + b[i]
+
+
+@cuda.jit
+def set_index(a):
+    a[3] = 42
+
+
+@cuda.jit
+def cast_int_to_float(src, dst):
+    i = cuda.grid(1)
+    if i < src.size:
+        dst[i] = src[i]
+
+
+@cuda.jit
+def cast_float_to_int(src, dst):
+    i = cuda.grid(1)
+    if i < src.size:
+        dst[i] = int(src[i])
+
+
+@cuda.jit
+def add_mixed(a, b, out):
+    i = cuda.grid(1)
+    if i < out.size:
+        out[i] = a[i] + b[i]
+
+
+class TestArrayManipulation(CUDATestCase):
+    """Tests for array manipulation operations in CUDA kernels."""
+
+    def test_fill_basic(self):
+        """Basic elementwise fill operation"""
+        n = 128
+        arr = cuda.device_array(n, dtype=np.float32)
+
+        threads = 64
+        blocks = (n + threads - 1) // threads
+        fill_kernel[blocks, threads](arr, 4.25)
+
+        result = arr.copy_to_host()
+        np.testing.assert_array_equal(result, np.full(n, 4.25))
+
+    def test_elementwise_add(self):
+        """Elementwise addition in a CUDA kernel."""
+        n = 256
+        a = np.arange(n, dtype=np.float32)
+        b = np.ones(n, dtype=np.float32)
+
+        da = cuda.to_device(a)
+        db = cuda.to_device(b)
+        dout = cuda.device_array_like(a)
+
+        threads = 64
+        blocks = (n + threads - 1) // threads
+        add_kernel[blocks, threads](da, db, dout)
+
+        result = dout.copy_to_host()
+        np.testing.assert_allclose(result, a + b)
+
+    def test_integer_getitem_setitem(self):
+        """Direct integer indexing in a kernel."""
+        arr = cuda.device_array(10, dtype=np.int32)
+        set_index[1, 1](arr)
+        host = arr.copy_to_host()
+        self.assertEqual(host[3], 42)
+
+    def test_multidimensional_indexing(self):
+        """2D indexing with a grid-based kernel."""
+        shape = (8, 8)
+        host = np.zeros(shape, dtype=np.int32)
+        dev = cuda.to_device(host)
+
+        @cuda.jit
+        def write_2d(a):
+            x, y = cuda.grid(2)
+            if x < a.shape[0] and y < a.shape[1]:
+                a[x, y] = x * a.shape[1] + y
+
+        threads = (4, 4)
+        blocks = ((shape[0] + threads[0] - 1) // threads[0],
+                  (shape[1] + threads[1] - 1) // threads[1])
+
+        write_2d[blocks, threads](dev)
+        result = dev.copy_to_host()
+        self.assertEqual(result[2, 3], 2 * shape[1] + 3)
+
+    def test_dtype_transitions(self):
+        """Type casting and mixed-dtype operations in CUDA kernels."""
+        n = 10
+        a = np.arange(n, dtype=np.int32)
+        b = np.arange(n, dtype=np.float32)
+        da = cuda.to_device(a)
+        db = cuda.to_device(b)
+        dout1 = cuda.device_array(n, dtype=np.float32)
+        dout2 = cuda.device_array(n, dtype=np.int32)
+
+        threads = 32
+        blocks = (n + threads - 1) // threads
+        cast_int_to_float[blocks, threads](da, dout1)
+        cast_float_to_int[blocks, threads](db, dout2)
+
+        np.testing.assert_allclose(dout1.copy_to_host(), a.astype(np.float32))
+        np.testing.assert_array_equal(dout2.copy_to_host(), b.astype(np.int32))
+
+        # Mixed dtype elementwise op
+        dout3 = cuda.device_array(n, dtype=np.float32)
+        add_mixed[blocks, threads](da, db, dout3)
+        np.testing.assert_allclose(dout3.copy_to_host(), a + b)
+
+    def test_shape_semantics_in_kernel(self):
+        """Shape attributes are accessible inside a kernel."""
+        shape = (5, 7)
+        arr = cuda.device_array(shape, dtype=np.int32)
+
+        @cuda.jit
+        def check_shape(a, out):
+            if cuda.threadIdx.x == 0 and cuda.blockIdx.x == 0:
+                out[0] = a.shape[0]
+                out[1] = a.shape[1]
+
+        out = cuda.device_array(2, dtype=np.int32)
+        check_shape[1, 1](arr, out)
+        host = out.copy_to_host()
+        self.assertEqual(tuple(host), shape)
+
+    def test_grid_stride_correctness(self):
+        """Grid-stride loop pattern."""
+        n = 10
+        arr = cuda.device_array(n, dtype=np.int32)
+
+        @cuda.jit
+        def grid_stride_kernel(a):
+            for i in range(cuda.grid(1), a.size, cuda.gridsize(1)):
+                a[i] = i
+
+        # Intentionally over-provisioned: more threads than elements
+        # to verify the grid-stride loop handles the wrap-around.
+        threads = 32
+        blocks = 2
+        grid_stride_kernel[blocks, threads](arr)
+        host = arr.copy_to_host()
+        np.testing.assert_array_equal(host, np.arange(n))
+