Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 193 additions & 0 deletions numba_cuda/numba/cuda/tests/cudadrv/test_device_array_manipulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-2-Clause

"""
Comment thread
CodersAcademy006 marked this conversation as resolved.
DeviceArray behavior and memory management tests.

These tests validate DeviceArray shape, memory lifetime, and device memory
operations. Inspired by CPU array tests, these verify CUDA-specific device
array behaviors and constraints.
"""

import numpy as np
import pytest

from numba import cuda
from numba.cuda.testing import CUDATestCase, skip_on_cudasim


@cuda.jit
def set_val_kernel(a, v):
i = cuda.grid(1)
if i < a.size:
a[i] = v


class TestDeviceArrayManipulation(CUDATestCase):
"""Tests for DeviceArray memory behavior and shape semantics."""

def test_memory_lifetime(self):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if I were aiming to verify the lifetime of the memory, I'd probably check the pointer is unchanged after each kernel as well. I can imagine this would still pass if the second kernel operated on some different memory to the first.

You can get the pointer value with arr.gpu_data.device_ctypes_pointer.value.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point — I've added the pointer check as a separate test, test_memory_pointer_stability, which captures arr.gpu_data.device_ctypes_pointer.value before the first kernel launch and asserts it's unchanged after each subsequent launch. I kept test_memory_lifetime focused on value correctness and put the allocation-identity check in its own test so the failure modes are distinct — a value failure vs. pointer drift point to different bugs. test_memory_pointer_stability is decorated @skip_on_cudasim since the simulator doesn't model real device pointers, as you noted earlier.

"""Verify device array data persists correctly across multiple
kernel launches."""
n = 32
arr = cuda.device_array(n, dtype=np.float32)

threads = 16
blocks = (n + threads - 1) // threads
set_val_kernel[blocks, threads](arr, 1.5)

# First kernel wrote 1.5
host = arr.copy_to_host()
np.testing.assert_allclose(host, np.full(n, 1.5))

# Second kernel overwrites with 2.5 on the same array
set_val_kernel[blocks, threads](arr, 2.5)
host = arr.copy_to_host()
np.testing.assert_allclose(host, np.full(n, 2.5))

@skip_on_cudasim("Simulator does not model device pointers")
def test_memory_pointer_stability(self):
"""Verify the device pointer is unchanged after kernel launches,
ensuring the same allocation is reused."""
n = 32
arr = cuda.device_array(n, dtype=np.float32)
ptr_before = arr.gpu_data.device_ctypes_pointer.value

threads = 16
blocks = (n + threads - 1) // threads
set_val_kernel[blocks, threads](arr, 1.5)
ptr_after_first = arr.gpu_data.device_ctypes_pointer.value
self.assertEqual(ptr_before, ptr_after_first)

set_val_kernel[blocks, threads](arr, 2.5)
ptr_after_second = arr.gpu_data.device_ctypes_pointer.value
self.assertEqual(ptr_before, ptr_after_second)

host = arr.copy_to_host()
np.testing.assert_allclose(host, np.full(n, 2.5))

def test_device_array_shape(self):
"""Verify DeviceArray shape attribute is correct."""
shape = (10, 20)
arr = cuda.device_array(shape, dtype=np.float32)
self.assertEqual(arr.shape, shape)

def test_device_array_dtype(self):
"""Verify DeviceArray dtype attribute is correct."""
dtype = np.int32
arr = cuda.device_array(10, dtype=dtype)
self.assertEqual(arr.dtype, dtype)

def test_device_array_size(self):
"""Verify DeviceArray size attribute is correct."""
shape = (5, 8)
arr = cuda.device_array(shape, dtype=np.float32)
self.assertEqual(arr.size, 40)

def test_device_array_ndim(self):
"""Verify DeviceArray ndim attribute is correct."""
arr1d = cuda.device_array(10, dtype=np.float32)
arr2d = cuda.device_array((5, 8), dtype=np.float32)
arr3d = cuda.device_array((2, 3, 4), dtype=np.float32)
self.assertEqual(arr1d.ndim, 1)
self.assertEqual(arr2d.ndim, 2)
self.assertEqual(arr3d.ndim, 3)

def test_device_array_like(self):
"""Verify device_array_like creates array with matching shape and dtype."""
host = np.arange(20, dtype=np.int32).reshape(4, 5)
dev = cuda.device_array_like(host)
self.assertEqual(dev.shape, host.shape)
self.assertEqual(dev.dtype, host.dtype)

def test_to_device_copy_to_host_roundtrip(self):
"""Verify data integrity in to_device -> copy_to_host roundtrip."""
host_orig = np.arange(100, dtype=np.float32)
dev = cuda.to_device(host_orig)
host_copy = dev.copy_to_host()
np.testing.assert_array_equal(host_orig, host_copy)

def test_copy_to_device_existing_array(self):
"""Verify copy_to_device into pre-allocated device array."""
host = np.arange(50, dtype=np.int32)
dev = cuda.device_array(50, dtype=np.int32)
dev.copy_to_device(host)
result = dev.copy_to_host()
np.testing.assert_array_equal(host, result)

def test_device_to_device_copy(self):
"""Verify device-to-device memory copy."""
n = 32
host = np.arange(n, dtype=np.float32)
dev1 = cuda.to_device(host)
dev2 = cuda.device_array(n, dtype=np.float32)
dev2.copy_to_device(dev1)
result = dev2.copy_to_host()
np.testing.assert_array_equal(host, result)

def test_multidimensional_shape_consistency(self):
"""Verify shape consistency for multidimensional arrays."""
shapes = [(10,), (5, 8), (3, 4, 5), (2, 3, 4, 5)]
for shape in shapes:
arr = cuda.device_array(shape, dtype=np.float32)
self.assertEqual(arr.shape, shape)
self.assertEqual(arr.ndim, len(shape))
self.assertEqual(arr.size, np.prod(shape))

def test_reshape_device_array(self):
"""Verify reshape on a DeviceArray preserves data and changes shape."""
host = np.arange(32, dtype=np.int32)
arr = cuda.to_device(host)
reshaped = arr.reshape((4, 8))
self.assertEqual(reshaped.shape, (4, 8))
np.testing.assert_array_equal(reshaped.copy_to_host(), host.reshape((4, 8)))

def test_view_device_array(self):
"""Verify view on a DeviceArray reinterprets dtype and preserves data."""
host = np.arange(16, dtype=np.int32)
arr = cuda.to_device(host)
viewed = arr.view(np.float32)
self.assertEqual(viewed.dtype, np.float32)
self.assertEqual(viewed.shape, (16,))
np.testing.assert_array_equal(
viewed.copy_to_host().view(np.int32), host
)

def test_ravel_device_array(self):
"""Verify ravel on a DeviceArray returns a 1D array with correct data."""
host = np.arange(32, dtype=np.int32).reshape(4, 8)
arr = cuda.to_device(host)
raveled = arr.ravel()
self.assertEqual(raveled.ndim, 1)
self.assertEqual(raveled.shape, (32,))
np.testing.assert_array_equal(raveled.copy_to_host(), host.ravel())

def test_advanced_slicing(self):
"""Verify step-slicing on a device array returns the correct elements."""
host = np.arange(10, dtype=np.int32)
arr = cuda.to_device(host)
result = arr[::2]
np.testing.assert_array_equal(result.copy_to_host(), host[::2])

def test_multidimensional_slicing(self):
"""Verify multidimensional slicing on a device array returns the correct elements."""
host = np.arange(16, dtype=np.int32).reshape(4, 4)
arr = cuda.to_device(host)
result = arr[:, ::-1]
np.testing.assert_array_equal(result.copy_to_host(), host[:, ::-1])

@pytest.mark.xfail(reason="Boolean indexing is not supported on CUDA device arrays")
def test_boolean_indexing(self):
"""Boolean mask indexing is not supported on device arrays."""
host = np.arange(10, dtype=np.int32)
arr = cuda.to_device(host)
mask = np.array([True, False] * 5)
arr[mask]

@pytest.mark.xfail(reason="Fancy (integer array) indexing is not supported on CUDA device arrays")
def test_fancy_indexing(self):
"""Fancy indexing with integer arrays is not supported on device arrays."""
host = np.arange(10, dtype=np.int32)
arr = cuda.to_device(host)
idx = np.array([1, 3, 5])
arr[idx]
173 changes: 173 additions & 0 deletions numba_cuda/numba/cuda/tests/cudapy/test_array_manipulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-2-Clause

"""
Comment thread
CodersAcademy006 marked this conversation as resolved.
CUDA array manipulation tests using kernels and Python CUDA semantics.

These tests validate array operations, indexing, and element-wise operations
in CUDA kernels. They are inspired by CPU array manipulation test suites and
adapted to verify CUDA device semantics.
"""

import numpy as np

from numba import cuda
from numba.cuda.testing import CUDATestCase


@cuda.jit
def fill_kernel(arr, value):
i = cuda.grid(1)
if i < arr.size:
arr[i] = value


@cuda.jit
def add_kernel(a, b, out):
i = cuda.grid(1)
if i < out.size:
out[i] = a[i] + b[i]


@cuda.jit
def set_index(a):
a[3] = 42


@cuda.jit
def cast_int_to_float(src, dst):
i = cuda.grid(1)
if i < src.size:
dst[i] = src[i]


@cuda.jit
def cast_float_to_int(src, dst):
i = cuda.grid(1)
if i < src.size:
dst[i] = int(src[i])


@cuda.jit
def add_mixed(a, b, out):
i = cuda.grid(1)
if i < out.size:
out[i] = a[i] + b[i]


class TestArrayManipulation(CUDATestCase):
"""Tests for array manipulation operations in CUDA kernels."""

def test_fill_basic(self):
"""Basic elementwise fill operation"""
n = 128
arr = cuda.device_array(n, dtype=np.float32)

threads = 64
blocks = (n + threads - 1) // threads
fill_kernel[blocks, threads](arr, 4.25)

result = arr.copy_to_host()
np.testing.assert_array_equal(result, np.full(n, 4.25))

def test_elementwise_add(self):
"""Elementwise addition in a CUDA kernel."""
n = 256
a = np.arange(n, dtype=np.float32)
b = np.ones(n, dtype=np.float32)

da = cuda.to_device(a)
db = cuda.to_device(b)
dout = cuda.device_array_like(a)

threads = 64
blocks = (n + threads - 1) // threads
add_kernel[blocks, threads](da, db, dout)

result = dout.copy_to_host()
np.testing.assert_allclose(result, a + b)

def test_integer_getitem_setitem(self):
"""Direct integer indexing in a kernel."""
arr = cuda.device_array(10, dtype=np.int32)
set_index[1, 1](arr)
host = arr.copy_to_host()
self.assertEqual(host[3], 42)

def test_multidimensional_indexing(self):
"""2D indexing with a grid-based kernel."""
shape = (8, 8)
host = np.zeros(shape, dtype=np.int32)
dev = cuda.to_device(host)

@cuda.jit
def write_2d(a):
x, y = cuda.grid(2)
if x < a.shape[0] and y < a.shape[1]:
a[x, y] = x * a.shape[1] + y

threads = (4, 4)
blocks = ((shape[0] + threads[0] - 1) // threads[0],
(shape[1] + threads[1] - 1) // threads[1])

write_2d[blocks, threads](dev)
result = dev.copy_to_host()
self.assertEqual(result[2, 3], 2 * shape[1] + 3)

def test_dtype_transitions(self):
"""Type casting and mixed-dtype operations in CUDA kernels."""
n = 10
a = np.arange(n, dtype=np.int32)
b = np.arange(n, dtype=np.float32)
da = cuda.to_device(a)
db = cuda.to_device(b)
dout1 = cuda.device_array(n, dtype=np.float32)
dout2 = cuda.device_array(n, dtype=np.int32)

threads = 32
blocks = (n + threads - 1) // threads
cast_int_to_float[blocks, threads](da, dout1)
cast_float_to_int[blocks, threads](db, dout2)

np.testing.assert_allclose(dout1.copy_to_host(), a.astype(np.float32))
np.testing.assert_array_equal(dout2.copy_to_host(), b.astype(np.int32))

# Mixed dtype elementwise op
dout3 = cuda.device_array(n, dtype=np.float32)
add_mixed[blocks, threads](da, db, dout3)
np.testing.assert_allclose(dout3.copy_to_host(), a + b)

def test_shape_semantics_in_kernel(self):
"""Shape attributes are accessible inside a kernel."""
shape = (5, 7)
arr = cuda.device_array(shape, dtype=np.int32)

@cuda.jit
def check_shape(a, out):
if cuda.threadIdx.x == 0 and cuda.blockIdx.x == 0:
out[0] = a.shape[0]
out[1] = a.shape[1]

out = cuda.device_array(2, dtype=np.int32)
check_shape[1, 1](arr, out)
host = out.copy_to_host()
self.assertEqual(tuple(host), shape)

def test_grid_stride_correctness(self):
"""Grid-stride loop pattern."""
n = 10
arr = cuda.device_array(n, dtype=np.int32)

@cuda.jit
def grid_stride_kernel(a):
for i in range(cuda.grid(1), a.size, cuda.gridsize(1)):
a[i] = i

# Intentionally over-provisioned: more threads than elements
# to verify the grid-stride loop handles the wrap-around.
threads = 32
blocks = 2
grid_stride_kernel[blocks, threads](arr)
host = arr.copy_to_host()
np.testing.assert_array_equal(host, np.arange(n))