diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py b/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py
index eb9d74104..ca9467769 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 import numpy as np
-from numba.cuda.testing import unittest, CUDATestCase
+from numba.cuda.testing import CUDATestCase
 from numba import cuda
 
 
@@ -13,15 +13,10 @@ def boolean_func(A, vertial):
         A[0] = 321
 
 
-class TestCudaBoolean(CUDATestCase):
-    def test_boolean(self):
-        func = cuda.jit("void(float64[:], bool_)")(boolean_func)
-        A = np.array([0], dtype="float64")
-        func[1, 1](A, True)
-        self.assertTrue(A[0] == 123)
-        func[1, 1](A, False)
-        self.assertTrue(A[0] == 321)
-
-
-if __name__ == "__main__":
-    unittest.main()
+def test_boolean():
+    # Exercise both boolean branches through one explicitly-typed kernel.
+    kernel = cuda.jit("void(float64[:], bool_)")(boolean_func)
+    buf = np.array([0], dtype="float64")
+    for flag, expected in ((True, 123), (False, 321)):
+        kernel[1, 1](buf, flag)
+        assert buf[0] == expected
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py b/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py
index 26f2ab830..b7d1ee44f 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py
@@ -2,10 +2,11 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 import numpy as np
+import pytest
 
 from numba import cuda
 from numba.cuda import float64
-from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
+from numba.cuda.testing import skip_on_cudasim
 
 
 def builtin_max(A, B, C):
@@ -26,53 +27,98 @@ def builtin_min(A, B, C):
     C[i] = float64(min(A[i], B[i]))
 
 
-@skip_on_cudasim("Tests PTX emission")
-class TestCudaMinMax(CUDATestCase):
-    def _run(
-        self,
-        kernel,
-        numpy_equivalent,
-        ptx_instruction,
-        dtype_left,
-        dtype_right,
-        n=5,
-    ):
-        kernel = cuda.jit(kernel)
-
-        c = np.zeros(n, dtype=np.float64)
-        a = np.arange(n, dtype=dtype_left) + 0.5
-        b = np.full(n, fill_value=2, dtype=dtype_right)
-
-        kernel[1, c.shape](a, b, c)
-        np.testing.assert_allclose(c, numpy_equivalent(a, b))
-
-        ptx = next(p for p in kernel.inspect_asm().values())
-        self.assertIn(ptx_instruction, ptx)
-
-    def test_max_f8f8(self):
-        self._run(builtin_max, np.maximum, "max.f64", np.float64, np.float64)
+def _run(
+    kernel,  # plain Python function; compiled with cuda.jit below
+    numpy_equivalent,  # host reference, called as numpy_equivalent(a, b)
+    ptx_instruction,  # substring that must appear in the generated PTX
+    dtype_left,  # dtype of operand array `a`
+    dtype_right,  # dtype of operand array `b`
+    n=5,  # element count; also the thread-block size of the launch
+):
+    kernel = cuda.jit(kernel)
 
-    def test_max_f4f8(self):
-        self._run(builtin_max, np.maximum, "max.f64", np.float32, np.float64)
+    c = np.zeros(n, dtype=np.float64)  # output buffer is always float64
+    a = np.arange(n, dtype=dtype_left) + 0.5
+    b = np.full(n, fill_value=2, dtype=dtype_right)
 
-    def test_max_f8f4(self):
-        self._run(builtin_max, np.maximum, "max.f64", np.float64, np.float32)
+    kernel[1, c.shape](a, b, c)
+    np.testing.assert_allclose(c, numpy_equivalent(a, b))
 
-    def test_max_f4f4(self):
-        self._run(builtin_max, np.maximum, "max.f32", np.float32, np.float32)
+    ptx = next(p for p in kernel.inspect_asm().values())  # first entry only
+    assert ptx_instruction in ptx
 
-    def test_min_f8f8(self):
-        self._run(builtin_min, np.minimum, "min.f64", np.float64, np.float64)
 
-    def test_min_f4f8(self):
-        self._run(builtin_min, np.minimum, "min.f64", np.float32, np.float64)
-
-    def test_min_f8f4(self):
-        self._run(builtin_min, np.minimum, "min.f64", np.float64, np.float32)
-
-    def test_min_f4f4(self):
-        self._run(builtin_min, np.minimum, "min.f32", np.float32, np.float32)
-
-
-if __name__ == "__main__":
-    unittest.main()
+@skip_on_cudasim("Tests PTX emission")
+@pytest.mark.parametrize(  # one case per (builtin, dtype_left, dtype_right)
+    "kernel,numpy_equivalent,ptx_instruction,dtype_left,dtype_right",
+    [
+        pytest.param(
+            builtin_max,
+            np.maximum,
+            "max.f64",
+            np.float64,
+            np.float64,
+            id="max_f8f8",
+        ),
+        pytest.param(
+            builtin_max,
+            np.maximum,
+            "max.f64",  # mixed f4/f8 operands expect the f64 form
+            np.float32,
+            np.float64,
+            id="max_f4f8",
+        ),
+        pytest.param(
+            builtin_max,
+            np.maximum,
+            "max.f64",
+            np.float64,
+            np.float32,
+            id="max_f8f4",
+        ),
+        pytest.param(
+            builtin_max,
+            np.maximum,
+            "max.f32",  # only all-float32 operands expect the f32 form
+            np.float32,
+            np.float32,
+            id="max_f4f4",
+        ),
+        pytest.param(
+            builtin_min,
+            np.minimum,
+            "min.f64",
+            np.float64,
+            np.float64,
+            id="min_f8f8",
+        ),
+        pytest.param(
+            builtin_min,
+            np.minimum,
+            "min.f64",  # mixed f4/f8 operands expect the f64 form
+            np.float32,
+            np.float64,
+            id="min_f4f8",
+        ),
+        pytest.param(
+            builtin_min,
+            np.minimum,
+            "min.f64",
+            np.float64,
+            np.float32,
+            id="min_f8f4",
+        ),
+        pytest.param(
+            builtin_min,
+            np.minimum,
+            "min.f32",  # only all-float32 operands expect the f32 form
+            np.float32,
+            np.float32,
+            id="min_f4f4",
+        ),
+    ],
+)
+def test_minmax(
+    kernel, numpy_equivalent, ptx_instruction, dtype_left, dtype_right
+):
+    _run(kernel, numpy_equivalent, ptx_instruction, dtype_left, dtype_right)
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py b/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py
index 10df01fc0..ec141fac8 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py
@@ -1,145 +1,143 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
-from numba import cuda
-import numpy as np
-from numba.cuda.testing import skip_on_cudasim, CUDATestCase
 import threading
-import unittest
 
+import numpy as np
+import pytest
 
-class TestMultiGPUContext(CUDATestCase):
-    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
-    def test_multigpu_context(self):
-        @cuda.jit("void(float64[:], float64[:])")
-        def copy_plus_1(inp, out):
-            i = cuda.grid(1)
-            if i < out.size:
-                out[i] = inp[i] + 1
+from numba import cuda
+from numba.cuda.testing import skip_on_cudasim
 
-        def check(inp, out):
-            np.testing.assert_equal(inp + 1, out)
 
-        N = 32
-        A = np.arange(N, dtype=np.float64)
-        B = np.arange(N, dtype=np.float64)
+@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus")
+def test_multigpu_context():
+    @cuda.jit("void(float64[:], float64[:])")
+    def copy_plus_1(inp, out):
+        i = cuda.grid(1)
+        if i < out.size:  # guard threads beyond the end of the array
+            out[i] = inp[i] + 1
 
-        with cuda.gpus[0]:
-            copy_plus_1[1, N](A, B)
+    def check(inp, out):  # host-side check that out == inp + 1
+        np.testing.assert_equal(inp + 1, out)
 
-        check(A, B)
+    N = 32
+    A = np.arange(N, dtype=np.float64)
+    B = np.arange(N, dtype=np.float64)
 
+    with cuda.gpus[0]:  # launch with GPU 0 selected explicitly
         copy_plus_1[1, N](A, B)
-        check(A, B)
 
-        with cuda.gpus[0]:
-            A0 = np.arange(N, dtype=np.float64)
-            B0 = np.arange(N, dtype=np.float64)
-            copy_plus_1[1, N](A0, B0)
+    check(A, B)
 
-            with cuda.gpus[1]:
-                A1 = np.arange(N, dtype=np.float64)
-                B1 = np.arange(N, dtype=np.float64)
-                copy_plus_1[1, N](A1, B1)
+    copy_plus_1[1, N](A, B)  # same launch without an explicit GPU context
+    check(A, B)
 
-        check(A0, B0)
-        check(A1, B1)
-
-        A = np.arange(N, dtype=np.float64)
-        B = np.arange(N, dtype=np.float64)
-        copy_plus_1[1, N](A, B)
-        check(A, B)
-
-    @skip_on_cudasim("Simulator does not support multiple threads")
-    def test_multithreaded(self):
-        def work(gpu, dA, results, ridx):
-            try:
-                with gpu:
-                    arr = dA.copy_to_host()
-
-            except Exception as e:
-                results[ridx] = e
-
-            else:
-                results[ridx] = np.all(arr == np.arange(10))
-
-        dA = cuda.to_device(np.arange(10))
-
-        nthreads = 10
-        results = [None] * nthreads
-        threads = [
-            threading.Thread(
-                target=work, args=(cuda.gpus.current, dA, results, i)
-            )
-            for i in range(nthreads)
-        ]
-        for th in threads:
-            th.start()
-
-        for th in threads:
-            th.join()
-
-        for r in results:
-            if isinstance(r, BaseException):
-                raise r
-            else:
-                self.assertTrue(r)
-
-    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
-    def test_with_context(self):
-        @cuda.jit
-        def vector_add_scalar(arr, val):
-            i = cuda.grid(1)
-            if i < arr.size:
-                arr[i] += val
-
-        hostarr = np.arange(10, dtype=np.float32)
-        with cuda.gpus[0]:
-            arr1 = cuda.to_device(hostarr)
+    with cuda.gpus[0]:  # nested case: GPU 1 is entered inside GPU 0
+        A0 = np.arange(N, dtype=np.float64)
+        B0 = np.arange(N, dtype=np.float64)
+        copy_plus_1[1, N](A0, B0)
 
         with cuda.gpus[1]:
-            arr2 = cuda.to_device(hostarr)
+            A1 = np.arange(N, dtype=np.float64)
+            B1 = np.arange(N, dtype=np.float64)
+            copy_plus_1[1, N](A1, B1)
 
-        with cuda.gpus[0]:
-            vector_add_scalar[1, 10](arr1, 1)
+    check(A0, B0)
+    check(A1, B1)
 
-        with cuda.gpus[1]:
-            vector_add_scalar[1, 10](arr2, 2)
+    A = np.arange(N, dtype=np.float64)  # fresh arrays after both contexts exit
+    B = np.arange(N, dtype=np.float64)
+    copy_plus_1[1, N](A, B)
+    check(A, B)
 
-        with cuda.gpus[0]:
-            np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))
 
-        with cuda.gpus[1]:
-            np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))
-
-    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
-    def test_with_context_peer_copy(self):
-        # Peer access is not always possible - for example, with one GPU in TCC
-        # mode and one in WDDM - if that is the case, this test would fail so
-        # we need to skip it.
-        with cuda.gpus[0]:
-            ctx = cuda.current_context()
-            if not ctx.can_access_peer(1):
-                self.skipTest("Peer access between GPUs disabled")
-
-        # 1. Create a range in an array
-        hostarr = np.arange(10, dtype=np.float32)
-
-        # 2. Copy range array from host -> GPU 0
-        with cuda.gpus[0]:
-            arr1 = cuda.to_device(hostarr)
-
-        # 3. Initialize a zero-filled array on GPU 1
-        with cuda.gpus[1]:
-            arr2 = cuda.to_device(np.zeros_like(hostarr))
+@skip_on_cudasim("Simulator does not support multiple threads")
+def test_multithreaded():
+    # Ten host threads copy the same device array back concurrently; each
+    # slot of `outcomes` ends up holding a truth value or the exception.
+    def work(gpu, dA, results, ridx):
+        try:
+            with gpu:
+                host_copy = dA.copy_to_host()
+        except Exception as exc:
+            results[ridx] = exc
+        else:
+            results[ridx] = np.all(host_copy == np.arange(10))
+
+    nthreads = 10
+    device_arr = cuda.to_device(np.arange(10))
+    outcomes = [None] * nthreads
+
+    workers = []
+    for slot in range(nthreads):
+        worker_args = (cuda.gpus.current, device_arr, outcomes, slot)
+        workers.append(threading.Thread(target=work, args=worker_args))
+
+    for worker in workers:
+        worker.start()
+
+    for worker in workers:
+        worker.join()
+
+    # Re-raise worker failures in the main thread so the test reports them.
+    for outcome in outcomes:
+        if isinstance(outcome, BaseException):
+            raise outcome
+        assert outcome
+
+@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus")
+def test_with_context():
+    @cuda.jit
+    def vector_add_scalar(arr, val):
+        i = cuda.grid(1)
+        if i < arr.size:  # guard threads beyond the end of the array
+            arr[i] += val
+
+    hostarr = np.arange(10, dtype=np.float32)
+    with cuda.gpus[0]:  # allocate: one device copy per GPU
+        arr1 = cuda.to_device(hostarr)
+
+    with cuda.gpus[1]:
+        arr2 = cuda.to_device(hostarr)
+
+    with cuda.gpus[0]:  # launch: each GPU updates its own copy
+        vector_add_scalar[1, 10](arr1, 1)
+
+    with cuda.gpus[1]:
+        vector_add_scalar[1, 10](arr2, 2)
+
+    with cuda.gpus[0]:  # verify: read back under the GPU used to allocate
+        np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))
+
+    with cuda.gpus[1]:
+        np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))
+
+
+@pytest.mark.skipif(len(cuda.gpus) < 2, reason="need more than 1 gpus")
+def test_with_context_peer_copy():
+    # Peer access is not always possible - for example, with one GPU in TCC
+    # mode and one in WDDM - if that is the case, this test would fail so
+    # we need to skip it.
+    with cuda.gpus[0]:
+        ctx = cuda.current_context()  # context obtained while GPU 0 is active
+        if not ctx.can_access_peer(1):  # 1 = peer device (GPU 1), presumably
+            pytest.skip("Peer access between GPUs disabled")
+
+    # 1. Create a range in an array
+    hostarr = np.arange(10, dtype=np.float32)
 
-        with cuda.gpus[0]:
-            # 4. Copy range from GPU 0 -> GPU 1
-            arr2.copy_to_device(arr1)
+    # 2. Copy range array from host -> GPU 0
+    with cuda.gpus[0]:
+        arr1 = cuda.to_device(hostarr)
 
-            # 5. Copy range from GPU 1 -> host and check contents
-            np.testing.assert_equal(arr2.copy_to_host(), hostarr)
+    # 3. Initialize a zero-filled array on GPU 1
+    with cuda.gpus[1]:
+        arr2 = cuda.to_device(np.zeros_like(hostarr))
 
+    with cuda.gpus[0]:  # both steps below run with GPU 0 selected
+        # 4. Copy range from GPU 0 -> GPU 1
+        arr2.copy_to_device(arr1)
 
-if __name__ == "__main__":
-    unittest.main()
+        # 5. Copy range from GPU 1 -> host and check contents
+        np.testing.assert_equal(arr2.copy_to_host(), hostarr)
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_powi.py b/numba_cuda/numba/cuda/tests/cudapy/test_powi.py
index 4bf80bb1a..d99886744 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_powi.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_powi.py
@@ -5,7 +5,6 @@
 import numpy as np
 from numba import cuda
 from numba.cuda import float64, int8, int32, void
-from numba.cuda.testing import unittest, CUDATestCase
 
 
 def cu_mat_power(A, power, power_A):
@@ -54,7 +53,7 @@ def random_complex(N):
     return np.random.random(1) + np.random.random(1) * 1j
 
 
-class TestCudaPowi(CUDATestCase):
+class TestCudaPowi:
     def test_powi(self):
         dec = cuda.jit(void(float64[:, :], int8, float64[:, :]))
         kernel = dec(cu_mat_power)
@@ -63,7 +62,7 @@ def test_powi(self):
         A = np.arange(10, dtype=np.float64).reshape(2, 5)
         Aout = np.empty_like(A)
         kernel[1, A.shape](A, power, Aout)
-        self.assertTrue(np.allclose(Aout, A**power))
+        assert np.allclose(Aout, A**power)
 
     def test_powi_binop(self):
         dec = cuda.jit(void(float64[:, :], int8, float64[:, :]))
@@ -73,7 +72,7 @@ def test_powi_binop(self):
         A = np.arange(10, dtype=np.float64).reshape(2, 5)
         Aout = np.empty_like(A)
         kernel[1, A.shape](A, power, Aout)
-        self.assertTrue(np.allclose(Aout, A**power))
+        assert np.allclose(Aout, A**power)
 
     # Relative tolerance kwarg is provided because 1.0e-7 (the default for
     # assert_allclose) is a bit tight for single precision.
@@ -122,7 +121,3 @@ def test_cpow_complex64_inplace_binop(self):
 
     def test_cpow_complex128_inplace_binop(self):
         self._test_cpow_inplace_binop(np.complex128, rtol=3.0e-7)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_random.py b/numba_cuda/numba/cuda/tests/cudapy/test_random.py
index c99e29aa5..7d17627ec 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_random.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_random.py
@@ -4,12 +4,11 @@
 import math
 
 import numpy as np
+import pytest
 
 from numba import cuda
-from numba.cuda.testing import unittest
 from numba.cuda.testing import (
     skip_on_cudasim,
-    CUDATestCase,
     skip_on_standalone_numba_cuda,
 )
 
@@ -20,12 +19,32 @@
     xoroshiro128p_normal_float64,
 )
 
-
 # Distributions
 UNIFORM = 1
 NORMAL = 2
 
 
+def check_uniform(kernel_func, dtype):
+    # Moments of U(0, 1): mean 1/2, std 1/(2*sqrt(3)); buffer honours dtype.
+    states = cuda.random.create_xoroshiro128p_states(32 * 2, seed=1)
+    out = np.zeros(2 * 32 * 32, dtype=dtype)
+    kernel_func[2, 32](states, out, 32, UNIFORM)
+    assert out.max() == pytest.approx(1.0, abs=1e-3)
+    assert out.min() == pytest.approx(0.0, abs=1e-3)
+    assert out.mean() == pytest.approx(0.5, abs=1.5e-2)
+    assert out.std() == pytest.approx(1.0 / (2 * math.sqrt(3)), abs=6e-3)
+
+
+def check_normal(kernel_func, dtype):
+    # Standard-normal draws should have mean ~0 and standard deviation ~1.
+    n_samples = 2 * 32 * 32
+    samples = np.zeros(n_samples, dtype=dtype)
+    rng_states = cuda.random.create_xoroshiro128p_states(32 * 2, seed=1)
+    kernel_func[2, 32](rng_states, samples, 32, NORMAL)
+    assert samples.mean() == pytest.approx(0.0, abs=4e-3)
+    assert samples.std() == pytest.approx(1.0, abs=2e-3)
+
+
 @cuda.jit
 def rng_kernel_float32(states, out, count, distribution):
     thread_id = cuda.grid(1)
@@ -53,11 +72,11 @@ def rng_kernel_float64(states, out, count, distribution):
 
 
 @skip_on_standalone_numba_cuda
-class TestCudaRandomXoroshiro128p(CUDATestCase):
+class TestCudaRandomXoroshiro128p:
     def test_create(self):
         states = cuda.random.create_xoroshiro128p_states(10, seed=1)
         s = states.copy_to_host()
-        self.assertEqual(len(np.unique(s)), 10)
+        assert len(np.unique(s)) == 10
 
     def test_create_subsequence_start(self):
         states = cuda.random.create_xoroshiro128p_states(10, seed=1)
@@ -77,41 +96,18 @@ def test_create_stream(self):
             10, seed=1, stream=stream
         )
         s = states.copy_to_host()
-        self.assertEqual(len(np.unique(s)), 10)
-
-    def check_uniform(self, kernel_func, dtype):
-        states = cuda.random.create_xoroshiro128p_states(32 * 2, seed=1)
-        out = np.zeros(2 * 32 * 32, dtype=np.float32)
-
-        kernel_func[2, 32](states, out, 32, UNIFORM)
-        self.assertAlmostEqual(out.min(), 0.0, delta=1e-3)
-        self.assertAlmostEqual(out.max(), 1.0, delta=1e-3)
-        self.assertAlmostEqual(out.mean(), 0.5, delta=1.5e-2)
-        self.assertAlmostEqual(out.std(), 1.0 / (2 * math.sqrt(3)), delta=6e-3)
+        assert len(np.unique(s)) == 10
 
     def test_uniform_float32(self):
-        self.check_uniform(rng_kernel_float32, np.float32)
+        check_uniform(rng_kernel_float32, np.float32)
 
     @skip_on_cudasim("skip test for speed under cudasim")
     def test_uniform_float64(self):
-        self.check_uniform(rng_kernel_float64, np.float64)
-
-    def check_normal(self, kernel_func, dtype):
-        states = cuda.random.create_xoroshiro128p_states(32 * 2, seed=1)
-        out = np.zeros(2 * 32 * 32, dtype=dtype)
-
-        kernel_func[2, 32](states, out, 32, NORMAL)
-
-        self.assertAlmostEqual(out.mean(), 0.0, delta=4e-3)
-        self.assertAlmostEqual(out.std(), 1.0, delta=2e-3)
+        check_uniform(rng_kernel_float64, np.float64)
 
     def test_normal_float32(self):
-        self.check_normal(rng_kernel_float32, np.float32)
+        check_normal(rng_kernel_float32, np.float32)
 
     @skip_on_cudasim("skip test for speed under cudasim")
     def test_normal_float64(self):
-        self.check_normal(rng_kernel_float64, np.float64)
-
-
-if __name__ == "__main__":
-    unittest.main()
+        check_normal(rng_kernel_float64, np.float64)
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py b/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py
index c3d85e8ac..fbd9c24b3 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py
@@ -5,272 +5,362 @@
 import itertools
 import warnings
 import numpy as np
-import unittest
+import pytest
 from numba.cuda import HAS_NUMBA
 
 if HAS_NUMBA:
     from numba import njit
-from numba import cuda
 from numba.cuda import config, types
 from numba.cuda.testing import skip_on_standalone_numba_cuda
 from numba.cuda.typing.typeof import typeof
 from numba.cuda.np import numpy_support
-from numba.cuda.tests.support import TestCase
-
-
-class BaseUFuncTest:
-    def setUp(self):
-        self.inputs = [
-            (np.uint32(0), types.uint32),
-            (np.uint32(1), types.uint32),
-            (np.int32(-1), types.int32),
-            (np.int32(0), types.int32),
-            (np.int32(1), types.int32),
-            (np.uint64(0), types.uint64),
-            (np.uint64(1), types.uint64),
-            (np.int64(-1), types.int64),
-            (np.int64(0), types.int64),
-            (np.int64(1), types.int64),
-            (np.float32(-0.5), types.float32),
-            (np.float32(0.0), types.float32),
-            (np.float32(0.5), types.float32),
-            (np.float64(-0.5), types.float64),
-            (np.float64(0.0), types.float64),
-            (np.float64(0.5), types.float64),
-            (np.array([0, 1], dtype="u4"), types.Array(types.uint32, 1, "C")),
-            (np.array([0, 1], dtype="u8"), types.Array(types.uint64, 1, "C")),
-            (
-                np.array([-1, 0, 1], dtype="i4"),
-                types.Array(types.int32, 1, "C"),
-            ),
-            (
-                np.array([-1, 0, 1], dtype="i8"),
-                types.Array(types.int64, 1, "C"),
-            ),
-            (
-                np.array([-0.5, 0.0, 0.5], dtype="f4"),
-                types.Array(types.float32, 1, "C"),
-            ),
-            (
-                np.array([-0.5, 0.0, 0.5], dtype="f8"),
-                types.Array(types.float64, 1, "C"),
-            ),
-            (np.array([0, 1], dtype=np.int8), types.Array(types.int8, 1, "C")),
-            (
-                np.array([0, 1], dtype=np.int16),
-                types.Array(types.int16, 1, "C"),
-            ),
-            (
-                np.array([0, 1], dtype=np.uint8),
-                types.Array(types.uint8, 1, "C"),
-            ),
-            (
-                np.array([0, 1], dtype=np.uint16),
-                types.Array(types.uint16, 1, "C"),
-            ),
-        ]
-
-    @skip_on_standalone_numba_cuda
-    @functools.lru_cache(maxsize=None)
-    def _compile(self, pyfunc, args, nrt=False):
-        # NOTE: to test the implementation of Numpy ufuncs, we disable
-        # rewriting of array expressions.
-        return njit(args, _nrt=nrt, no_rewrites=True)(pyfunc)
-
-    def _determine_output_type(
-        self, input_type, int_output_type=None, float_output_type=None
-    ):
-        ty = input_type
-        if isinstance(ty, types.Array):
-            ndim = ty.ndim
-            ty = ty.dtype
+from numba.cuda.tests.support import reset_module_warnings
+
+
+@pytest.fixture
+def base_input():  # returns a fresh list of (host value, numba type) pairs
+    return [
+        (np.uint32(0), types.uint32),  # scalars: 32/64-bit ints, then floats
+        (np.uint32(1), types.uint32),
+        (np.int32(-1), types.int32),
+        (np.int32(0), types.int32),
+        (np.int32(1), types.int32),
+        (np.uint64(0), types.uint64),
+        (np.uint64(1), types.uint64),
+        (np.int64(-1), types.int64),
+        (np.int64(0), types.int64),
+        (np.int64(1), types.int64),
+        (np.float32(-0.5), types.float32),
+        (np.float32(0.0), types.float32),
+        (np.float32(0.5), types.float32),
+        (np.float64(-0.5), types.float64),
+        (np.float64(0.0), types.float64),
+        (np.float64(0.5), types.float64),
+        (np.array([0, 1], dtype="u4"), types.Array(types.uint32, 1, "C")),
+        (np.array([0, 1], dtype="u8"), types.Array(types.uint64, 1, "C")),
+        (
+            np.array([-1, 0, 1], dtype="i4"),
+            types.Array(types.int32, 1, "C"),
+        ),
+        (
+            np.array([-1, 0, 1], dtype="i8"),
+            types.Array(types.int64, 1, "C"),
+        ),
+        (
+            np.array([-0.5, 0.0, 0.5], dtype="f4"),
+            types.Array(types.float32, 1, "C"),
+        ),
+        (
+            np.array([-0.5, 0.0, 0.5], dtype="f8"),
+            types.Array(types.float64, 1, "C"),
+        ),
+        (np.array([0, 1], dtype=np.int8), types.Array(types.int8, 1, "C")),
+        (
+            np.array([0, 1], dtype=np.int16),
+            types.Array(types.int16, 1, "C"),
+        ),
+        (
+            np.array([0, 1], dtype=np.uint8),
+            types.Array(types.uint8, 1, "C"),
+        ),
+        (
+            np.array([0, 1], dtype=np.uint16),
+            types.Array(types.uint16, 1, "C"),
+        ),
+    ]
+
+
+@pytest.fixture(name="inputs")
+def ufunc_setup(base_input):  # requested by tests under the name `inputs`
+    # The basic ufunc test does not set up complex inputs, so we'll add
+    # some here for testing with CUDA.
+    extra_input = [
+        (np.complex64(-0.5 - 0.5j), types.complex64),
+        (np.complex64(0.0), types.complex64),
+        (np.complex64(0.5 + 0.5j), types.complex64),
+        (np.complex128(-0.5 - 0.5j), types.complex128),
+        (np.complex128(0.0), types.complex128),
+        (np.complex128(0.5 + 0.5j), types.complex128),
+        (
+            np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype="c8"),
+            types.Array(types.complex64, 1, "C"),
+        ),
+        (
+            np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype="c16"),
+            types.Array(types.complex128, 1, "C"),
+        ),
+    ]
+
+    # Test with multiple dimensions
+    extra_input += [
+        # Basic 2D and 3D arrays
+        (
+            np.linspace(0, 1).reshape((5, -1)),
+            types.Array(types.float64, 2, "C"),
+        ),
+        (
+            np.linspace(0, 1).reshape((2, 5, -1)),
+            types.Array(types.float64, 3, "C"),
+        ),
+        # Complex data (i.e. interleaved)
+        (
+            np.linspace(0, 1 + 1j).reshape(5, -1),
+            types.Array(types.complex128, 2, "C"),
+        ),
+        # F-ordered
+        (
+            np.asfortranarray(np.linspace(0, 1).reshape((5, -1))),
+            types.Array(types.float64, 2, "F"),
+        ),
+    ]
+
+    # Add tests for other integer types
+    extra_input += [
+        (np.uint8(0), types.uint8),
+        (np.uint8(1), types.uint8),
+        (np.int8(-1), types.int8),
+        (np.int8(0), types.int8),
+        (np.uint16(0), types.uint16),
+        (np.uint16(1), types.uint16),
+        (np.int16(-1), types.int16),
+        (np.int16(0), types.int16),
+        (np.ulonglong(0), types.ulonglong),
+        (np.ulonglong(1), types.ulonglong),
+        (np.longlong(-1), types.longlong),
+        (np.longlong(0), types.longlong),
+        (
+            np.array([0, 1], dtype=np.ulonglong),
+            types.Array(types.ulonglong, 1, "C"),
+        ),
+        (
+            np.array([0, 1], dtype=np.longlong),
+            types.Array(types.longlong, 1, "C"),
+        ),
+    ]
+
+    inputs = base_input + extra_input  # new list; base_input is not mutated
+
+    low_occupancy_warnings = config.CUDA_LOW_OCCUPANCY_WARNINGS  # saved
+    warn_on_implicit_copy = config.CUDA_WARN_ON_IMPLICIT_COPY  # saved
+
+    # Disable warnings about low gpu utilization in the test suite
+    config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
+    # Disable warnings about host arrays in the test suite
+    config.CUDA_WARN_ON_IMPLICIT_COPY = 0
+
+    yield inputs  # the test body runs here with both warnings disabled
+
+    config.CUDA_LOW_OCCUPANCY_WARNINGS = low_occupancy_warnings  # restore
+    config.CUDA_WARN_ON_IMPLICIT_COPY = warn_on_implicit_copy  # restore
+
+
+def basic_int_ufunc_test(name, inputs):  # `name` is forwarded as the ufunc
+    skip_inputs = [  # float and complex input types to exclude
+        types.float32,
+        types.float64,
+        types.Array(types.float32, 1, "C"),
+        types.Array(types.float32, 2, "C"),
+        types.Array(types.float64, 1, "C"),
+        types.Array(types.float64, 2, "C"),
+        types.Array(types.float64, 3, "C"),
+        types.Array(types.float64, 2, "F"),
+        types.complex64,
+        types.complex128,
+        types.Array(types.complex64, 1, "C"),
+        types.Array(types.complex64, 2, "C"),
+        types.Array(types.complex128, 1, "C"),
+        types.Array(types.complex128, 2, "C"),
+    ]
+    basic_ufunc_test(name, inputs, skip_inputs=skip_inputs)
+
+
+def signed_unsigned_cmp_test(comparison_ufunc, inputs):
+    basic_ufunc_test(comparison_ufunc, inputs)  # generic input sweep first
+
+    if numpy_support.numpy_version < (1, 25):
+        return  # the mixed-sign cases below need NumPy >= 1.25
+
+    # Test additional implementations that specifically handle signed /
+    # unsigned comparisons added in NumPy 1.25:
+    # https://github.com/numpy/numpy/pull/23713
+    additional_inputs = (
+        (np.int64(-1), np.uint64(0)),
+        (np.int64(-1), np.uint64(1)),
+        (np.int64(0), np.uint64(0)),
+        (np.int64(0), np.uint64(1)),
+        (np.int64(1), np.uint64(0)),
+        (np.int64(1), np.uint64(1)),
+        (np.uint64(0), np.int64(-1)),
+        (np.uint64(0), np.int64(0)),
+        (np.uint64(0), np.int64(1)),
+        (np.uint64(1), np.int64(-1)),
+        (np.uint64(1), np.int64(0)),
+        (np.uint64(1), np.int64(1)),
+        (
+            np.array([-1, -1, 0, 0, 1, 1], dtype=np.int64),
+            np.array([0, 1, 0, 1, 0, 1], dtype=np.uint64),
+        ),
+        (
+            np.array([0, 1, 0, 1, 0, 1], dtype=np.uint64),
+            np.array([-1, -1, 0, 0, 1, 1], dtype=np.int64),
+        ),
+    )
+
+    pyfunc = _make_ufunc_usecase(comparison_ufunc)
+
+    for a, b in additional_inputs:
+        input_types = (typeof(a), typeof(b))
+        output_type = types.Array(types.bool_, 1, "C")  # 1-D bool output
+        argtys = input_types + (output_type,)
+        cfunc = _compile(pyfunc, argtys)
+
+        if isinstance(a, np.ndarray):
+            result = np.zeros(a.shape, dtype=np.bool_)
         else:
-            ndim = 1
-
-        if ty in types.signed_domain:
-            if int_output_type:
-                output_type = types.Array(int_output_type, ndim, "C")
-            else:
-                output_type = types.Array(ty, ndim, "C")
-        elif ty in types.unsigned_domain:
-            if int_output_type:
-                output_type = types.Array(int_output_type, ndim, "C")
-            else:
-                output_type = types.Array(ty, ndim, "C")
+            result = np.zeros(1, dtype=np.bool_)  # scalars: length-1 output
+
+        expected = np.zeros_like(result)
+
+        pyfunc(a, b, expected)  # uncompiled call fills the reference buffer
+        cfunc(a, b, result)
+        np.testing.assert_equal(expected, result)
+
+
+def basic_ufunc_test(
+    ufunc,
+    inputs,
+    skip_inputs=(),
+    additional_inputs=(),
+    int_output_type=None,
+    float_output_type=None,
+    kinds="ifc",
+    positive_only=False,
+):
+    # Necessary to avoid some Numpy warnings being silenced, despite
+    # the simplefilter() call below.
+    reset_module_warnings(__name__)
+
+    pyfunc = _make_ufunc_usecase(ufunc)
+
+    for input_operand, input_type in itertools.chain(inputs, additional_inputs):
+        is_tuple = isinstance(input_operand, tuple)
+        if is_tuple:
+            args = input_operand
         else:
-            if float_output_type:
-                output_type = types.Array(float_output_type, ndim, "C")
-            else:
-                output_type = types.Array(ty, ndim, "C")
-        return output_type
-
-
-class BasicUFuncTest(BaseUFuncTest):
-    def _make_ufunc_usecase(self, ufunc):
-        return _make_ufunc_usecase(ufunc)
-
-    def basic_ufunc_test(
-        self,
-        ufunc,
-        skip_inputs=(),
-        additional_inputs=(),
-        int_output_type=None,
-        float_output_type=None,
-        kinds="ifc",
-        positive_only=False,
-    ):
-        # Necessary to avoid some Numpy warnings being silenced, despite
-        # the simplefilter() call below.
-        self.reset_module_warnings(__name__)
-
-        pyfunc = self._make_ufunc_usecase(ufunc)
-
-        for input_operand, input_type in itertools.chain(
-            self.inputs, additional_inputs
-        ):
-            is_tuple = isinstance(input_operand, tuple)
-            if is_tuple:
-                args = input_operand
-            else:
-                args = (input_operand,) * ufunc.nin
-
-            if input_type in skip_inputs:
-                continue
-            if positive_only and np.any(args[0] < 0):
-                continue
-
-            # Some ufuncs don't allow all kinds of arguments
-            if args[0].dtype.kind not in kinds:
-                continue
-
-            output_type = self._determine_output_type(
-                input_type, int_output_type, float_output_type
-            )
+            args = (input_operand,) * ufunc.nin
 
-            input_types = (input_type,) * ufunc.nin
-            output_types = (output_type,) * ufunc.nout
-            argtys = input_types + output_types
-            cfunc = self._compile(pyfunc, argtys)
+        if input_type in skip_inputs:
+            continue
+        if positive_only and np.any(args[0] < 0):
+            continue
 
-            if isinstance(args[0], np.ndarray):
-                results = [
-                    np.zeros(args[0].shape, dtype=out_ty.dtype.name)
-                    for out_ty in output_types
-                ]
-                expected = [
-                    np.zeros(args[0].shape, dtype=out_ty.dtype.name)
-                    for out_ty in output_types
-                ]
-            else:
-                results = [
-                    np.zeros(1, dtype=out_ty.dtype.name)
-                    for out_ty in output_types
-                ]
-                expected = [
-                    np.zeros(1, dtype=out_ty.dtype.name)
-                    for out_ty in output_types
-                ]
+        # Some ufuncs don't allow all kinds of arguments
+        if args[0].dtype.kind not in kinds:
+            continue
 
-            invalid_flag = False
-            with warnings.catch_warnings(record=True) as warnlist:
-                warnings.simplefilter("always")
-                pyfunc(*args, *expected)
-
-                warnmsg = "invalid value encountered"
-                for thiswarn in warnlist:
-                    if issubclass(thiswarn.category, RuntimeWarning) and str(
-                        thiswarn.message
-                    ).startswith(warnmsg):
-                        invalid_flag = True
-
-            cfunc(*args, *results)
-
-            for expected_i, result_i in zip(expected, results):
-                msg = "\n".join(
-                    [
-                        "ufunc '{0}' failed",
-                        "inputs ({1}):",
-                        "{2}",
-                        "got({3})",
-                        "{4}",
-                        "expected ({5}):",
-                        "{6}",
-                    ]
-                ).format(
-                    ufunc.__name__,
-                    input_type,
-                    input_operand,
-                    output_type,
-                    result_i,
-                    expected_i.dtype,
-                    expected_i,
-                )
-                try:
-                    np.testing.assert_array_almost_equal(
-                        expected_i, result_i, decimal=5, err_msg=msg
-                    )
-                except AssertionError:
-                    if invalid_flag:
-                        # Allow output to mismatch for invalid input
-                        print(
-                            "Output mismatch for invalid input",
-                            input_tuple,
-                            result_i,
-                            expected_i,
-                        )
-                    else:
-                        raise
-
-    def signed_unsigned_cmp_test(self, comparison_ufunc):
-        self.basic_ufunc_test(comparison_ufunc)
-
-        if numpy_support.numpy_version < (1, 25):
-            return
-
-        # Test additional implementations that specifically handle signed /
-        # unsigned comparisons added in NumPy 1.25:
-        # https://github.com/numpy/numpy/pull/23713
-        additional_inputs = (
-            (np.int64(-1), np.uint64(0)),
-            (np.int64(-1), np.uint64(1)),
-            (np.int64(0), np.uint64(0)),
-            (np.int64(0), np.uint64(1)),
-            (np.int64(1), np.uint64(0)),
-            (np.int64(1), np.uint64(1)),
-            (np.uint64(0), np.int64(-1)),
-            (np.uint64(0), np.int64(0)),
-            (np.uint64(0), np.int64(1)),
-            (np.uint64(1), np.int64(-1)),
-            (np.uint64(1), np.int64(0)),
-            (np.uint64(1), np.int64(1)),
-            (
-                np.array([-1, -1, 0, 0, 1, 1], dtype=np.int64),
-                np.array([0, 1, 0, 1, 0, 1], dtype=np.uint64),
-            ),
-            (
-                np.array([0, 1, 0, 1, 0, 1], dtype=np.uint64),
-                np.array([-1, -1, 0, 0, 1, 1], dtype=np.int64),
-            ),
+        output_type = _determine_output_type(
+            input_type, int_output_type, float_output_type
         )
 
-        pyfunc = self._make_ufunc_usecase(comparison_ufunc)
-
-        for a, b in additional_inputs:
-            input_types = (typeof(a), typeof(b))
-            output_type = types.Array(types.bool_, 1, "C")
-            argtys = input_types + (output_type,)
-            cfunc = self._compile(pyfunc, argtys)
+        input_types = (input_type,) * ufunc.nin
+        output_types = (output_type,) * ufunc.nout
+        argtys = input_types + output_types
+        cfunc = _compile(pyfunc, argtys)
 
-            if isinstance(a, np.ndarray):
-                result = np.zeros(a.shape, dtype=np.bool_)
-            else:
-                result = np.zeros(1, dtype=np.bool_)
-
-            expected = np.zeros_like(result)
+        if isinstance(args[0], np.ndarray):
+            results = [
+                np.zeros(args[0].shape, dtype=out_ty.dtype.name)
+                for out_ty in output_types
+            ]
+            expected = [
+                np.zeros(args[0].shape, dtype=out_ty.dtype.name)
+                for out_ty in output_types
+            ]
+        else:
+            results = [
+                np.zeros(1, dtype=out_ty.dtype.name) for out_ty in output_types
+            ]
+            expected = [
+                np.zeros(1, dtype=out_ty.dtype.name) for out_ty in output_types
+            ]
 
-            pyfunc(a, b, expected)
-            cfunc(a, b, result)
-            np.testing.assert_equal(expected, result)
+        invalid_flag = False
+        with warnings.catch_warnings(record=True) as warnlist:
+            warnings.simplefilter("always")
+            pyfunc(*args, *expected)
+
+            warnmsg = "invalid value encountered"
+            for thiswarn in warnlist:
+                if issubclass(thiswarn.category, RuntimeWarning) and str(
+                    thiswarn.message
+                ).startswith(warnmsg):
+                    invalid_flag = True
+
+        cfunc(*args, *results)
+
+        for expected_i, result_i in zip(expected, results):
+            msg = "\n".join(
+                [
+                    "ufunc '{0}' failed",
+                    "inputs ({1}):",
+                    "{2}",
+                    "got({3})",
+                    "{4}",
+                    "expected ({5}):",
+                    "{6}",
+                ]
+            ).format(
+                ufunc.__name__,
+                input_type,
+                input_operand,
+                output_type,
+                result_i,
+                expected_i.dtype,
+                expected_i,
+            )
+            try:
+                np.testing.assert_array_almost_equal(
+                    expected_i, result_i, decimal=5, err_msg=msg
+                )
+            except AssertionError:
+                if invalid_flag:
+                    # NumPy flagged this input as invalid: report, don't fail
+                    print(
+                        "Output mismatch for invalid input",
+                        input_operand,
+                        result_i,
+                        expected_i,
+                    )
+                else:
+                    raise
+
+
+def _determine_output_type(
+    input_type, int_output_type=None, float_output_type=None
+):
+    """
+    Build the ``types.Array`` type used for a ufunc test output buffer.
+
+    The element type and ndim are taken from ``input_type`` (a non-array
+    input is treated as one-dimensional).  When the element type belongs
+    to ``types.signed_domain`` or ``types.unsigned_domain`` a truthy
+    ``int_output_type`` replaces it; for every other element type a
+    truthy ``float_output_type`` replaces it.  The layout is always "C".
+    """
+    if isinstance(input_type, types.Array):
+        ndim, elem_ty = input_type.ndim, input_type.dtype
+    else:
+        ndim, elem_ty = 1, input_type
+
+    is_integer = (
+        elem_ty in types.signed_domain or elem_ty in types.unsigned_domain
+    )
+
+    # Integer element types honour only the int override; all other
+    # element types honour only the float override.
+    override = int_output_type if is_integer else float_output_type
+
+    return types.Array(override or elem_ty, ndim, "C")
 
 
 def _make_ufunc_usecase(ufunc):
@@ -283,179 +373,57 @@ def _make_ufunc_usecase(ufunc):
     return fn
 
 
-# This class provides common functionality for UFunc tests. The UFunc tests
-# are quite long-running in comparison to other tests, so we break the tests up
-# into multiple test classes for distribution across workers.
-#
-# This class would also be a CUDATestCase, but to avoid a confusing and
-# potentially dangerous inheritance diamond with setUp methods that modify
-# global state, we implement the necessary part of CUDATestCase within this
-# class instead. This disables CUDA performance warnings for the duration of
-# tests.
-class CUDAUFuncTestBase(BasicUFuncTest, TestCase):
-    def setUp(self):
-        BasicUFuncTest.setUp(self)
-
-        # The basic ufunc test does not set up complex inputs, so we'll add
-        # some here for testing with CUDA.
-        self.inputs.extend(
-            [
-                (np.complex64(-0.5 - 0.5j), types.complex64),
-                (np.complex64(0.0), types.complex64),
-                (np.complex64(0.5 + 0.5j), types.complex64),
-                (np.complex128(-0.5 - 0.5j), types.complex128),
-                (np.complex128(0.0), types.complex128),
-                (np.complex128(0.5 + 0.5j), types.complex128),
-                (
-                    np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype="c8"),
-                    types.Array(types.complex64, 1, "C"),
-                ),
-                (
-                    np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype="c16"),
-                    types.Array(types.complex128, 1, "C"),
-                ),
-            ]
-        )
+@skip_on_standalone_numba_cuda
+@functools.lru_cache(maxsize=None)
+def _compile(pyfunc, args, nrt=False):
+    # NOTE: array-expression rewriting is disabled (no_rewrites=True) so
+    # that the NumPy ufunc implementations themselves are what gets tested.
+    return njit(args, _nrt=nrt, no_rewrites=True)(pyfunc)
 
-        # Test with multiple dimensions
-        self.inputs.extend(
-            [
-                # Basic 2D and 3D arrays
-                (
-                    np.linspace(0, 1).reshape((5, -1)),
-                    types.Array(types.float64, 2, "C"),
-                ),
-                (
-                    np.linspace(0, 1).reshape((2, 5, -1)),
-                    types.Array(types.float64, 3, "C"),
-                ),
-                # Complex data (i.e. interleaved)
-                (
-                    np.linspace(0, 1 + 1j).reshape(5, -1),
-                    types.Array(types.complex128, 2, "C"),
-                ),
-                # F-ordered
-                (
-                    np.asfortranarray(np.linspace(0, 1).reshape((5, -1))),
-                    types.Array(types.float64, 2, "F"),
-                ),
-            ]
-        )
 
-        # Add tests for other integer types
-        self.inputs.extend(
-            [
-                (np.uint8(0), types.uint8),
-                (np.uint8(1), types.uint8),
-                (np.int8(-1), types.int8),
-                (np.int8(0), types.int8),
-                (np.uint16(0), types.uint16),
-                (np.uint16(1), types.uint16),
-                (np.int16(-1), types.int16),
-                (np.int16(0), types.int16),
-                (np.ulonglong(0), types.ulonglong),
-                (np.ulonglong(1), types.ulonglong),
-                (np.longlong(-1), types.longlong),
-                (np.longlong(0), types.longlong),
-                (
-                    np.array([0, 1], dtype=np.ulonglong),
-                    types.Array(types.ulonglong, 1, "C"),
-                ),
-                (
-                    np.array([0, 1], dtype=np.longlong),
-                    types.Array(types.longlong, 1, "C"),
-                ),
-            ]
-        )
+class TestBasicTrigUFuncs:
+    def test_sin_ufunc(self, inputs):
+        basic_ufunc_test(np.sin, inputs, kinds="cf")
 
-        self._low_occupancy_warnings = config.CUDA_LOW_OCCUPANCY_WARNINGS
-        self._warn_on_implicit_copy = config.CUDA_WARN_ON_IMPLICIT_COPY
+    def test_cos_ufunc(self, inputs):
+        basic_ufunc_test(np.cos, inputs, kinds="cf")
 
-        # Disable warnings about low gpu utilization in the test suite
-        config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
-        # Disable warnings about host arrays in the test suite
-        config.CUDA_WARN_ON_IMPLICIT_COPY = 0
+    def test_tan_ufunc(self, inputs):
+        basic_ufunc_test(np.tan, inputs, kinds="cf")
 
-    def tearDown(self):
-        # Restore original warning settings
-        config.CUDA_LOW_OCCUPANCY_WARNINGS = self._low_occupancy_warnings
-        config.CUDA_WARN_ON_IMPLICIT_COPY = self._warn_on_implicit_copy
+    def test_arcsin_ufunc(self, inputs):
+        basic_ufunc_test(np.arcsin, inputs, kinds="cf")
 
-    def _make_ufunc_usecase(self, ufunc):
-        return _make_ufunc_usecase(ufunc)
+    def test_arccos_ufunc(self, inputs):
+        basic_ufunc_test(np.arccos, inputs, kinds="cf")
 
-    @functools.lru_cache(maxsize=None)
-    def _compile(self, pyfunc, args):
-        # We return an already-configured kernel so that basic_ufunc_test can
-        # call it just like it does for a CPU function
-        return cuda.jit(args)(pyfunc)[1, 1]
+    def test_arctan_ufunc(self, inputs):
+        basic_ufunc_test(np.arctan, inputs, kinds="cf")
 
-    def basic_int_ufunc_test(self, name=None):
-        skip_inputs = [
-            types.float32,
-            types.float64,
-            types.Array(types.float32, 1, "C"),
-            types.Array(types.float32, 2, "C"),
-            types.Array(types.float64, 1, "C"),
-            types.Array(types.float64, 2, "C"),
-            types.Array(types.float64, 3, "C"),
-            types.Array(types.float64, 2, "F"),
-            types.complex64,
-            types.complex128,
-            types.Array(types.complex64, 1, "C"),
-            types.Array(types.complex64, 2, "C"),
-            types.Array(types.complex128, 1, "C"),
-            types.Array(types.complex128, 2, "C"),
-        ]
-        self.basic_ufunc_test(name, skip_inputs=skip_inputs)
-
-    ############################################################################
-    # Trigonometric Functions
-
-
-class TestBasicTrigUFuncs(CUDAUFuncTestBase):
-    def test_sin_ufunc(self):
-        self.basic_ufunc_test(np.sin, kinds="cf")
-
-    def test_cos_ufunc(self):
-        self.basic_ufunc_test(np.cos, kinds="cf")
-
-    def test_tan_ufunc(self):
-        self.basic_ufunc_test(np.tan, kinds="cf")
-
-    def test_arcsin_ufunc(self):
-        self.basic_ufunc_test(np.arcsin, kinds="cf")
-
-    def test_arccos_ufunc(self):
-        self.basic_ufunc_test(np.arccos, kinds="cf")
+    def test_arctan2_ufunc(self, inputs):
+        basic_ufunc_test(np.arctan2, inputs, kinds="f")
 
-    def test_arctan_ufunc(self):
-        self.basic_ufunc_test(np.arctan, kinds="cf")
 
-    def test_arctan2_ufunc(self):
-        self.basic_ufunc_test(np.arctan2, kinds="f")
+class TestHypTrigUFuncs:
+    def test_hypot_ufunc(self, inputs):
+        basic_ufunc_test(np.hypot, inputs, kinds="f")
 
+    def test_sinh_ufunc(self, inputs):
+        basic_ufunc_test(np.sinh, inputs, kinds="cf")
 
-class TestHypTrigUFuncs(CUDAUFuncTestBase):
-    def test_hypot_ufunc(self):
-        self.basic_ufunc_test(np.hypot, kinds="f")
+    def test_cosh_ufunc(self, inputs):
+        basic_ufunc_test(np.cosh, inputs, kinds="cf")
 
-    def test_sinh_ufunc(self):
-        self.basic_ufunc_test(np.sinh, kinds="cf")
+    def test_tanh_ufunc(self, inputs):
+        basic_ufunc_test(np.tanh, inputs, kinds="cf")
 
-    def test_cosh_ufunc(self):
-        self.basic_ufunc_test(np.cosh, kinds="cf")
+    def test_arcsinh_ufunc(self, inputs):
+        basic_ufunc_test(np.arcsinh, inputs, kinds="cf")
 
-    def test_tanh_ufunc(self):
-        self.basic_ufunc_test(np.tanh, kinds="cf")
+    def test_arccosh_ufunc(self, inputs):
+        basic_ufunc_test(np.arccosh, inputs, kinds="cf")
 
-    def test_arcsinh_ufunc(self):
-        self.basic_ufunc_test(np.arcsinh, kinds="cf")
-
-    def test_arccosh_ufunc(self):
-        self.basic_ufunc_test(np.arccosh, kinds="cf")
-
-    def test_arctanh_ufunc(self):
+    def test_arctanh_ufunc(self, inputs):
-        # arctanh is only valid is only finite in the range ]-1, 1[
+        # arctanh is only finite in the range ]-1, 1[
         # This means that for any of the integer types it will produce
         # conversion from infinity/-infinity to integer. That's undefined
@@ -475,109 +443,94 @@ def test_arctanh_ufunc(self):
             types.int64,
         ]
 
-        self.basic_ufunc_test(np.arctanh, skip_inputs=to_skip, kinds="cf")
-
-
-class TestConversionUFuncs(CUDAUFuncTestBase):
-    def test_deg2rad_ufunc(self):
-        self.basic_ufunc_test(np.deg2rad, kinds="f")
-
-    def test_rad2deg_ufunc(self):
-        self.basic_ufunc_test(np.rad2deg, kinds="f")
-
-    def test_degrees_ufunc(self):
-        self.basic_ufunc_test(np.degrees, kinds="f")
-
-    def test_radians_ufunc(self):
-        self.basic_ufunc_test(np.radians, kinds="f")
+        basic_ufunc_test(np.arctanh, inputs, skip_inputs=to_skip, kinds="cf")
 
-    ############################################################################
-    # Comparison functions
 
+class TestConversionUFuncs:
+    def test_deg2rad_ufunc(self, inputs):
+        basic_ufunc_test(np.deg2rad, inputs, kinds="f")
 
-class TestComparisonUFuncs1(CUDAUFuncTestBase):
-    def test_greater_ufunc(self):
-        self.signed_unsigned_cmp_test(np.greater)
+    def test_rad2deg_ufunc(self, inputs):
+        basic_ufunc_test(np.rad2deg, inputs, kinds="f")
 
-    def test_greater_equal_ufunc(self):
-        self.signed_unsigned_cmp_test(np.greater_equal)
+    def test_degrees_ufunc(self, inputs):
+        basic_ufunc_test(np.degrees, inputs, kinds="f")
 
-    def test_less_ufunc(self):
-        self.signed_unsigned_cmp_test(np.less)
+    def test_radians_ufunc(self, inputs):
+        basic_ufunc_test(np.radians, inputs, kinds="f")
 
-    def test_less_equal_ufunc(self):
-        self.signed_unsigned_cmp_test(np.less_equal)
 
-    def test_not_equal_ufunc(self):
-        self.signed_unsigned_cmp_test(np.not_equal)
+class TestComparisonUFuncs1:
+    def test_greater_ufunc(self, inputs):
+        signed_unsigned_cmp_test(np.greater, inputs)
 
-    def test_equal_ufunc(self):
-        self.signed_unsigned_cmp_test(np.equal)
+    def test_greater_equal_ufunc(self, inputs):
+        signed_unsigned_cmp_test(np.greater_equal, inputs)
 
+    def test_less_ufunc(self, inputs):
+        signed_unsigned_cmp_test(np.less, inputs)
 
-class TestLogicalUFuncs(CUDAUFuncTestBase):
-    def test_logical_and_ufunc(self):
-        self.basic_ufunc_test(np.logical_and)
+    def test_less_equal_ufunc(self, inputs):
+        signed_unsigned_cmp_test(np.less_equal, inputs)
 
-    def test_logical_or_ufunc(self):
-        self.basic_ufunc_test(np.logical_or)
+    def test_not_equal_ufunc(self, inputs):
+        signed_unsigned_cmp_test(np.not_equal, inputs)
 
-    def test_logical_xor_ufunc(self):
-        self.basic_ufunc_test(np.logical_xor)
+    def test_equal_ufunc(self, inputs):
+        signed_unsigned_cmp_test(np.equal, inputs)
 
-    def test_logical_not_ufunc(self):
-        self.basic_ufunc_test(np.logical_not)
 
+class TestLogicalUFuncs:
+    def test_logical_and_ufunc(self, inputs):
+        basic_ufunc_test(np.logical_and, inputs)
 
-class TestMinmaxUFuncs(CUDAUFuncTestBase):
-    def test_maximum_ufunc(self):
-        self.basic_ufunc_test(np.maximum)
+    def test_logical_or_ufunc(self, inputs):
+        basic_ufunc_test(np.logical_or, inputs)
 
-    def test_minimum_ufunc(self):
-        self.basic_ufunc_test(np.minimum)
+    def test_logical_xor_ufunc(self, inputs):
+        basic_ufunc_test(np.logical_xor, inputs)
 
-    def test_fmax_ufunc(self):
-        self.basic_ufunc_test(np.fmax)
+    def test_logical_not_ufunc(self, inputs):
+        basic_ufunc_test(np.logical_not, inputs)
 
-    def test_fmin_ufunc(self):
-        self.basic_ufunc_test(np.fmin)
 
+class TestMinmaxUFuncs:
+    def test_maximum_ufunc(self, inputs):
+        basic_ufunc_test(np.maximum, inputs)
 
-class TestBitwiseUFuncs(CUDAUFuncTestBase):
-    def test_bitwise_and_ufunc(self):
-        self.basic_int_ufunc_test(np.bitwise_and)
+    def test_minimum_ufunc(self, inputs):
+        basic_ufunc_test(np.minimum, inputs)
 
-    def test_bitwise_or_ufunc(self):
-        self.basic_int_ufunc_test(np.bitwise_or)
+    def test_fmax_ufunc(self, inputs):
+        basic_ufunc_test(np.fmax, inputs)
 
-    def test_bitwise_xor_ufunc(self):
-        self.basic_int_ufunc_test(np.bitwise_xor)
+    def test_fmin_ufunc(self, inputs):
+        basic_ufunc_test(np.fmin, inputs)
 
-    def test_invert_ufunc(self):
-        self.basic_int_ufunc_test(np.invert)
 
-    def test_bitwise_not_ufunc(self):
-        self.basic_int_ufunc_test(np.bitwise_not)
+class TestBitwiseUFuncs:
+    def test_bitwise_and_ufunc(self, inputs):
+        basic_int_ufunc_test(np.bitwise_and, inputs)
 
-    # Note: there is no entry for np.left_shift and np.right_shift
-    # because their implementations in NumPy have undefined behavior
-    # when the second argument is a negative. See the comment in
-    # numba/tests/test_ufuncs.py for more details.
+    def test_bitwise_or_ufunc(self, inputs):
+        basic_int_ufunc_test(np.bitwise_or, inputs)
 
-    ############################################################################
-    # Mathematical Functions
+    def test_bitwise_xor_ufunc(self, inputs):
+        basic_int_ufunc_test(np.bitwise_xor, inputs)
 
+    def test_invert_ufunc(self, inputs):
+        basic_int_ufunc_test(np.invert, inputs)
 
-class TestLogUFuncs(CUDAUFuncTestBase):
-    def test_log_ufunc(self):
-        self.basic_ufunc_test(np.log, kinds="cf")
+    def test_bitwise_not_ufunc(self, inputs):
+        basic_int_ufunc_test(np.bitwise_not, inputs)
 
-    def test_log2_ufunc(self):
-        self.basic_ufunc_test(np.log2, kinds="cf")
 
-    def test_log10_ufunc(self):
-        self.basic_ufunc_test(np.log10, kinds="cf")
+class TestLogUFuncs:
+    def test_log_ufunc(self, inputs):
+        basic_ufunc_test(np.log, inputs, kinds="cf")
 
+    def test_log2_ufunc(self, inputs):
+        basic_ufunc_test(np.log2, inputs, kinds="cf")
 
-if __name__ == "__main__":
-    unittest.main()
+    def test_log10_ufunc(self, inputs):
+        basic_ufunc_test(np.log10, inputs, kinds="cf")
diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_random.py b/numba_cuda/numba/cuda/tests/doc_examples/test_random.py
index f8c198a2c..2428ce133 100644
--- a/numba_cuda/numba/cuda/tests/doc_examples/test_random.py
+++ b/numba_cuda/numba/cuda/tests/doc_examples/test_random.py
@@ -4,63 +4,57 @@
 # Contents in this file are referenced from the sphinx-generated docs.
 # "magictoken" is used for markers as beginning and ending of example text.
 
-import unittest
-from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.cuda.testing import skip_on_cudasim
 
 
 @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
-class TestRandom(CUDATestCase):
-    def test_ex_3d_grid(self):
-        # magictoken.ex_3d_grid.begin
-        from numba import cuda
-        from numba.cuda.random import (
-            create_xoroshiro128p_states,
-            xoroshiro128p_uniform_float32,
-        )
-        import numpy as np
-
-        @cuda.jit
-        def random_3d(arr, rng_states):
-            # Per-dimension thread indices and strides
-            startx, starty, startz = cuda.grid(3)
-            stridex, stridey, stridez = cuda.gridsize(3)
-
-            # Linearized thread index
-            tid = (startz * stridey * stridex) + (starty * stridex) + startx
-
-            # Use strided loops over the array to assign a random value to each entry
-            for i in range(startz, arr.shape[0], stridez):
-                for j in range(starty, arr.shape[1], stridey):
-                    for k in range(startx, arr.shape[2], stridex):
-                        arr[i, j, k] = xoroshiro128p_uniform_float32(
-                            rng_states, tid
-                        )
-
-        # Array dimensions
-        X, Y, Z = 701, 900, 719
-
-        # Block and grid dimensions
-        bx, by, bz = 8, 8, 8
-        gx, gy, gz = 16, 16, 16
-
-        # Total number of threads
-        nthreads = bx * by * bz * gx * gy * gz
-
-        # Initialize a state for each thread
-        rng_states = create_xoroshiro128p_states(nthreads, seed=1)
-
-        # Generate random numbers
-        arr = cuda.device_array((X, Y, Z), dtype=np.float32)
-        random_3d[(gx, gy, gz), (bx, by, bz)](arr, rng_states)
-        # magictoken.ex_3d_grid.end
-
-        # Some basic tests of the randomly-generated numbers
-        host_arr = arr.copy_to_host()
-        self.assertGreater(np.mean(host_arr), 0.49)
-        self.assertLess(np.mean(host_arr), 0.51)
-        self.assertTrue(np.all(host_arr <= 1.0))
-        self.assertTrue(np.all(host_arr >= 0.0))
-
-
-if __name__ == "__main__":
-    unittest.main()
+def test_random():
+    # magictoken.ex_3d_grid.begin
+    from numba import cuda
+    from numba.cuda.random import (
+        create_xoroshiro128p_states,
+        xoroshiro128p_uniform_float32,
+    )
+    import numpy as np
+
+    @cuda.jit
+    def random_3d(arr, rng_states):
+        # Per-dimension thread indices and strides
+        startx, starty, startz = cuda.grid(3)
+        stridex, stridey, stridez = cuda.gridsize(3)
+
+        # Linearized thread index
+        tid = (startz * stridey * stridex) + (starty * stridex) + startx
+
+        # Use strided loops over the array to assign a random value to each entry
+        for i in range(startz, arr.shape[0], stridez):
+            for j in range(starty, arr.shape[1], stridey):
+                for k in range(startx, arr.shape[2], stridex):
+                    arr[i, j, k] = xoroshiro128p_uniform_float32(
+                        rng_states, tid
+                    )
+
+    # Array dimensions
+    X, Y, Z = 701, 900, 719
+
+    # Block and grid dimensions
+    bx, by, bz = 8, 8, 8
+    gx, gy, gz = 16, 16, 16
+
+    # Total number of threads
+    nthreads = bx * by * bz * gx * gy * gz
+
+    # Initialize a state for each thread
+    rng_states = create_xoroshiro128p_states(nthreads, seed=1)
+
+    # Generate random numbers
+    arr = cuda.device_array((X, Y, Z), dtype=np.float32)
+    random_3d[(gx, gy, gz), (bx, by, bz)](arr, rng_states)
+    # magictoken.ex_3d_grid.end
+
+    # Some basic tests of the randomly-generated numbers
+    host_arr = arr.copy_to_host()
+    assert np.mean(host_arr) > 0.49
+    assert np.mean(host_arr) < 0.51
+    assert np.all(host_arr <= 1.0)
+    assert np.all(host_arr >= 0.0)
diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py b/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py
index adff666ae..cba7a9411 100644
--- a/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py
+++ b/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py
@@ -1,31 +1,19 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
-import unittest
+# Contents in this file are referenced from the sphinx-generated docs.
+# "ex_cuda_ufunc" is used for markers as beginning and ending of example text.
 
 from numba.cuda.testing import CUDATestCase, skip_on_cudasim
 from numba.cuda.tests.support import captured_stdout
 
 
 @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
-class TestUFunc(CUDATestCase):
+def test_ufunc():
     """
     Test calling a UFunc
     """
-
-    def setUp(self):
-        # Prevent output from this test showing
-        # up when running the test suite
-        self._captured_stdout = captured_stdout()
-        self._captured_stdout.__enter__()
-        super().setUp()
-
-    def tearDown(self):
-        # No exception type, value, or traceback
-        self._captured_stdout.__exit__(None, None, None)
-        super().tearDown()
-
-    def test_ex_cuda_ufunc_call(self):
+    with captured_stdout():
         # ex_cuda_ufunc.begin
         import numpy as np
         from numba import cuda
@@ -47,7 +35,3 @@ def f(r, x):
         # the sin ufunc inside the kernel, and NumPy's sin ufunc
         np.testing.assert_allclose(r, np.sin(x))
         # ex_cuda_ufunc.end
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/numba_cuda/numba/cuda/tests/support.py b/numba_cuda/numba/cuda/tests/support.py
index a895c97ed..725f19cf0 100644
--- a/numba_cuda/numba/cuda/tests/support.py
+++ b/numba_cuda/numba/cuda/tests/support.py
@@ -164,7 +164,8 @@ def import_dynamic(modname):
 
 
 def ignore_internal_warnings():
-    """Use in testing within a ` warnings.catch_warnings` block to filter out
+    """
+    Use in testing within a `warnings.catch_warnings` block to filter out
     warnings that are unrelated/internally generated by Numba.
     """
     # Filter out warnings from typeguard
@@ -178,6 +179,20 @@ def ignore_internal_warnings():
     )
 
 
+def reset_module_warnings(module):
+    """
+    Delete the ``__warningregistry__`` attribute of *module*, if it has one.
+
+    *module* is either a module object or a name to look up in
+    ``sys.modules``.  Clearing the registry works around the warnings
+    module behaviour reported in http://bugs.python.org/issue4180
+    """
+    target = sys.modules[module] if isinstance(module, str) else module
+    # The attribute may be absent; that case is deliberately ignored.
+    with contextlib.suppress(AttributeError):
+        del target.__warningregistry__
+
+
 @contextlib.contextmanager
 def override_config(name, value):
     """
@@ -246,19 +261,6 @@ class TestCase(unittest.TestCase):
     def random(self):
         return np.random.RandomState(42)
 
-    def reset_module_warnings(self, module):
-        """
-        Reset the warnings registry of a module.  This can be necessary
-        as the warnings module is buggy in that regard.
-        See http://bugs.python.org/issue4180
-        """
-        if isinstance(module, str):
-            module = sys.modules[module]
-        try:
-            del module.__warningregistry__
-        except AttributeError:
-            pass
-
     @contextlib.contextmanager
     def assertTypingError(self):
         """