From 5c3689788e4b30ebe7ed972f0005f21c1528d67a Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay Date: Tue, 20 Jan 2026 12:03:48 +0530 Subject: [PATCH 01/12] Added proper test cases for test_casting --- numba_cuda/numba/cuda/tests/test_casting.py | 60 +++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 numba_cuda/numba/cuda/tests/test_casting.py diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py new file mode 100644 index 000000000..32958d488 --- /dev/null +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -0,0 +1,60 @@ +import numpy as np +import unittest + +from numba import cuda +from numba.cuda.testing import CUDATestCase, skip_on_cudasim + + +@cuda.jit +def cast_kernel(inp, out): + i = cuda.grid(1) + if i < inp.size: + out[i] = inp[i] + + +class TestCudaCasting(CUDATestCase): + + def _run_cast_test(self, src_dtype, dst_dtype, values): + src = np.array(values, dtype=src_dtype) + dst = np.zeros_like(src, dtype=dst_dtype) + + d_src = cuda.to_device(src) + d_dst = cuda.to_device(dst) + + threadsperblock = 128 + blockspergrid = (src.size + threadsperblock - 1) // threadsperblock + + cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) + + result = d_dst.copy_to_host() + expected = src.astype(dst_dtype) + + np.testing.assert_array_equal(result, expected) + + + def test_int32_to_int64(self): + self._run_cast_test(np.int32, np.int64, [1, 2, -3, 4]) + + + def test_int64_to_int32(self): + self._run_cast_test(np.int64, np.int32, [1, 2, -3, 4]) + + + def test_int_to_float(self): + self._run_cast_test(np.int32, np.float32, [1, -2, 3, 4]) + + + def test_float_to_int(self): + self._run_cast_test(np.float32, np.int32, [1.7, -2.2, 3.9]) + + + def test_float32_to_float64(self): + self._run_cast_test(np.float32, np.float64, [1.5, -2.5, 3.25]) + + + def test_bool_to_int(self): + self._run_cast_test(np.bool_, np.int32, [True, False, True]) + + + def test_int_to_bool(self): + self._run_cast_test(np.int32, np.bool_, [0, 1, 2, -1]) From 8868c97102b21a109143288d4d1edd320a5836de Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay Date: Tue, 20 Jan 2026 15:55:20 +0530 Subject: [PATCH 02/12] CUDA: Add device-side casting tests for numeric and boolean types (#515) This adds kernel-based tests for device-side casting between common numeric and boolean dtypes, mirroring a subset of CPU test_casting. Tests are skipped under cudasim. References: #515. --- numba_cuda/numba/cuda/tests/test_casting.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index 32958d488..bcf631ce2 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -12,6 +12,7 @@ def cast_kernel(inp, out): out[i] = inp[i] +@skip_on_cudasim("Casting semantics differ under cudasim") class TestCudaCasting(CUDATestCase): def _run_cast_test(self, src_dtype, dst_dtype, values): @@ -24,7 +25,9 @@ def _run_cast_test(self, src_dtype, dst_dtype, values): threadsperblock = 128 blockspergrid = (src.size + threadsperblock - 1) // threadsperblock + cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) + cuda.synchronize() result = d_dst.copy_to_host() expected = src.astype(dst_dtype) @@ -44,7 +47,8 @@ def test_int_to_float(self): self._run_cast_test(np.int32, np.float32, [1, -2, 3, 4]) - def test_float_to_int(self): + # CUDA follows C-style truncation toward zero + def test_float_to_int_truncation(self): self._run_cast_test(np.float32, np.int32, [1.7, -2.2, 3.9]) From 00f626567ef2fd5af8b190ad9cc9e231feedd65c Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay <104912634+CodersAcademy006@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:10:46 +0530 Subject: [PATCH 03/12] Update numba_cuda/numba/cuda/tests/test_casting.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- numba_cuda/numba/cuda/tests/test_casting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index bcf631ce2..ab41276fa 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -24,7 +24,7 @@ def _run_cast_test(self, src_dtype, dst_dtype, values): threadsperblock = 128 blockspergrid = (src.size + threadsperblock - 1) // threadsperblock - + blockspergrid = (src.size + threadsperblock - 1) // threadsperblock cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) cuda.synchronize() From c1d4517b49a6b257539e4be4d45521b80e11801e Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay <104912634+CodersAcademy006@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:29:12 +0530 Subject: [PATCH 04/12] Update numba_cuda/numba/cuda/tests/test_casting.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- numba_cuda/numba/cuda/tests/test_casting.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index ab41276fa..bcc3ff688 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -22,9 +22,11 @@ def _run_cast_test(self, src_dtype, dst_dtype, values): d_src = cuda.to_device(src) d_dst = cuda.to_device(dst) + threadsperblock = 128 threadsperblock = 128 blockspergrid = (src.size + threadsperblock - 1) // threadsperblock - blockspergrid = (src.size + threadsperblock - 1) // threadsperblock + + cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) cuda.synchronize() From 29aa89e9d3f92c3190026401711088596af7079a Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay <104912634+CodersAcademy006@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:31:53 +0530 Subject: [PATCH 05/12] Update numba_cuda/numba/cuda/tests/test_casting.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- numba_cuda/numba/cuda/tests/test_casting.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index bcc3ff688..3c7118b39 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -1,5 +1,7 @@ import numpy as np -import unittest +import numpy as np + +from numba import cuda from numba import cuda from numba.cuda.testing import CUDATestCase, skip_on_cudasim From 01816bcff01f39c864dc4c6ce7ae8a245a6396cb Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay <104912634+CodersAcademy006@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:32:03 +0530 Subject: [PATCH 06/12] Update numba_cuda/numba/cuda/tests/test_casting.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- numba_cuda/numba/cuda/tests/test_casting.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index 3c7118b39..65ecb017a 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -24,10 +24,12 @@ def _run_cast_test(self, src_dtype, dst_dtype, values): d_src = cuda.to_device(src) d_dst = cuda.to_device(dst) - threadsperblock = 128 threadsperblock = 128 blockspergrid = (src.size + threadsperblock - 1) // threadsperblock + cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) + cuda.synchronize() + cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) From fa17a012dd588c5b241f60ca2e709dee485e8886 Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay <104912634+CodersAcademy006@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:33:43 +0530 Subject: [PATCH 07/12] Update numba_cuda/numba/cuda/tests/test_casting.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- numba_cuda/numba/cuda/tests/test_casting.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index 65ecb017a..7c2cf0c85 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -27,6 +27,7 @@ def _run_cast_test(self, src_dtype, dst_dtype, values): threadsperblock = 128 blockspergrid = (src.size + threadsperblock - 1) // threadsperblock + cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) cuda.synchronize() From 3dc9a44fc2a2383c1191a5aa5e6108087bbc1a79 Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay <104912634+CodersAcademy006@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:33:51 +0530 Subject: [PATCH 08/12] Update numba_cuda/numba/cuda/tests/test_casting.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- numba_cuda/numba/cuda/tests/test_casting.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index 7c2cf0c85..a010c057e 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -1,7 +1,4 @@ import numpy as np -import numpy as np - -from numba import cuda from numba import cuda from numba.cuda.testing import CUDATestCase, skip_on_cudasim From afdf6aaacba15eaa1daa15bfd57298905799f126 Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay <104912634+CodersAcademy006@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:33:59 +0530 Subject: [PATCH 09/12] Update numba_cuda/numba/cuda/tests/test_casting.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- numba_cuda/numba/cuda/tests/test_casting.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index a010c057e..302326f3b 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -28,11 +28,6 @@ def _run_cast_test(self, src_dtype, dst_dtype, values): cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) cuda.synchronize() - cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) - - cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) - cuda.synchronize() - result = d_dst.copy_to_host() expected = src.astype(dst_dtype) From 40baf2f2382cd1e850dc1e3b244dfb869c453ed5 Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay <104912634+CodersAcademy006@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:35:11 +0530 Subject: [PATCH 10/12] Update numba_cuda/numba/cuda/tests/test_casting.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- numba_cuda/numba/cuda/tests/test_casting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index 302326f3b..02aacc047 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -25,7 +25,7 @@ def _run_cast_test(self, src_dtype, dst_dtype, values): blockspergrid = (src.size + threadsperblock - 1) // threadsperblock cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) - cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) + cuda.synchronize() cuda.synchronize() result = d_dst.copy_to_host() From 2f17d073f64e505bc660782ba4a0ab76eb453f13 Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay <104912634+CodersAcademy006@users.noreply.github.com> Date: Wed, 21 Jan 2026 11:36:51 +0530 Subject: [PATCH 11/12] Update numba_cuda/numba/cuda/tests/test_casting.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- numba_cuda/numba/cuda/tests/test_casting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index 02aacc047..302326f3b 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -25,7 +25,7 @@ def _run_cast_test(self, src_dtype, dst_dtype, values): blockspergrid = (src.size + threadsperblock - 1) // threadsperblock cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) - cuda.synchronize() + cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) cuda.synchronize() result = d_dst.copy_to_host() From 29cddca62abe519c2de0aabe3b7f12aa6fa13eda Mon Sep 17 00:00:00 2001 From: Srijan Upadhyay Date: Wed, 21 Jan 2026 11:37:42 +0530 Subject: [PATCH 12/12] Optimize test_casting: use device_array and remove redundant sync - Use cuda.device_array() instead of zeros+to_device for efficiency - Remove duplicate cuda.synchronize() calls (copy_to_host is already sync) - Eliminate unnecessary Host-to-Device memory transfer --- numba_cuda/numba/cuda/tests/test_casting.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/test_casting.py b/numba_cuda/numba/cuda/tests/test_casting.py index 302326f3b..4c347b220 100644 --- a/numba_cuda/numba/cuda/tests/test_casting.py +++ b/numba_cuda/numba/cuda/tests/test_casting.py @@ -16,17 +16,14 @@ class TestCudaCasting(CUDATestCase): def _run_cast_test(self, src_dtype, dst_dtype, values): src = np.array(values, dtype=src_dtype) - dst = np.zeros_like(src, dtype=dst_dtype) d_src = cuda.to_device(src) - d_dst = cuda.to_device(dst) + d_dst = cuda.device_array(src.shape, dtype=dst_dtype) threadsperblock = 128 blockspergrid = (src.size + threadsperblock - 1) // threadsperblock cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) - cast_kernel[blockspergrid, threadsperblock](d_src, d_dst) - cuda.synchronize() result = d_dst.copy_to_host() expected = src.astype(dst_dtype)