Skip to content

[Do not merge] Test KernelIntrinsics#2944

Open
christiangnrd wants to merge 6 commits into JuliaGPU:master from christiangnrd:intrinsics
Open

[Do not merge] Test KernelIntrinsics#2944
christiangnrd wants to merge 6 commits into JuliaGPU:master from christiangnrd:intrinsics

Conversation

@christiangnrd
Copy link
Copy Markdown
Member

[only tests]
[only benchmarks]

@github-actions
Copy link
Copy Markdown
Contributor

github-actions bot commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 5e39ab68e..7f8ec7ad4 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -162,29 +162,29 @@ end
 
 KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
 
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = cufunction(f, tt; name, kwargs...)
-    KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+    return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
 end
 
 function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, blocks=numworkgroups)
+    obj.kern(args...; threads = workgroupsize, blocks = numworkgroups)
     return nothing
 end
 
 
-function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
     kernel_config = launch_configuration(kernel.kern.fun)
 
-    Int(min(kernel_config.threads, max_work_items))
+    return Int(min(kernel_config.threads, max_work_items))
 end
 function KI.max_work_group_size(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
 end
 function KI.multiprocessor_count(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
 end
 
 ## indexing
@@ -199,7 +199,7 @@ end
 end
 
 @device_override @inline function KI.get_global_id()
-    return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+    return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
 end
 
 @device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 54fab2119..e0631d387 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     if j > length(Rother)
         return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
     block = KI.get_group_id().x
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end
 
         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                    threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
-                    # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+            threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+        )
+        # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
 
 include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
 
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
-    "CPU synchronization",
-    "fallback test: callable types",]))
+Testsuite.testsuite(
+    () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+        [
+            "CPU synchronization",
+            "fallback test: callable types",
+        ]
+    )
+)
 for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
     Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 802c832e5..11584ca92 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "CUDA.jl") == "CUDA.jl"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Distributed

Copy link
Copy Markdown
Contributor

@github-actions github-actions bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA.jl Benchmarks

Details
Benchmark suite Current: 3c75eb6 Previous: 9f56ee2 Ratio
latency/precompile 54337646854.5 ns 45061042309.5 ns 1.21
latency/ttfp 12957095931 ns 12778273638 ns 1.01
latency/import 3551763524 ns 3543310654 ns 1.00
integration/volumerhs 9438341.5 ns 9438152 ns 1.00
integration/byval/slices=1 145862 ns 145916 ns 1.00
integration/byval/slices=3 423025 ns 423037 ns 1.00
integration/byval/reference 143927 ns 143945 ns 1.00
integration/byval/slices=2 284696 ns 284549 ns 1.00
integration/cudadevrt 102606 ns 102670.5 ns 1.00
kernel/indexing 13705 ns 13342 ns 1.03
kernel/indexing_checked 14354 ns 14086 ns 1.02
kernel/occupancy 663.8220858895705 ns 660.9593749999999 ns 1.00
kernel/launch 2178.5555555555557 ns 2204 ns 0.99
kernel/rand 16127 ns 14561 ns 1.11
array/reverse/1d 18528 ns 18374 ns 1.01
array/reverse/2dL_inplace 66523 ns 66121 ns 1.01
array/reverse/1dL 69060 ns 68975 ns 1.00
array/reverse/2d 20665 ns 20697 ns 1.00
array/reverse/1d_inplace 8551.666666666666 ns 8587.333333333334 ns 1.00
array/reverse/2d_inplace 10725 ns 10398 ns 1.03
array/reverse/2dL 72862 ns 72780 ns 1.00
array/reverse/1dL_inplace 65960 ns 66028.5 ns 1.00
array/copy 18769 ns 18723 ns 1.00
array/iteration/findall/int 151676 ns 149865 ns 1.01
array/iteration/findall/bool 134533.5 ns 132264 ns 1.02
array/iteration/findfirst/int 83676 ns 83815 ns 1.00
array/iteration/findfirst/bool 81796 ns 81845 ns 1.00
array/iteration/scalar 69218.5 ns 67004 ns 1.03
array/iteration/logical 204523.5 ns 200939.5 ns 1.02
array/iteration/findmin/1d 86343.5 ns 88921.5 ns 0.97
array/iteration/findmin/2d 117623 ns 117673.5 ns 1.00
array/reductions/reduce/Int64/1d 44657 ns 43989 ns 1.02
array/reductions/reduce/Int64/dims=1 47418.5 ns 43251 ns 1.10
array/reductions/reduce/Int64/dims=2 66765.5 ns 59962 ns 1.11
array/reductions/reduce/Int64/dims=1L 88038.5 ns 87716 ns 1.00
array/reductions/reduce/Int64/dims=2L 87506 ns 85104 ns 1.03
array/reductions/reduce/Float32/1d 35566 ns 34799.5 ns 1.02
array/reductions/reduce/Float32/dims=1 42739 ns 49659.5 ns 0.86
array/reductions/reduce/Float32/dims=2 61141 ns 57379 ns 1.07
array/reductions/reduce/Float32/dims=1L 52574 ns 52018 ns 1.01
array/reductions/reduce/Float32/dims=2L 70620 ns 70083 ns 1.01
array/reductions/mapreduce/Int64/1d 44678 ns 43593 ns 1.02
array/reductions/mapreduce/Int64/dims=1 47434.5 ns 43176 ns 1.10
array/reductions/mapreduce/Int64/dims=2 66337 ns 60012 ns 1.11
array/reductions/mapreduce/Int64/dims=1L 87984 ns 87745 ns 1.00
array/reductions/mapreduce/Int64/dims=2L 87291 ns 85352.5 ns 1.02
array/reductions/mapreduce/Float32/1d 35288 ns 34798.5 ns 1.01
array/reductions/mapreduce/Float32/dims=1 42716 ns 49547 ns 0.86
array/reductions/mapreduce/Float32/dims=2 61166 ns 56871.5 ns 1.08
array/reductions/mapreduce/Float32/dims=1L 52387 ns 51976 ns 1.01
array/reductions/mapreduce/Float32/dims=2L 70448.5 ns 69534 ns 1.01
array/broadcast 20682 ns 20738 ns 1.00
array/copyto!/gpu_to_gpu 11274 ns 11258 ns 1.00
array/copyto!/cpu_to_gpu 215613 ns 217274 ns 0.99
array/copyto!/gpu_to_cpu 283122 ns 283488 ns 1.00
array/accumulate/Int64/1d 120812 ns 118668 ns 1.02
array/accumulate/Int64/dims=1 81479 ns 80440 ns 1.01
array/accumulate/Int64/dims=2 156886 ns 156264 ns 1.00
array/accumulate/Int64/dims=1L 1753125.5 ns 1694690 ns 1.03
array/accumulate/Int64/dims=2L 963051 ns 961398 ns 1.00
array/accumulate/Float32/1d 104207.5 ns 101158 ns 1.03
array/accumulate/Float32/dims=1 77898 ns 77281 ns 1.01
array/accumulate/Float32/dims=2 146734 ns 144244 ns 1.02
array/accumulate/Float32/dims=1L 1622772 ns 1585543 ns 1.02
array/accumulate/Float32/dims=2L 669505.5 ns 657740 ns 1.02
array/construct 1347.85 ns 1296.1 ns 1.04
array/random/randn/Float32 38901.5 ns 38521.5 ns 1.01
array/random/randn!/Float32 31520 ns 31518 ns 1.00
array/random/rand!/Int64 34377 ns 34237 ns 1.00
array/random/rand!/Float32 8547.333333333334 ns 8618.333333333334 ns 0.99
array/random/rand/Int64 36018 ns 37340 ns 0.96
array/random/rand/Float32 13130 ns 13131 ns 1.00
array/permutedims/4d 51657 ns 52877.5 ns 0.98
array/permutedims/2d 52161 ns 52537 ns 0.99
array/permutedims/3d 53114 ns 53135 ns 1.00
array/sorting/1d 2735846.5 ns 2735479 ns 1.00
array/sorting/by 3304793 ns 3304665.5 ns 1.00
array/sorting/2d 1069043 ns 1068975 ns 1.00
cuda/synchronization/stream/auto 1049.3 ns 1032.4 ns 1.02
cuda/synchronization/stream/nonblocking 7695 ns 7639.6 ns 1.01
cuda/synchronization/stream/blocking 831.7951807228916 ns 843.4102564102565 ns 0.99
cuda/synchronization/context/auto 1195.9 ns 1179 ns 1.01
cuda/synchronization/context/nonblocking 7504.3 ns 8045 ns 0.93
cuda/synchronization/context/blocking 971.4230769230769 ns 931.1724137931035 ns 1.04

This comment was automatically generated by workflow using github-action-benchmark.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant