Skip to content

[Do not merge] Test KernelIntrinsics#2944

Open
christiangnrd wants to merge 6 commits into JuliaGPU:master from christiangnrd:intrinsics
Open

[Do not merge] Test KernelIntrinsics#2944
christiangnrd wants to merge 6 commits into JuliaGPU:master from christiangnrd:intrinsics

Conversation

@christiangnrd
Copy link
Copy Markdown
Member

[only tests]
[only benchmarks]

@github-actions
Copy link
Copy Markdown
Contributor

github-actions bot commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 5e39ab68e..7f8ec7ad4 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -162,29 +162,29 @@ end
 
 KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
 
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = cufunction(f, tt; name, kwargs...)
-    KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+    return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
 end
 
 function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, blocks=numworkgroups)
+    obj.kern(args...; threads = workgroupsize, blocks = numworkgroups)
     return nothing
 end
 
 
-function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
     kernel_config = launch_configuration(kernel.kern.fun)
 
-    Int(min(kernel_config.threads, max_work_items))
+    return Int(min(kernel_config.threads, max_work_items))
 end
 function KI.max_work_group_size(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
 end
 function KI.multiprocessor_count(::CUDABackend)::Int
-    Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+    return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
 end
 
 ## indexing
@@ -199,7 +199,7 @@ end
 end
 
 @device_override @inline function KI.get_global_id()
-    return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+    return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
 end
 
 @device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 54fab2119..e0631d387 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     if j > length(Rother)
         return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
     block = KI.get_group_id().x
 
     # iterate the main dimension using threads and the first block dimension
-    i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+    i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
     # iterate the other dimensions using the remaining block dimensions
-    j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+    j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
 
     @inbounds if i <= length(Rdim) && j <= length(Rother)
         I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
         @inbounds global_random_counters()[warpId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId%UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end
 
         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                    threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
-                    # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+            threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+        )
+        # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
 
 include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
 
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
-    "CPU synchronization",
-    "fallback test: callable types",]))
+Testsuite.testsuite(
+    () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+        [
+            "CPU synchronization",
+            "fallback test: callable types",
+        ]
+    )
+)
 for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
     Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 802c832e5..11584ca92 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "CUDA.jl") == "CUDA.jl"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Distributed

Copy link
Copy Markdown
Contributor

@github-actions github-actions bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA.jl Benchmarks

Details
Benchmark suite Current: 3c75eb6 Previous: 9f56ee2 Ratio
latency/precompile 54337646854.5 ns 45061042309.5 ns 1.21
latency/ttfp 12957095931 ns 12778273638 ns 1.01
latency/import 3551763524 ns 3543310654 ns 1.00
integration/volumerhs 9438341.5 ns 9438152 ns 1.00
integration/byval/slices=1 145862 ns 145916 ns 1.00
integration/byval/slices=3 423025 ns 423037 ns 1.00
integration/byval/reference 143927 ns 143945 ns 1.00
integration/byval/slices=2 284696 ns 284549 ns 1.00
integration/cudadevrt 102606 ns 102670.5 ns 1.00
kernel/indexing 13705 ns 13342 ns 1.03
kernel/indexing_checked 14354 ns 14086 ns 1.02
kernel/occupancy 663.8220858895705 ns 660.9593749999999 ns 1.00
kernel/launch 2178.5555555555557 ns 2204 ns 0.99
kernel/rand 16127 ns 14561 ns 1.11
array/reverse/1d 18528 ns 18374 ns 1.01
array/reverse/2dL_inplace 66523 ns 66121 ns 1.01
array/reverse/1dL 69060 ns 68975 ns 1.00
array/reverse/2d 20665 ns 20697 ns 1.00
array/reverse/1d_inplace 8551.666666666666 ns 8587.333333333334 ns 1.00
array/reverse/2d_inplace 10725 ns 10398 ns 1.03
array/reverse/2dL 72862 ns 72780 ns 1.00
array/reverse/1dL_inplace 65960 ns 66028.5 ns 1.00
array/copy 18769 ns 18723 ns 1.00
array/iteration/findall/int 151676 ns 149865 ns 1.01
array/iteration/findall/bool 134533.5 ns 132264 ns 1.02
array/iteration/findfirst/int 83676 ns 83815 ns 1.00
array/iteration/findfirst/bool 81796 ns 81845 ns 1.00
array/iteration/scalar 69218.5 ns 67004 ns 1.03
array/iteration/logical 204523.5 ns 200939.5 ns 1.02
array/iteration/findmin/1d 86343.5 ns 88921.5 ns 0.97
array/iteration/findmin/2d 117623 ns 117673.5 ns 1.00
array/reductions/reduce/Int64/1d 44657 ns 43989 ns 1.02
array/reductions/reduce/Int64/dims=1 47418.5 ns 43251 ns 1.10
array/reductions/reduce/Int64/dims=2 66765.5 ns 59962 ns 1.11
array/reductions/reduce/Int64/dims=1L 88038.5 ns 87716 ns 1.00
array/reductions/reduce/Int64/dims=2L 87506 ns 85104 ns 1.03
array/reductions/reduce/Float32/1d 35566 ns 34799.5 ns 1.02
array/reductions/reduce/Float32/dims=1 42739 ns 49659.5 ns 0.86
array/reductions/reduce/Float32/dims=2 61141 ns 57379 ns 1.07
array/reductions/reduce/Float32/dims=1L 52574 ns 52018 ns 1.01
array/reductions/reduce/Float32/dims=2L 70620 ns 70083 ns 1.01
array/reductions/mapreduce/Int64/1d 44678 ns 43593 ns 1.02
array/reductions/mapreduce/Int64/dims=1 47434.5 ns 43176 ns 1.10
array/reductions/mapreduce/Int64/dims=2 66337 ns 60012 ns 1.11
array/reductions/mapreduce/Int64/dims=1L 87984 ns 87745 ns 1.00
array/reductions/mapreduce/Int64/dims=2L 87291 ns 85352.5 ns 1.02
array/reductions/mapreduce/Float32/1d 35288 ns 34798.5 ns 1.01
array/reductions/mapreduce/Float32/dims=1 42716 ns 49547 ns 0.86
array/reductions/mapreduce/Float32/dims=2 61166 ns 56871.5 ns 1.08
array/reductions/mapreduce/Float32/dims=1L 52387 ns 51976 ns 1.01
array/reductions/mapreduce/Float32/dims=2L 70448.5 ns 69534 ns 1.01
array/broadcast 20682 ns 20738 ns 1.00
array/copyto!/gpu_to_gpu 11274 ns 11258 ns 1.00
array/copyto!/cpu_to_gpu 215613 ns 217274 ns 0.99
array/copyto!/gpu_to_cpu 283122 ns 283488 ns 1.00
array/accumulate/Int64/1d 120812 ns 118668 ns 1.02
array/accumulate/Int64/dims=1 81479 ns 80440 ns 1.01
array/accumulate/Int64/dims=2 156886 ns 156264 ns 1.00
array/accumulate/Int64/dims=1L 1753125.5 ns 1694690 ns 1.03
array/accumulate/Int64/dims=2L 963051 ns 961398 ns 1.00
array/accumulate/Float32/1d 104207.5 ns 101158 ns 1.03
array/accumulate/Float32/dims=1 77898 ns 77281 ns 1.01
array/accumulate/Float32/dims=2 146734 ns 144244 ns 1.02
array/accumulate/Float32/dims=1L 1622772 ns 1585543 ns 1.02
array/accumulate/Float32/dims=2L 669505.5 ns 657740 ns 1.02
array/construct 1347.85 ns 1296.1 ns 1.04
array/random/randn/Float32 38901.5 ns 38521.5 ns 1.01
array/random/randn!/Float32 31520 ns 31518 ns 1.00
array/random/rand!/Int64 34377 ns 34237 ns 1.00
array/random/rand!/Float32 8547.333333333334 ns 8618.333333333334 ns 0.99
array/random/rand/Int64 36018 ns 37340 ns 0.96
array/random/rand/Float32 13130 ns 13131 ns 1.00
array/permutedims/4d 51657 ns 52877.5 ns 0.98
array/permutedims/2d 52161 ns 52537 ns 0.99
array/permutedims/3d 53114 ns 53135 ns 1.00
array/sorting/1d 2735846.5 ns 2735479 ns 1.00
array/sorting/by 3304793 ns 3304665.5 ns 1.00
array/sorting/2d 1069043 ns 1068975 ns 1.00
cuda/synchronization/stream/auto 1049.3 ns 1032.4 ns 1.02
cuda/synchronization/stream/nonblocking 7695 ns 7639.6 ns 1.01
cuda/synchronization/stream/blocking 831.7951807228916 ns 843.4102564102565 ns 0.99
cuda/synchronization/context/auto 1195.9 ns 1179 ns 1.01
cuda/synchronization/context/nonblocking 7504.3 ns 8045 ns 0.93
cuda/synchronization/context/blocking 971.4230769230769 ns 931.1724137931035 ns 1.04

This comment was automatically generated by workflow using github-action-benchmark.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant