[Do not merge] Test KernelIntrinsics#2944
Open
christiangnrd wants to merge 6 commits into JuliaGPU:master from
Open
[Do not merge] Test KernelIntrinsics#2944 — christiangnrd wants to merge 6 commits into JuliaGPU:master from
christiangnrd wants to merge 6 commits into JuliaGPU:master from
Conversation
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 5e39ab68e..7f8ec7ad4 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -162,29 +162,29 @@ end
KI.argconvert(::CUDABackend, arg) = cudaconvert(arg)
-function KI.kernel_function(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::CUDABackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
kern = cufunction(f, tt; name, kwargs...)
- KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+ return KI.Kernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
end
function (obj::KI.Kernel{CUDABackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)
- obj.kern(args...; threads=workgroupsize, blocks=numworkgroups)
+ obj.kern(args...; threads = workgroupsize, blocks = numworkgroups)
return nothing
end
-function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
kernel_config = launch_configuration(kernel.kern.fun)
- Int(min(kernel_config.threads, max_work_items))
+ return Int(min(kernel_config.threads, max_work_items))
end
function KI.max_work_group_size(::CUDABackend)::Int
- Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+ return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
end
function KI.multiprocessor_count(::CUDABackend)::Int
- Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+ return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
end
## indexing
@@ -199,7 +199,7 @@ end
end
@device_override @inline function KI.get_global_id()
- return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+ return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
end
@device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 54fab2119..e0631d387 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
temp = CuDynamicSharedArray(T, (2*threads,))
# iterate the main dimension using threads and the first block dimension
- i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+ i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
- j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+ j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
if j > length(Rother)
return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
block = KI.get_group_id().x
# iterate the main dimension using threads and the first block dimension
- i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+ i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
- j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+ j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
@inbounds if i <= length(Rdim) && j <= length(Rother)
I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
@inbounds global_random_counters()[warpId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId%UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
end
partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
- threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
- # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+ threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+ )
+ # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
GPUArrays.mapreducedim!(identity, op, R, partial; init)
end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
- "CPU synchronization",
- "fallback test: callable types",]))
+Testsuite.testsuite(
+ () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+ [
+ "CPU synchronization",
+ "fallback test: callable types",
+ ]
+ )
+)
for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
end
diff --git a/test/runtests.jl b/test/runtests.jl
index 802c832e5..11584ca92 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "CUDA.jl") == "CUDA.jl"
using Pkg
- Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+ Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
end
using Distributed |
edaeb41 to
8e3e1d4
Compare
christiangnrd
commented
Oct 22, 2025
christiangnrd
commented
Oct 22, 2025
christiangnrd
commented
Oct 22, 2025
497ef42 to
506e02d
Compare
Contributor
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 3c75eb6 | Previous: 9f56ee2 | Ratio |
|---|---|---|---|
latency/precompile |
54337646854.5 ns |
45061042309.5 ns |
1.21 |
latency/ttfp |
12957095931 ns |
12778273638 ns |
1.01 |
latency/import |
3551763524 ns |
3543310654 ns |
1.00 |
integration/volumerhs |
9438341.5 ns |
9438152 ns |
1.00 |
integration/byval/slices=1 |
145862 ns |
145916 ns |
1.00 |
integration/byval/slices=3 |
423025 ns |
423037 ns |
1.00 |
integration/byval/reference |
143927 ns |
143945 ns |
1.00 |
integration/byval/slices=2 |
284696 ns |
284549 ns |
1.00 |
integration/cudadevrt |
102606 ns |
102670.5 ns |
1.00 |
kernel/indexing |
13705 ns |
13342 ns |
1.03 |
kernel/indexing_checked |
14354 ns |
14086 ns |
1.02 |
kernel/occupancy |
663.8220858895705 ns |
660.9593749999999 ns |
1.00 |
kernel/launch |
2178.5555555555557 ns |
2204 ns |
0.99 |
kernel/rand |
16127 ns |
14561 ns |
1.11 |
array/reverse/1d |
18528 ns |
18374 ns |
1.01 |
array/reverse/2dL_inplace |
66523 ns |
66121 ns |
1.01 |
array/reverse/1dL |
69060 ns |
68975 ns |
1.00 |
array/reverse/2d |
20665 ns |
20697 ns |
1.00 |
array/reverse/1d_inplace |
8551.666666666666 ns |
8587.333333333334 ns |
1.00 |
array/reverse/2d_inplace |
10725 ns |
10398 ns |
1.03 |
array/reverse/2dL |
72862 ns |
72780 ns |
1.00 |
array/reverse/1dL_inplace |
65960 ns |
66028.5 ns |
1.00 |
array/copy |
18769 ns |
18723 ns |
1.00 |
array/iteration/findall/int |
151676 ns |
149865 ns |
1.01 |
array/iteration/findall/bool |
134533.5 ns |
132264 ns |
1.02 |
array/iteration/findfirst/int |
83676 ns |
83815 ns |
1.00 |
array/iteration/findfirst/bool |
81796 ns |
81845 ns |
1.00 |
array/iteration/scalar |
69218.5 ns |
67004 ns |
1.03 |
array/iteration/logical |
204523.5 ns |
200939.5 ns |
1.02 |
array/iteration/findmin/1d |
86343.5 ns |
88921.5 ns |
0.97 |
array/iteration/findmin/2d |
117623 ns |
117673.5 ns |
1.00 |
array/reductions/reduce/Int64/1d |
44657 ns |
43989 ns |
1.02 |
array/reductions/reduce/Int64/dims=1 |
47418.5 ns |
43251 ns |
1.10 |
array/reductions/reduce/Int64/dims=2 |
66765.5 ns |
59962 ns |
1.11 |
array/reductions/reduce/Int64/dims=1L |
88038.5 ns |
87716 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
87506 ns |
85104 ns |
1.03 |
array/reductions/reduce/Float32/1d |
35566 ns |
34799.5 ns |
1.02 |
array/reductions/reduce/Float32/dims=1 |
42739 ns |
49659.5 ns |
0.86 |
array/reductions/reduce/Float32/dims=2 |
61141 ns |
57379 ns |
1.07 |
array/reductions/reduce/Float32/dims=1L |
52574 ns |
52018 ns |
1.01 |
array/reductions/reduce/Float32/dims=2L |
70620 ns |
70083 ns |
1.01 |
array/reductions/mapreduce/Int64/1d |
44678 ns |
43593 ns |
1.02 |
array/reductions/mapreduce/Int64/dims=1 |
47434.5 ns |
43176 ns |
1.10 |
array/reductions/mapreduce/Int64/dims=2 |
66337 ns |
60012 ns |
1.11 |
array/reductions/mapreduce/Int64/dims=1L |
87984 ns |
87745 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2L |
87291 ns |
85352.5 ns |
1.02 |
array/reductions/mapreduce/Float32/1d |
35288 ns |
34798.5 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=1 |
42716 ns |
49547 ns |
0.86 |
array/reductions/mapreduce/Float32/dims=2 |
61166 ns |
56871.5 ns |
1.08 |
array/reductions/mapreduce/Float32/dims=1L |
52387 ns |
51976 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=2L |
70448.5 ns |
69534 ns |
1.01 |
array/broadcast |
20682 ns |
20738 ns |
1.00 |
array/copyto!/gpu_to_gpu |
11274 ns |
11258 ns |
1.00 |
array/copyto!/cpu_to_gpu |
215613 ns |
217274 ns |
0.99 |
array/copyto!/gpu_to_cpu |
283122 ns |
283488 ns |
1.00 |
array/accumulate/Int64/1d |
120812 ns |
118668 ns |
1.02 |
array/accumulate/Int64/dims=1 |
81479 ns |
80440 ns |
1.01 |
array/accumulate/Int64/dims=2 |
156886 ns |
156264 ns |
1.00 |
array/accumulate/Int64/dims=1L |
1753125.5 ns |
1694690 ns |
1.03 |
array/accumulate/Int64/dims=2L |
963051 ns |
961398 ns |
1.00 |
array/accumulate/Float32/1d |
104207.5 ns |
101158 ns |
1.03 |
array/accumulate/Float32/dims=1 |
77898 ns |
77281 ns |
1.01 |
array/accumulate/Float32/dims=2 |
146734 ns |
144244 ns |
1.02 |
array/accumulate/Float32/dims=1L |
1622772 ns |
1585543 ns |
1.02 |
array/accumulate/Float32/dims=2L |
669505.5 ns |
657740 ns |
1.02 |
array/construct |
1347.85 ns |
1296.1 ns |
1.04 |
array/random/randn/Float32 |
38901.5 ns |
38521.5 ns |
1.01 |
array/random/randn!/Float32 |
31520 ns |
31518 ns |
1.00 |
array/random/rand!/Int64 |
34377 ns |
34237 ns |
1.00 |
array/random/rand!/Float32 |
8547.333333333334 ns |
8618.333333333334 ns |
0.99 |
array/random/rand/Int64 |
36018 ns |
37340 ns |
0.96 |
array/random/rand/Float32 |
13130 ns |
13131 ns |
1.00 |
array/permutedims/4d |
51657 ns |
52877.5 ns |
0.98 |
array/permutedims/2d |
52161 ns |
52537 ns |
0.99 |
array/permutedims/3d |
53114 ns |
53135 ns |
1.00 |
array/sorting/1d |
2735846.5 ns |
2735479 ns |
1.00 |
array/sorting/by |
3304793 ns |
3304665.5 ns |
1.00 |
array/sorting/2d |
1069043 ns |
1068975 ns |
1.00 |
cuda/synchronization/stream/auto |
1049.3 ns |
1032.4 ns |
1.02 |
cuda/synchronization/stream/nonblocking |
7695 ns |
7639.6 ns |
1.01 |
cuda/synchronization/stream/blocking |
831.7951807228916 ns |
843.4102564102565 ns |
0.99 |
cuda/synchronization/context/auto |
1195.9 ns |
1179 ns |
1.01 |
cuda/synchronization/context/nonblocking |
7504.3 ns |
8045 ns |
0.93 |
cuda/synchronization/context/blocking |
971.4230769230769 ns |
931.1724137931035 ns |
1.04 |
This comment was automatically generated by workflow using github-action-benchmark.
aef3728 to
fef539a
Compare
e2d7489 to
180f4a5
Compare
180f4a5 to
d4c271b
Compare
80b75b2 to
14f29e5
Compare
14f29e5 to
b64dcd6
Compare
b64dcd6 to
3c75eb6
Compare
This reverts commit d4c271b.
3c75eb6 to
487ad3b
Compare
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
[only tests]
[only benchmarks]