Conversation
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/test/runtests.jl b/test/runtests.jl
index 4d5362c1b..96860ee43 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -340,7 +340,7 @@ try
end
end
@sync begin
- function recycle_worker(p, timeout=0)
+ function recycle_worker(p, timeout = 0)
if isdefined(CUDA, :to)
to = remotecall_fetch(p) do
CUDA.to
@@ -348,7 +348,7 @@ try
push!(timings, to)
end
- rmprocs(p, waitfor=timeout)
+ rmprocs(p, waitfor = timeout)
return nothing
end |
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: d2e643c | Previous: 6c6977f | Ratio |
|---|---|---|---|
| latency/precompile | 43810727453 ns | 43775122238.5 ns | 1.00 |
| latency/ttfp | 7411423048 ns | 7276778491 ns | 1.02 |
| latency/import | 3901104766 ns | 3836255124 ns | 1.02 |
| integration/volumerhs | 9595280 ns | 9623790.5 ns | 1.00 |
| integration/byval/slices=1 | 147327 ns | 146826 ns | 1.00 |
| integration/byval/slices=3 | 426418 ns | 426011 ns | 1.00 |
| integration/byval/reference | 145318 ns | 145073 ns | 1.00 |
| integration/byval/slices=2 | 286811 ns | 286240 ns | 1.00 |
| integration/cudadevrt | 103716 ns | 103460 ns | 1.00 |
| kernel/indexing | 14482.5 ns | 14196 ns | 1.02 |
| kernel/indexing_checked | 15186 ns | 15033 ns | 1.01 |
| kernel/occupancy | 679.6730769230769 ns | 670.506329113924 ns | 1.01 |
| kernel/launch | 2296.777777777778 ns | 2162.5555555555557 ns | 1.06 |
| kernel/rand | 18309 ns | 16555 ns | 1.11 |
| array/reverse/1d | 20279 ns | 19646 ns | 1.03 |
| array/reverse/2dL_inplace | 67007 ns | 66804 ns | 1.00 |
| array/reverse/1dL | 70533 ns | 69923 ns | 1.01 |
| array/reverse/2d | 22386 ns | 21677 ns | 1.03 |
| array/reverse/1d_inplace | 9883 ns | 9658 ns | 1.02 |
| array/reverse/2d_inplace | 13439 ns | 13323 ns | 1.01 |
| array/reverse/2dL | 74291 ns | 73803 ns | 1.01 |
| array/reverse/1dL_inplace | 67078 ns | 66812 ns | 1.00 |
| array/copy | 21075 ns | 20306 ns | 1.04 |
| array/iteration/findall/int | 158213 ns | 157164 ns | 1.01 |
| array/iteration/findall/bool | 139853 ns | 139633 ns | 1.00 |
| array/iteration/findfirst/int | 161948.5 ns | 160554.5 ns | 1.01 |
| array/iteration/findfirst/bool | 162627 ns | 160957 ns | 1.01 |
| array/iteration/scalar | 73132.5 ns | 72124 ns | 1.01 |
| array/iteration/logical | 220878 ns | 215036.5 ns | 1.03 |
| array/iteration/findmin/1d | 51462 ns | 49445 ns | 1.04 |
| array/iteration/findmin/2d | 97306 ns | 96493.5 ns | 1.01 |
| array/reductions/reduce/Int64/1d | 44372 ns | 42960 ns | 1.03 |
| array/reductions/reduce/Int64/dims=1 | 46703 ns | 44742.5 ns | 1.04 |
| array/reductions/reduce/Int64/dims=2 | 62221 ns | 61453 ns | 1.01 |
| array/reductions/reduce/Int64/dims=1L | 89635 ns | 88951 ns | 1.01 |
| array/reductions/reduce/Int64/dims=2L | 88963 ns | 88014.5 ns | 1.01 |
| array/reductions/reduce/Float32/1d | 37983 ns | 35769 ns | 1.06 |
| array/reductions/reduce/Float32/dims=1 | 52746 ns | 51586 ns | 1.02 |
| array/reductions/reduce/Float32/dims=2 | 60519 ns | 59511 ns | 1.02 |
| array/reductions/reduce/Float32/dims=1L | 52633 ns | 52474 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2L | 73068 ns | 71419 ns | 1.02 |
| array/reductions/mapreduce/Int64/1d | 44274 ns | 43189 ns | 1.03 |
| array/reductions/mapreduce/Int64/dims=1 | 45309 ns | 46540.5 ns | 0.97 |
| array/reductions/mapreduce/Int64/dims=2 | 62373 ns | 61279.5 ns | 1.02 |
| array/reductions/mapreduce/Int64/dims=1L | 89419 ns | 88854 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=2L | 89007 ns | 88014 ns | 1.01 |
| array/reductions/mapreduce/Float32/1d | 38576 ns | 36287 ns | 1.06 |
| array/reductions/mapreduce/Float32/dims=1 | 52724 ns | 41466 ns | 1.27 |
| array/reductions/mapreduce/Float32/dims=2 | 60482 ns | 59744 ns | 1.01 |
| array/reductions/mapreduce/Float32/dims=1L | 52978 ns | 52550 ns | 1.01 |
| array/reductions/mapreduce/Float32/dims=2L | 73209 ns | 71985 ns | 1.02 |
| array/broadcast | 20586 ns | 20047 ns | 1.03 |
| array/copyto!/gpu_to_gpu | 13223 ns | 11191 ns | 1.18 |
| array/copyto!/cpu_to_gpu | 218360 ns | 213964 ns | 1.02 |
| array/copyto!/gpu_to_cpu | 287664.5 ns | 284661.5 ns | 1.01 |
| array/accumulate/Int64/1d | 125306 ns | 124888 ns | 1.00 |
| array/accumulate/Int64/dims=1 | 83775 ns | 83130 ns | 1.01 |
| array/accumulate/Int64/dims=2 | 158237 ns | 157680 ns | 1.00 |
| array/accumulate/Int64/dims=1L | 1710481 ns | 1709578 ns | 1.00 |
| array/accumulate/Int64/dims=2L | 967325 ns | 966045 ns | 1.00 |
| array/accumulate/Float32/1d | 109910 ns | 108910 ns | 1.01 |
| array/accumulate/Float32/dims=1 | 80965 ns | 80564 ns | 1.00 |
| array/accumulate/Float32/dims=2 | 147964 ns | 147715 ns | 1.00 |
| array/accumulate/Float32/dims=1L | 1619372 ns | 1618612 ns | 1.00 |
| array/accumulate/Float32/dims=2L | 698753 ns | 698318 ns | 1.00 |
| array/construct | 1283.4 ns | 1287.5 ns | 1.00 |
| array/random/randn/Float32 | 48408 ns | 43976 ns | 1.10 |
| array/random/randn!/Float32 | 25279 ns | 24816 ns | 1.02 |
| array/random/rand!/Int64 | 27472 ns | 27267 ns | 1.01 |
| array/random/rand!/Float32 | 8734.333333333334 ns | 8653.333333333334 ns | 1.01 |
| array/random/rand/Int64 | 38402 ns | 38285 ns | 1.00 |
| array/random/rand/Float32 | 13489.5 ns | 13026 ns | 1.04 |
| array/permutedims/4d | 60306 ns | 60152.5 ns | 1.00 |
| array/permutedims/2d | 54290 ns | 53934 ns | 1.01 |
| array/permutedims/3d | 55175 ns | 54649.5 ns | 1.01 |
| array/sorting/1d | 2759273.5 ns | 2757180 ns | 1.00 |
| array/sorting/by | 3345456 ns | 3343619 ns | 1.00 |
| array/sorting/2d | 1081461 ns | 1080730 ns | 1.00 |
| cuda/synchronization/stream/auto | 1024.9 ns | 1029.6 ns | 1.00 |
| cuda/synchronization/stream/nonblocking | 7647.3 ns | 7270.700000000001 ns | 1.05 |
| cuda/synchronization/stream/blocking | 826.7578947368421 ns | 850.7415730337078 ns | 0.97 |
| cuda/synchronization/context/auto | 1172.5 ns | 1157 ns | 1.01 |
| cuda/synchronization/context/nonblocking | 7160 ns | 6964 ns | 1.03 |
| cuda/synchronization/context/blocking | 907.3777777777777 ns | 888.5208333333334 ns | 1.02 |
This comment was automatically generated by workflow using github-action-benchmark.
|
Didn't help much. Going to try to figure out why our test times have regressed so badly. Looking at CI, on Julia 1.10:
whereas on 1.11:
Nearly 2 minutes worse. Something weird is going on! |
|
Looking at a single test file, for Julia 1.10: For Julia 1.11: So it looks like compilation time is a big driver of some of this. |
|
I did some more digging on this file, and we're spending 30s on the first call here. It's mostly compiling the GPUArrays |
This seemed to help the test duration for me locally, but it remains to be seen how it affects CI.