Wrapper for Blocksparse CuTensor code#3057
Conversation
… to make it a union type of CuTensorBS and AbstractArray?
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/lib/cutensor/src/blocksparse/interfaces.jl b/lib/cutensor/src/blocksparse/interfaces.jl
index c6eef0e5b..0a479ddf8 100644
--- a/lib/cutensor/src/blocksparse/interfaces.jl
+++ b/lib/cutensor/src/blocksparse/interfaces.jl
@@ -1,4 +1,4 @@
-## For now call contract in ITensor and rely on UnallocatedArrays to make
+## For now call contract in ITensor and rely on UnallocatedArrays to make
## C in a dry-run of the contraction.
# function Base.:(*)(A::CuTensorBS, B::CuTensorBs)
# tC = promote_type(eltype(A), eltype(B))
@@ -18,11 +18,13 @@
using LinearAlgebra
function LinearAlgebra.mul!(C::CuTensorBS, A::CuTensorBS, B::CuTensorBS, α::Number, β::Number)
- contract!(α,
- A, A.inds, CUTENSOR_OP_IDENTITY,
- B, B.inds, CUTENSOR_OP_IDENTITY,
- β,
- C, C.inds, CUTENSOR_OP_IDENTITY,
- CUTENSOR_OP_IDENTITY; jit=CUTENSOR_JIT_MODE_DEFAULT)
- return C
-end
\ No newline at end of file
+ contract!(
+ α,
+ A, A.inds, CUTENSOR_OP_IDENTITY,
+ B, B.inds, CUTENSOR_OP_IDENTITY,
+ β,
+ C, C.inds, CUTENSOR_OP_IDENTITY,
+ CUTENSOR_OP_IDENTITY; jit = CUTENSOR_JIT_MODE_DEFAULT
+ )
+ return C
+end
diff --git a/lib/cutensor/src/blocksparse/operations.jl b/lib/cutensor/src/blocksparse/operations.jl
index 19542e5de..0f98c92ef 100644
--- a/lib/cutensor/src/blocksparse/operations.jl
+++ b/lib/cutensor/src/blocksparse/operations.jl
@@ -9,23 +9,26 @@ function contract!(
@nospecialize(beta::Number),
@nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
opOut::cutensorOperator_t;
- jit::cutensorJitMode_t=JIT_MODE_NONE,
- workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
- algo::cutensorAlgo_t=ALGO_DEFAULT,
- compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing,
- plan::Union{CuTensorPlan, Nothing}=nothing)
+ jit::cutensorJitMode_t = JIT_MODE_NONE,
+ workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+ algo::cutensorAlgo_t = ALGO_DEFAULT,
+ compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing,
+ plan::Union{CuTensorPlan, Nothing} = nothing
+ )
actual_plan = if plan === nothing
- plan_contraction(A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
- jit, workspace, algo, compute_type)
+ plan_contraction(
+ A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
+ jit, workspace, algo, compute_type
+ )
else
plan
end
contractBS!(actual_plan, alpha, nonzero_blocks(A), nonzero_blocks(B), beta, nonzero_blocks(C))
-
+
if plan === nothing
- CUDA.unsafe_free!(actual_plan)
+ CUDA.unsafe_free!(actual_plan)
end
return C
@@ -33,12 +36,14 @@ end
## This function assumes A, B, and C are Arrays of pointers to CuArrays.
## Please overwrite the `nonzero_blocks` function for your datatype to access this function from contract!
-function contractBS!(plan::CuTensorPlan,
- @nospecialize(alpha::Number),
- @nospecialize(A::AbstractArray),
- @nospecialize(B::AbstractArray),
- @nospecialize(beta::Number),
- @nospecialize(C::AbstractArray))
+function contractBS!(
+ plan::CuTensorPlan,
+ @nospecialize(alpha::Number),
+ @nospecialize(A::AbstractArray),
+ @nospecialize(B::AbstractArray),
+ @nospecialize(beta::Number),
+ @nospecialize(C::AbstractArray)
+ )
scalar_type = plan.scalar_type
# Extract GPU pointers from each CuArray block
@@ -46,11 +51,13 @@ function contractBS!(plan::CuTensorPlan,
A_ptrs = CuPtr{Cvoid}[pointer(block) for block in A]
B_ptrs = CuPtr{Cvoid}[pointer(block) for block in B]
C_ptrs = CuPtr{Cvoid}[pointer(block) for block in C]
-
- cutensorBlockSparseContract(handle(), plan,
- Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
- Ref{scalar_type}(beta), C_ptrs, C_ptrs,
- plan.workspace, sizeof(plan.workspace), stream())
+
+ cutensorBlockSparseContract(
+ handle(), plan,
+ Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
+ Ref{scalar_type}(beta), C_ptrs, C_ptrs,
+ plan.workspace, sizeof(plan.workspace), stream()
+ )
synchronize(stream())
return C
end
@@ -60,21 +67,22 @@ function plan_contraction(
@nospecialize(B), Binds::ModeType, opB::cutensorOperator_t,
@nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
opOut::cutensorOperator_t;
- jit::cutensorJitMode_t=JIT_MODE_NONE,
- workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
- algo::cutensorAlgo_t=ALGO_DEFAULT,
- compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing)
+ jit::cutensorJitMode_t = JIT_MODE_NONE,
+ workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+ algo::cutensorAlgo_t = ALGO_DEFAULT,
+ compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing
+ )
!is_unary(opA) && throw(ArgumentError("opA must be a unary op!"))
!is_unary(opB) && throw(ArgumentError("opB must be a unary op!"))
!is_unary(opC) && throw(ArgumentError("opC must be a unary op!"))
!is_unary(opOut) && throw(ArgumentError("opOut must be a unary op!"))
-
+
descA = CuTensorBSDescriptor(A)
descB = CuTensorBSDescriptor(B)
descC = CuTensorBSDescriptor(C)
# for now, D must be identical to C (and thus, descD must be identical to descC)
-
+
modeA = collect(Cint, Ainds)
modeB = collect(Cint, Binds)
modeC = collect(Cint, Cinds)
@@ -87,17 +95,19 @@ function plan_contraction(
desc = Ref{cutensorOperationDescriptor_t}()
- cutensorCreateBlockSparseContraction(handle(),
- desc,
- descA, modeA, opA,
- descB, modeB, opB,
- descC, modeC, opC,
- descC, modeC, actual_compute_type)
+ cutensorCreateBlockSparseContraction(
+ handle(),
+ desc,
+ descA, modeA, opA,
+ descB, modeB, opB,
+ descC, modeC, opC,
+ descC, modeC, actual_compute_type
+ )
plan_pref = Ref{cutensorPlanPreference_t}()
cutensorCreatePlanPreference(handle(), plan_pref, algo, jit)
- plan = CuTensorPlan(desc[], plan_pref[]; workspacePref=workspace)
+ plan = CuTensorPlan(desc[], plan_pref[]; workspacePref = workspace)
# cutensorDestroyOperationDescriptor(desc[])
cutensorDestroyPlanPreference(plan_pref[])
return plan
diff --git a/lib/cutensor/src/blocksparse/types.jl b/lib/cutensor/src/blocksparse/types.jl
index 292dc4d00..41cbebdbd 100644
--- a/lib/cutensor/src/blocksparse/types.jl
+++ b/lib/cutensor/src/blocksparse/types.jl
@@ -12,20 +12,26 @@ mutable struct CuTensorBS{T, N}
## This expects a Vector{Tuple(Int)} right now
nonzero_block_coords
- function CuTensorBS{T, N}(nonzero_data::Vector{<:CuArray},
- blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number, N}
+ function CuTensorBS{T, N}(
+ nonzero_data::Vector{<:CuArray},
+ blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector
+ ) where {T <: Number, N}
CuArrayT = eltype(nonzero_data)
@assert eltype(CuArrayT) == T
# @assert ndims(CuArrayT) == N
@assert length(block_extents) == N
- new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
+ return new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
end
end
-function CuTensorBS(nonzero_data::Vector{<:CuArray{T}},
- blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number}
- CuTensorBS{T,length(block_extents)}(nonzero_data,
- blocks_per_mode, block_extents, nonzero_block_coords, inds)
+function CuTensorBS(
+ nonzero_data::Vector{<:CuArray{T}},
+ blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector
+ ) where {T <: Number}
+ return CuTensorBS{T, length(block_extents)}(
+ nonzero_data,
+ blocks_per_mode, block_extents, nonzero_block_coords, inds
+ )
end
# array interface
function Base.size(T::CuTensorBS)
@@ -39,8 +45,8 @@ Base.strides(T::CuTensorBS) = vcat([[st...] for st in strides.(T.nonzero_data)].
Base.eltype(T::CuTensorBS) = eltype(eltype(T.nonzero_data))
function block_extents(T::CuTensorBS)
- extents = Vector{Int64}()
-
+ extents = Vector{Int64}()
+
for ex in T.block_extents
extents = vcat(extents, ex...)
end
@@ -66,18 +72,21 @@ mutable struct CuTensorBSDescriptor
handle::cutensorBlockSparseTensorDescriptor_t
# inner constructor handles creation and finalizer of the descriptor
function CuTensorBSDescriptor(
- numModes,
- numNonZeroBlocks,
- numSectionsPerMode,
- extent,
- nonZeroCoordinates,
- stride,
- eltype)
+ numModes,
+ numNonZeroBlocks,
+ numSectionsPerMode,
+ extent,
+ nonZeroCoordinates,
+ stride,
+ eltype
+ )
desc = Ref{cuTENSOR.cutensorBlockSparseTensorDescriptor_t}()
- cutensorCreateBlockSparseTensorDescriptor(handle(), desc,
- numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
- stride, eltype)
+ cutensorCreateBlockSparseTensorDescriptor(
+ handle(), desc,
+ numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
+ stride, eltype
+ )
obj = new(desc[])
finalizer(unsafe_destroy!, obj)
@@ -86,12 +95,13 @@ mutable struct CuTensorBSDescriptor
end
function CuTensorBSDescriptor(
- numModes,
- numNonZeroBlocks,
- numSectionsPerMode,
- extent,
- nonZeroCoordinates,
- eltype)
+ numModes,
+ numNonZeroBlocks,
+ numSectionsPerMode,
+ extent,
+ nonZeroCoordinates,
+ eltype
+ )
return CuTensorBSDescriptor(numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates, C_NULL, eltype)
end
@@ -101,7 +111,7 @@ Base.show(io::IO, desc::CuTensorBSDescriptor) = @printf(io, "CuTensorBSDescripto
Base.unsafe_convert(::Type{cutensorBlockSparseTensorDescriptor_t}, obj::CuTensorBSDescriptor) = obj.handle
function unsafe_destroy!(obj::CuTensorBSDescriptor)
- cutensorDestroyBlockSparseTensorDescriptor(obj)
+ return cutensorDestroyBlockSparseTensorDescriptor(obj)
end
## Descriptor function for CuTensorBS type. Please overwrite for custom objects
@@ -110,11 +120,13 @@ function CuTensorBSDescriptor(A::CuTensorBS)
numNonZeroBlocks = Int64(length(A.nonzero_block_coords))
numSectionsPerMode = collect(Int32, A.blocks_per_mode)
extent = block_extents(A)
- nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
+ nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
st = strides(A)
- dataType = eltype(A)#convert(cuTENSOR.cutensorDataType_t, eltype(A))
+ dataType = eltype(A) #convert(cuTENSOR.cutensorDataType_t, eltype(A))
## Right now assume stride is NULL. I am not sure if stride works, need to discuss with cuTENSOR team.
- CuTensorBSDescriptor(numModes, numNonZeroBlocks,
- numSectionsPerMode, extent, nonZeroCoordinates, dataType)
+ return CuTensorBSDescriptor(
+ numModes, numNonZeroBlocks,
+ numSectionsPerMode, extent, nonZeroCoordinates, dataType
+ )
end
diff --git a/lib/cutensor/src/libcutensor.jl b/lib/cutensor/src/libcutensor.jl
index b33560b72..4e7ba168d 100644
--- a/lib/cutensor/src/libcutensor.jl
+++ b/lib/cutensor/src/libcutensor.jl
@@ -545,12 +545,12 @@ end
@gcsafe_ccall libcutensor.cutensorBlockSparseContract(handle::cutensorHandle_t,
plan::cutensorPlan_t,
alpha::Ptr{Cvoid},
- A::Ptr{CuPtr{Cvoid}},
- B::Ptr{CuPtr{Cvoid}},
+ A::Ptr{CuPtr{Cvoid}},
+ B::Ptr{CuPtr{Cvoid}},
beta::Ptr{Cvoid},
- C::Ptr{CuPtr{Cvoid}},
- D::Ptr{CuPtr{Cvoid}},
- workspace::CuPtr{Cvoid},
+ C::Ptr{CuPtr{Cvoid}},
+ D::Ptr{CuPtr{Cvoid}},
+ workspace::CuPtr{Cvoid},
workspaceSize::UInt64,
stream::cudaStream_t)::cutensorStatus_t
end
diff --git a/lib/cutensor/test/contractions.jl b/lib/cutensor/test/contractions.jl
index 636600a74..baf56949a 100644
--- a/lib/cutensor/test/contractions.jl
+++ b/lib/cutensor/test/contractions.jl
@@ -188,62 +188,73 @@ end
end
end
-eltypes_compact = [
- (Float32, Float32, Float32, Float32),
- (ComplexF32, ComplexF32, ComplexF32, Float32),
- (Float64, Float64, Float64, Float64),
- (ComplexF64, ComplexF64, ComplexF64, Float64)
-]
-@testset "Blocksparse Contraction" begin
- ## There are many unsupported types because this is a new functionality
- ## So I will test with Float32 and ComplexF32 only
- @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
- ## i = [20,20,25]
- ## k = [10,10,15]
- ## l = [30,30,35]
- ## A = Tensor(k,i,l)
- ## Nonzero blocks are
- ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
- A = Vector{CuArray{eltyA, 3}}()
- for k in [10,15]
- for i in [20,25]
- for l in [30,35]
- push!(A, CuArray(ones(eltyA, k,i,l)))
+ eltypes_compact = [
+ (Float32, Float32, Float32, Float32),
+ (ComplexF32, ComplexF32, ComplexF32, Float32),
+ (Float64, Float64, Float64, Float64),
+ (ComplexF64, ComplexF64, ComplexF64, Float64),
+ ]
+ @testset "Blocksparse Contraction" begin
+ ## There are many unsupported types because this is a new functionality
+ ## So I will test with Float32 and ComplexF32 only
+ @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
+ ## i = [20,20,25]
+ ## k = [10,10,15]
+ ## l = [30,30,35]
+ ## A = Tensor(k,i,l)
+ ## Nonzero blocks are
+ ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
+ A = Vector{CuArray{eltyA, 3}}()
+ for k in [10, 15]
+ for i in [20, 25]
+ for l in [30, 35]
+ push!(A, CuArray(ones(eltyA, k, i, l)))
+ end
end
end
- end
- ## B = Tensor(k,l)
- ## Nonzero blocks are
- ## [1,1], [2,3]
- B = Array{CuArray{eltyB, 2}}(
- [CuArray(randn(eltyB, 10, 30)),
- CuArray(randn(eltyB, 10, 35))])
-
- ## C = Tensor(i)
- ## Nonzero blocks are
- ## [1,], [3,]
- C = Vector{CuArray{eltyC, 1}}(
- [CuArray(zeros(eltyC, 20)),
- CuArray(zeros(eltyC, 25))]
- )
-
- cuTenA = cuTENSOR.CuTensorBS(A, [3,3,3],
- [(10,10,15), (20,20,25), (30,30,35)],
- [(1,1,1), (1,1,3), (1,3,1), (1,3,3), (3,1,1), (3,1,3), (3,3,1), (3,3,3)],
- [1,3,2])
- cuTenB = cuTENSOR.CuTensorBS(B, [3,3],
- [(10,10,15), (30,30,35)],
- [(1,1),(2,3)], [1,2], )
- cuTenC = cuTENSOR.CuTensorBS(C, [3],
- [(20,20,25)],[(1,),(3,)], [3])
-
- mul!(cuTenC, cuTenA, cuTenB, 1, 0)
- ## C[1] = A[1,1,1] * B[1,1]
- @test C[1] ≈ reshape(permutedims(A[1], (2,1,3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
- ## C[3] = A[1,3,1] * B[1,1]
- @test C[2] ≈ reshape(permutedims(A[3], (2,1,3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+ ## B = Tensor(k,l)
+ ## Nonzero blocks are
+ ## [1,1], [2,3]
+ B = Array{CuArray{eltyB, 2}}(
+ [
+ CuArray(randn(eltyB, 10, 30)),
+ CuArray(randn(eltyB, 10, 35)),
+ ]
+ )
+
+ ## C = Tensor(i)
+ ## Nonzero blocks are
+ ## [1,], [3,]
+ C = Vector{CuArray{eltyC, 1}}(
+ [
+ CuArray(zeros(eltyC, 20)),
+ CuArray(zeros(eltyC, 25)),
+ ]
+ )
+
+ cuTenA = cuTENSOR.CuTensorBS(
+ A, [3, 3, 3],
+ [(10, 10, 15), (20, 20, 25), (30, 30, 35)],
+ [(1, 1, 1), (1, 1, 3), (1, 3, 1), (1, 3, 3), (3, 1, 1), (3, 1, 3), (3, 3, 1), (3, 3, 3)],
+ [1, 3, 2]
+ )
+ cuTenB = cuTENSOR.CuTensorBS(
+ B, [3, 3],
+ [(10, 10, 15), (30, 30, 35)],
+ [(1, 1), (2, 3)], [1, 2],
+ )
+ cuTenC = cuTENSOR.CuTensorBS(
+ C, [3],
+ [(20, 20, 25)], [(1,), (3,)], [3]
+ )
+
+ mul!(cuTenC, cuTenA, cuTenB, 1, 0)
+ ## C[1] = A[1,1,1] * B[1,1]
+ @test C[1] ≈ reshape(permutedims(A[1], (2, 1, 3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
+ ## C[3] = A[1,3,1] * B[1,1]
+ @test C[2] ≈ reshape(permutedims(A[3], (2, 1, 3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+ end
end
-end
end |
|
There were some issues in Clang.jl's conversion of the cuTENSOR.h file into Julia wrapper functions. Specifically, I had a runtime issue when trying to convert arrays of CuArrays into |
…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor
Codecov Report
❌ Patch coverage is
Additional details and impacted files
@@ Coverage Diff @@
## master #3057 +/- ##
===========================================
+ Coverage 76.94% 89.44% +12.49%
===========================================
Files 148 151 +3
Lines 12984 13149 +165
===========================================
+ Hits 9991 11761 +1770
+ Misses 2993 1388 -1605
☔ View full report in Codecov by Sentry.
🚀 New features to boost your workflow:
|
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 26735e0 | Previous: a9a687c | Ratio |
|---|---|---|---|
array/accumulate/Float32/1d |
102064.5 ns |
102250.5 ns |
1.00 |
array/accumulate/Float32/dims=1 |
77401 ns |
77483 ns |
1.00 |
array/accumulate/Float32/dims=1L |
1586555 ns |
1593535 ns |
1.00 |
array/accumulate/Float32/dims=2 |
144456.5 ns |
144723 ns |
1.00 |
array/accumulate/Float32/dims=2L |
658808 ns |
661407 ns |
1.00 |
array/accumulate/Int64/1d |
119464 ns |
120091 ns |
0.99 |
array/accumulate/Int64/dims=1 |
81006 ns |
81265.5 ns |
1.00 |
array/accumulate/Int64/dims=1L |
1695964 ns |
1707476 ns |
0.99 |
array/accumulate/Int64/dims=2 |
157314 ns |
158147 ns |
0.99 |
array/accumulate/Int64/dims=2L |
962862 ns |
962833.5 ns |
1.00 |
array/broadcast |
20706 ns |
20861 ns |
0.99 |
array/construct |
1322.2 ns |
1305.3 ns |
1.01 |
array/copy |
18958 ns |
18977 ns |
1.00 |
array/copyto!/cpu_to_gpu |
218370 ns |
218984.5 ns |
1.00 |
array/copyto!/gpu_to_cpu |
284923 ns |
288110 ns |
0.99 |
array/copyto!/gpu_to_gpu |
11561 ns |
11568 ns |
1.00 |
array/iteration/findall/bool |
132953.5 ns |
133871.5 ns |
0.99 |
array/iteration/findall/int |
150434 ns |
150576 ns |
1.00 |
array/iteration/findfirst/bool |
83153 ns |
82372 ns |
1.01 |
array/iteration/findfirst/int |
85052 ns |
84621.5 ns |
1.01 |
array/iteration/findmin/1d |
89597.5 ns |
85921.5 ns |
1.04 |
array/iteration/findmin/2d |
117775 ns |
117665 ns |
1.00 |
array/iteration/logical |
205168 ns |
204118.5 ns |
1.01 |
array/iteration/scalar |
68208 ns |
69670.5 ns |
0.98 |
array/permutedims/2d |
53054.5 ns |
53206.5 ns |
1.00 |
array/permutedims/3d |
53252 ns |
53542.5 ns |
0.99 |
array/permutedims/4d |
52393.5 ns |
52933.5 ns |
0.99 |
array/random/rand/Float32 |
13138 ns |
13218 ns |
0.99 |
array/random/rand/Int64 |
30107 ns |
34787 ns |
0.87 |
array/random/rand!/Float32 |
8722.833333333332 ns |
8525.333333333334 ns |
1.02 |
array/random/rand!/Int64 |
34298 ns |
34352 ns |
1.00 |
array/random/randn/Float32 |
43710.5 ns |
38965 ns |
1.12 |
array/random/randn!/Float32 |
31717 ns |
31554 ns |
1.01 |
array/reductions/mapreduce/Float32/1d |
35874 ns |
35525 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=1 |
40914 ns |
40453.5 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=1L |
52233 ns |
52015 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2 |
57036 ns |
56943 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2L |
69864 ns |
69861 ns |
1.00 |
array/reductions/mapreduce/Int64/1d |
44167 ns |
43677 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=1 |
42894.5 ns |
42176 ns |
1.02 |
array/reductions/mapreduce/Int64/dims=1L |
88042 ns |
87856 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2 |
60132 ns |
59673 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=2L |
85480 ns |
84933 ns |
1.01 |
array/reductions/reduce/Float32/1d |
36303 ns |
35822.5 ns |
1.01 |
array/reductions/reduce/Float32/dims=1 |
40549 ns |
42636 ns |
0.95 |
array/reductions/reduce/Float32/dims=1L |
52260 ns |
51953 ns |
1.01 |
array/reductions/reduce/Float32/dims=2 |
56963 ns |
57183 ns |
1.00 |
array/reductions/reduce/Float32/dims=2L |
70514.5 ns |
69983 ns |
1.01 |
array/reductions/reduce/Int64/1d |
44149 ns |
43946 ns |
1.00 |
array/reductions/reduce/Int64/dims=1 |
42928 ns |
52864 ns |
0.81 |
array/reductions/reduce/Int64/dims=1L |
88043.5 ns |
88045 ns |
1.00 |
array/reductions/reduce/Int64/dims=2 |
60020 ns |
59735 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
85400 ns |
84891 ns |
1.01 |
array/reverse/1d |
18645 ns |
18427 ns |
1.01 |
array/reverse/1dL |
69202.5 ns |
68976.5 ns |
1.00 |
array/reverse/1dL_inplace |
66112 ns |
66026 ns |
1.00 |
array/reverse/1d_inplace |
10404.666666666666 ns |
10297.333333333334 ns |
1.01 |
array/reverse/2d |
21106.5 ns |
20813 ns |
1.01 |
array/reverse/2dL |
73177 ns |
72948 ns |
1.00 |
array/reverse/2dL_inplace |
66345 ns |
66137 ns |
1.00 |
array/reverse/2d_inplace |
10573 ns |
11426 ns |
0.93 |
array/sorting/1d |
2736767 ns |
2736028 ns |
1.00 |
array/sorting/2d |
1069659 ns |
1076314.5 ns |
0.99 |
array/sorting/by |
3305866 ns |
3305633 ns |
1.00 |
cuda/synchronization/context/auto |
1210.2 ns |
1177.1 ns |
1.03 |
cuda/synchronization/context/blocking |
956.6451612903226 ns |
930.1428571428571 ns |
1.03 |
cuda/synchronization/context/nonblocking |
7936.4 ns |
7222.2 ns |
1.10 |
cuda/synchronization/stream/auto |
1053.3 ns |
1023.8235294117648 ns |
1.03 |
cuda/synchronization/stream/blocking |
797.33 ns |
829.5588235294117 ns |
0.96 |
cuda/synchronization/stream/nonblocking |
7811.8 ns |
7804.5 ns |
1.00 |
integration/byval/reference |
144086 ns |
144003 ns |
1.00 |
integration/byval/slices=1 |
145786 ns |
145940 ns |
1.00 |
integration/byval/slices=2 |
284769.5 ns |
284688 ns |
1.00 |
integration/byval/slices=3 |
423169 ns |
423167.5 ns |
1.00 |
integration/cudadevrt |
102599 ns |
102604 ns |
1.00 |
integration/volumerhs |
9436129 ns |
9452453.5 ns |
1.00 |
kernel/indexing |
13518 ns |
13481 ns |
1.00 |
kernel/indexing_checked |
14295 ns |
14205 ns |
1.01 |
kernel/launch |
2226.5555555555557 ns |
2129 ns |
1.05 |
kernel/occupancy |
659.7407407407408 ns |
660.2115384615385 ns |
1.00 |
kernel/rand |
17384 ns |
18430 ns |
0.94 |
latency/import |
3811206939.5 ns |
3804590639.5 ns |
1.00 |
latency/precompile |
4594645577.5 ns |
4586818051.5 ns |
1.00 |
latency/ttfp |
4392251532.5 ns |
4387745362.5 ns |
1.00 |
This comment was automatically generated by workflow using github-action-benchmark.
|
Thanks very much for putting this together, I'm happy to help with the header issues if needed! |
…but the C++ code is still in flux)
|
@kshyatt I removed the extra code, made the functions that linked to the library relatively agnostic (i.e. you are not forced to use CuTensorBS but can buy in if you'd like) and added a unit test. If you could help with the Clang.jl issue, that would be amazing! |
|
I'll try to take a look today! |
|
Did you use the scripts in |
…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor
Yes, I did use the scripts, but this produced the following error:
ERROR: MethodError: no method matching unsafe_convert(::Type{Ptr{Nothing}}, ::CuPtr{Nothing})
The function `unsafe_convert` exists, but no method is defined for this combination of argument types.
Closest candidates are:
unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitBlame)
@ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitRevWalker)
@ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
unsafe_convert(::Type{Ptr{Nothing}}, ::LibGit2.GitDiffStats)
@ LibGit2 ~/.julia/juliaup/julia-1.12.1+0.x64.linux.gnu/share/julia/stdlib/v1.12/LibGit2/src/types.jl:1096
...
Stacktrace:
[1] Ref{Ptr{Nothing}}(a::Vector{CuPtr{Nothing}})
@ Base ./refpointer.jl:166
[2] cconvert
@ ./refpointer.jl:178 [inlined]
[3] macro expansion
@ ~/.julia/dev/CUDA.jl/lib/cutensor/src/libcutensor.jl:545 [inlined]
[4] (::cuTENSOR.var"#cutensorBlockSparseContract##0#cutensorBlockSparseContract##1"{…})()
@ cuTENSOR ~/.julia/packages/GPUToolbox/JLBB1/src/ccalls.jl:34
[5] retry_reclaim
@ ~/.julia/packages/CUDA/Il00B/src/memory.jl:434 [inlined]
[6] check
@ ~/.julia/dev/CUDA.jl/lib/cutensor/src/libcutensor.jl:22 [inlined]
[7] cutensorBlockSparseContract
@ ~/.julia/packages/GPUToolbox/JLBB1/src/ccalls.jl:33 [inlined]
[8]
@ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/operations.jl:50
[9] contract!(alpha::Number, A::Any, Ainds::Vector{…}, opA::cuTENSOR.cutensorOperator_t, B::Any, Binds::Vector{…}, opB::cuTENSOR.cutensorOperator_t, beta::Number, C::Any, Cinds::Vector{…}, opC::cuTENSOR.cutensorOperator_t, opOut::cuTENSOR.cutensorOperator_t; jit::cuTENSOR.cutensorJitMode_t, workspace::cuTENSOR.cutensorWorksizePreference_t, algo::cuTENSOR.cutensorAlgo_t, compute_type::Nothing, plan::Nothing)
@ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/operations.jl:25
[10] mul!(C::CuTensorBS{Float64, 1}, A::CuTensorBS{Float64, 3}, B::CuTensorBS{Float64, 2}, α::Float64, β::Float64)
@ cuTENSOR ~/.julia/dev/CUDA.jl/lib/cutensor/src/blocksparse/interfaces.jl:21
However, I found that if I modify the code to be |
|
Probably you missed some of the weird esoterica in |
17806da to
cc4b826
Compare
|
Thanks for doing all the work to get this going, I think it will be quite useful for a bunch of TN packages... |
lkdvos
left a comment
There was a problem hiding this comment.
Left some remaining comments, but for me I think most of the parts that I would use are there, since I don't really see myself going through the CuTensorBS construction (we also never used the CuTensor in TensorOperations so that is completely fine)
| mutable struct CuTensorBSDescriptor | ||
| handle::cutensorBlockSparseTensorDescriptor_t | ||
| # inner constructor handles creation and finalizer of the descriptor | ||
| function CuTensorBSDescriptor( |
There was a problem hiding this comment.
I think it would be helpful, both for clarity/self-documentation and for avoiding hard-to-decipher errors, to restrict the types of these arguments in the inner constructor. This would also be more in line with the CuTensorDescriptor type + constructors.
There was a problem hiding this comment.
This makes sense. I added datatypes here for reference. I did have trouble mapping C_NULL to a datatype for the union so I just added a comment here and accept Any for now
Remove left over code. Will need to make something like this to define mul! in the future Co-authored-by: Lukas Devos <ldevos98@gmail.com>
…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor
…to a contiguous memory block)
Hi,
This is a wrapper type and functions to access the newly introduced blocksparse cutensor backend. Right now the code is expert level, i.e. users need to write a type that converts their object to CuTensorBS types or can achieve the low-level operations required by cutensor kernels. I am still writing a test but the code is fully operational.
Thanks,
Karl