Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CUDA/GB_cuda_apply.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ GrB_Info GB_cuda_apply_unop_jit
const GB_Operator op,
const bool flipij,
const GrB_Matrix A,
const bool do_iso_expansion,
const GB_void *ythunk,
// CUDA stream and launch parameters:
cudaStream_t stream,
Expand All @@ -35,6 +36,7 @@ GrB_Info GB_cuda_apply_bind1st_jit
const GrB_Type ctype,
const GrB_BinaryOp op,
const GrB_Matrix A,
const bool do_iso_expansion,
const GB_void *xscalar,
// CUDA stream and launch parameters:
cudaStream_t stream,
Expand All @@ -50,6 +52,7 @@ GrB_Info GB_cuda_apply_bind2nd_jit
const GrB_Type ctype,
const GrB_BinaryOp op,
const GrB_Matrix A,
const bool do_iso_expansion,
const GB_void *xscalar,
// CUDA stream and launch parameters:
cudaStream_t stream,
Expand Down
5 changes: 3 additions & 2 deletions CUDA/GB_cuda_apply_bind1st_jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ GrB_Info GB_cuda_apply_bind1st_jit
const GrB_Type ctype,
const GrB_BinaryOp binaryop,
const GrB_Matrix B,
const bool do_iso_expansion,
const GB_void *scalarx,
// CUDA stream and launch parameters:
cudaStream_t stream,
Expand Down Expand Up @@ -50,6 +51,6 @@ GrB_Info GB_cuda_apply_bind1st_jit
//--------------------------------------------------------------------------

GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ;
return (GB_jit_kernel (Cx, scalarx, B, stream, gridsz, blocksz,
&GB_callback)) ;
return (GB_jit_kernel (Cx, do_iso_expansion, scalarx, B, stream,
gridsz, blocksz, &GB_callback)) ;
}
5 changes: 3 additions & 2 deletions CUDA/GB_cuda_apply_bind2nd_jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ GrB_Info GB_cuda_apply_bind2nd_jit
const GrB_Type ctype,
const GrB_BinaryOp binaryop,
const GrB_Matrix A,
const bool do_iso_expansion,
const GB_void *scalarx,
// CUDA stream and launch parameters:
cudaStream_t stream,
Expand Down Expand Up @@ -50,6 +51,6 @@ GrB_Info GB_cuda_apply_bind2nd_jit
//--------------------------------------------------------------------------

GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ;
return (GB_jit_kernel (Cx, A, scalarx, stream, gridsz, blocksz,
&GB_callback)) ;
return (GB_jit_kernel (Cx, A, do_iso_expansion, scalarx, stream,
gridsz, blocksz, &GB_callback)) ;
}
5 changes: 3 additions & 2 deletions CUDA/GB_cuda_apply_binop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ GrB_Info GB_cuda_apply_binop
const GrB_Type ctype,
const GrB_BinaryOp op,
const GrB_Matrix A,
const bool do_iso_expansion,
const GB_void *scalarx,
const bool bind1st
)
Expand Down Expand Up @@ -68,12 +69,12 @@ GrB_Info GB_cuda_apply_binop
if (bind1st)
{
GB_OK (GB_cuda_apply_bind1st_jit (Cx, ctype, op, A,
scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ;
do_iso_expansion, scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ;
}
else
{
GB_OK (GB_cuda_apply_bind2nd_jit (Cx, ctype, op, A,
scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ;
do_iso_expansion, scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ;
}

GB_FREE_WORKSPACE ;
Expand Down
3 changes: 2 additions & 1 deletion CUDA/GB_cuda_apply_unop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ GrB_Info GB_cuda_apply_unop
const GB_Operator op,
const bool flipij,
const GrB_Matrix A,
const bool do_iso_expansion,
const GB_void *ythunk
)
{
Expand Down Expand Up @@ -61,7 +62,7 @@ GrB_Info GB_cuda_apply_unop
int32_t gridsz = std::min (raw_gridsz, (int64_t) (number_of_sms * 256)) ;

GB_OK (GB_cuda_apply_unop_jit (Cx, ctype, op, flipij, A,
ythunk_cuda, stream, gridsz, BLOCK_SIZE)) ;
do_iso_expansion, ythunk_cuda, stream, gridsz, BLOCK_SIZE)) ;

GB_FREE_WORKSPACE ;
return GrB_SUCCESS ;
Expand Down
5 changes: 3 additions & 2 deletions CUDA/GB_cuda_apply_unop_jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ GrB_Info GB_cuda_apply_unop_jit
const GB_Operator op,
const bool flipij,
const GrB_Matrix A,
const bool do_iso_expansion,
const GB_void *ythunk,
// CUDA stream and launch parameters:
cudaStream_t stream,
Expand Down Expand Up @@ -51,6 +52,6 @@ GrB_Info GB_cuda_apply_unop_jit
//--------------------------------------------------------------------------

GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ;
return (GB_jit_kernel (Cx, A, ythunk, stream, gridsz, blocksz,
&GB_callback)) ;
return (GB_jit_kernel (Cx, A, do_iso_expansion, ythunk, stream, gridsz,
blocksz, &GB_callback)) ;
}
20 changes: 19 additions & 1 deletion CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ using namespace cooperative_groups ;
__global__ void GB_cuda_apply_bind1st_kernel
(
GB_void *Cx_out,
const bool do_iso_expansion,
const GB_void *scalarx,
GrB_Matrix B
)
Expand All @@ -19,11 +20,22 @@ __global__ void GB_cuda_apply_bind1st_kernel

GB_B_NHELD (nvals) ;

GB_C_TYPE iso_val ;
if (do_iso_expansion)
{
iso_val = Cx [0] ;
}

int tid = blockDim.x * blockIdx.x + threadIdx.x;
int nthreads = blockDim.x * gridDim.x ;

for (int64_t p = tid ; p < nvals ; p += nthreads)
{
if (do_iso_expansion)
{
Cx [p] = iso_val ;
}

if (!GBb_B (Bb, p)) { continue ; }
GB_DECLAREB (bij) ;
GB_GETB (bij, Bx, p, false) ;
Expand All @@ -45,9 +57,15 @@ GB_JIT_CUDA_KERNEL_APPLY_BIND1ST_PROTO (GB_jit_kernel)
GB_B_NHELD (nvals) ;
if (nvals == 0) return (GrB_SUCCESS) ;

if (do_iso_expansion)
{
ASSERT (!B->iso) ;
}

CUDA_OK (cudaGetLastError ( )) ;
CUDA_OK (cudaStreamSynchronize (stream)) ;
GB_cuda_apply_bind1st_kernel <<<grid, block, 0, stream>>> (Cx, scalarx, B) ;
GB_cuda_apply_bind1st_kernel <<<grid, block, 0, stream>>> (Cx,
do_iso_expansion, scalarx, B) ;
CUDA_OK (cudaGetLastError ( )) ;
CUDA_OK (cudaStreamSynchronize (stream)) ;

Expand Down
20 changes: 19 additions & 1 deletion CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ __global__ void GB_cuda_apply_bind2nd_kernel
(
GB_void *Cx_out,
GrB_Matrix A,
const bool do_iso_expansion,
const GB_void *scalarx
)
{
Expand All @@ -19,11 +20,22 @@ __global__ void GB_cuda_apply_bind2nd_kernel

GB_A_NHELD (nvals) ;

GB_C_TYPE iso_val ;
if (do_iso_expansion)
{
iso_val = Cx [0] ;
}

int tid = blockDim.x * blockIdx.x + threadIdx.x;
int nthreads = blockDim.x * gridDim.x ;

for (int64_t p = tid ; p < nvals ; p += nthreads)
{
if (do_iso_expansion)
{
Cx [p] = iso_val ;
}

if (!GBb_A (Ab, p)) { continue ; }
GB_DECLAREA (aij) ;
GB_GETA (aij, Ax, p, false) ;
Expand All @@ -45,9 +57,15 @@ GB_JIT_CUDA_KERNEL_APPLY_BIND2ND_PROTO (GB_jit_kernel)
GB_A_NHELD (nvals) ;
if (nvals == 0) return (GrB_SUCCESS) ;

if (do_iso_expansion)
{
ASSERT (!A->iso) ;
}

CUDA_OK (cudaGetLastError ( )) ;
CUDA_OK (cudaStreamSynchronize (stream)) ;
GB_cuda_apply_bind2nd_kernel <<<grid, block, 0, stream>>> (Cx, A, scalarx) ;
GB_cuda_apply_bind2nd_kernel <<<grid, block, 0, stream>>> (Cx, A,
do_iso_expansion, scalarx) ;
CUDA_OK (cudaGetLastError ( )) ;
CUDA_OK (cudaStreamSynchronize (stream)) ;

Expand Down
25 changes: 24 additions & 1 deletion CUDA/template/GB_jit_kernel_cuda_apply_unop.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ using namespace cooperative_groups ;
__global__ void GB_cuda_apply_unop_kernel
(
GB_void *Cx_out,
const bool do_iso_expansion,
const GB_void *thunk,
GrB_Matrix A
)
Expand Down Expand Up @@ -46,6 +47,12 @@ __global__ void GB_cuda_apply_unop_kernel
GB_Y_TYPE thunk_value = * ((GB_Y_TYPE *) thunk) ;
#endif

GB_C_TYPE iso_val ;
if (do_iso_expansion)
{
iso_val = Cx [0] ;
}

#if ( GB_A_IS_BITMAP || GB_A_IS_FULL )
// bitmap/full case
int tid = blockDim.x * blockIdx.x + threadIdx.x ;
Expand All @@ -61,6 +68,11 @@ __global__ void GB_cuda_apply_unop_kernel
#if ( GB_DEPENDS_ON_J )
int64_t col_idx = p / A->vlen ;
#endif

if (do_iso_expansion)
{
Cx [p] = iso_val ;
}

GB_UNOP (Cx, p, Ax, p, A_iso, row_idx, col_idx, thunk_value) ;
}
Expand All @@ -84,6 +96,11 @@ __global__ void GB_cuda_apply_unop_kernel
int64_t k = GB_cuda_ek_slice_entry<GB_Ap_TYPE> (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ;
int64_t col_idx = GBh_A (Ah, k) ;

if (do_iso_expansion)
{
Cx [p] = iso_val ;
}

#if ( GB_DEPENDS_ON_I )
int64_t row_idx = GBi_A (Ai, p_final, A->vlen) ;
#endif
Expand Down Expand Up @@ -122,9 +139,15 @@ GB_JIT_CUDA_KERNEL_APPLY_UNOP_PROTO (GB_jit_kernel)
GB_A_NHELD (anz) ;
if (anz == 0) return (GrB_SUCCESS) ;

if (do_iso_expansion)
{
ASSERT (!A->iso) ;
}

CUDA_OK (cudaGetLastError ( )) ;
CUDA_OK (cudaStreamSynchronize (stream)) ;
GB_cuda_apply_unop_kernel <<<grid, block, 0, stream>>> (Cx, ythunk, A) ;
GB_cuda_apply_unop_kernel <<<grid, block, 0, stream>>> (Cx,
do_iso_expansion, ythunk, A) ;
CUDA_OK (cudaGetLastError ( )) ;
CUDA_OK (cudaStreamSynchronize (stream)) ;

Expand Down
2 changes: 1 addition & 1 deletion Source/apply/GB_apply.c
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ GrB_Info GB_apply // C<M> = accum (C, op(A)) or op(A')
{
// expand C to non-iso; initialize C->x unless the op
// is positional
info = GB_convert_any_to_non_iso (C, !op_is_positional) ;
info = GB_convert_any_to_non_iso (C, false) ;
}
if (info == GrB_SUCCESS)
{
Expand Down
50 changes: 44 additions & 6 deletions Source/apply/GB_apply_op.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,25 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A)
//--------------------------------------------------------------------------

// A->x is not const since the operator might be applied in-place, if
// C is aliased to C.
// A is aliased to C.

GB_void *Ax = (GB_void *) A->x ; // A->x has type A->type
const int8_t *Ab = A->b ; // only if A is bitmap
const GrB_Type Atype = A->type ; // type of A->x
const int64_t anz = GB_nnz_held (A) ; // size of A->x and Cx
#define GB_A_IS_BITMAP (Ab != NULL)
bool do_iso_expansion = false ;

if ((C_code_iso == GB_NON_ISO) &&
(Cx == Ax) &&
!GB_OPCODE_IS_POSITIONAL (op_in->opcode))
{
// We should have already realloc'd C->x using
// GB_convert_any_to_non_iso
ASSERT (!A->iso) ;
// Expand the iso value to all of C->x
do_iso_expansion = true ;
}

//--------------------------------------------------------------------------
// determine the maximum number of threads to use
Expand Down Expand Up @@ -186,10 +198,12 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A)
if (GB_cuda_apply_unop_branch (ctype, A, op))
{
info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A,
(GB_void *) &thunk) ;
false, (GB_void *) &thunk) ;
}
#endif

// No iso expansion needed as this is a non-user positional op

//----------------------------------------------------------------------
// positional op via the CPU factory kernel
//----------------------------------------------------------------------
Expand Down Expand Up @@ -411,10 +425,18 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A)
#if defined ( GRAPHBLAS_HAS_CUDA )
if (GB_cuda_apply_unop_branch (ctype, A, op))
{
info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A, NULL) ;
info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A,
do_iso_expansion, NULL) ;
}
#endif

if ((info == GrB_NO_VALUE) && do_iso_expansion)
{
// This will just do the GB_iso_expand, since C->x
// has already been realloc'd to the right size
GB_convert_any_to_non_iso (A, true) ;
}

//----------------------------------------------------------------------
// unary op via the factory kernel
//----------------------------------------------------------------------
Expand Down Expand Up @@ -556,10 +578,15 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A)
if (GB_cuda_apply_binop_branch (ctype, (GrB_BinaryOp) op, A))
{
info = GB_cuda_apply_binop (Cx, ctype, (GrB_BinaryOp) op, A,
scalarx, true) ;
do_iso_expansion, scalarx, true) ;
}
#endif

if ((info == GrB_NO_VALUE) && do_iso_expansion)
{
GB_convert_any_to_non_iso (A, true) ;
}

//------------------------------------------------------------------
// binary op (bind 1st) via the CPU factory kernel
//------------------------------------------------------------------
Expand Down Expand Up @@ -622,10 +649,15 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A)
if (GB_cuda_apply_binop_branch (ctype, (GrB_BinaryOp) op, A))
{
info = GB_cuda_apply_binop (Cx, ctype, (GrB_BinaryOp) op, A,
scalarx, false) ;
do_iso_expansion, scalarx, false) ;
}
#endif

if ((info == GrB_NO_VALUE) && do_iso_expansion)
{
GB_convert_any_to_non_iso (A, true) ;
}

//------------------------------------------------------------------
// binary op (bind 2nd) via the CPU factory kernel
//------------------------------------------------------------------
Expand Down Expand Up @@ -758,10 +790,16 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A)
#if defined ( GRAPHBLAS_HAS_CUDA )
if (GB_cuda_apply_unop_branch (ctype, A, op))
{
info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A, ythunk) ;
info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A,
do_iso_expansion, ythunk) ;
}
#endif

if ((info == GrB_NO_VALUE) && do_iso_expansion)
{
GB_convert_any_to_non_iso (A, true) ;
}

//----------------------------------------------------------------------
// user-defined index-unary op via the JIT or PreJIT kernel
//----------------------------------------------------------------------
Expand Down
Loading