diff --git a/CUDA/GB_cuda_apply.hpp b/CUDA/GB_cuda_apply.hpp index d7fa101dcf..c1f2bd1f1f 100644 --- a/CUDA/GB_cuda_apply.hpp +++ b/CUDA/GB_cuda_apply.hpp @@ -20,6 +20,7 @@ GrB_Info GB_cuda_apply_unop_jit const GB_Operator op, const bool flipij, const GrB_Matrix A, + const bool do_iso_expansion, const GB_void *ythunk, // CUDA stream and launch parameters: cudaStream_t stream, @@ -35,6 +36,7 @@ GrB_Info GB_cuda_apply_bind1st_jit const GrB_Type ctype, const GrB_BinaryOp op, const GrB_Matrix A, + const bool do_iso_expansion, const GB_void *xscalar, // CUDA stream and launch parameters: cudaStream_t stream, @@ -50,6 +52,7 @@ GrB_Info GB_cuda_apply_bind2nd_jit const GrB_Type ctype, const GrB_BinaryOp op, const GrB_Matrix A, + const bool do_iso_expansion, const GB_void *xscalar, // CUDA stream and launch parameters: cudaStream_t stream, diff --git a/CUDA/GB_cuda_apply_bind1st_jit.cpp b/CUDA/GB_cuda_apply_bind1st_jit.cpp index ba3a03da24..4353840750 100644 --- a/CUDA/GB_cuda_apply_bind1st_jit.cpp +++ b/CUDA/GB_cuda_apply_bind1st_jit.cpp @@ -14,6 +14,7 @@ GrB_Info GB_cuda_apply_bind1st_jit const GrB_Type ctype, const GrB_BinaryOp binaryop, const GrB_Matrix B, + const bool do_iso_expansion, const GB_void *scalarx, // CUDA stream and launch parameters: cudaStream_t stream, @@ -50,6 +51,6 @@ GrB_Info GB_cuda_apply_bind1st_jit //-------------------------------------------------------------------------- GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ; - return (GB_jit_kernel (Cx, scalarx, B, stream, gridsz, blocksz, - &GB_callback)) ; + return (GB_jit_kernel (Cx, do_iso_expansion, scalarx, B, stream, + gridsz, blocksz, &GB_callback)) ; } diff --git a/CUDA/GB_cuda_apply_bind2nd_jit.cpp b/CUDA/GB_cuda_apply_bind2nd_jit.cpp index d96b4772ad..ea943ff4dc 100644 --- a/CUDA/GB_cuda_apply_bind2nd_jit.cpp +++ b/CUDA/GB_cuda_apply_bind2nd_jit.cpp @@ -14,6 +14,7 @@ GrB_Info GB_cuda_apply_bind2nd_jit const GrB_Type ctype, const GrB_BinaryOp binaryop, const GrB_Matrix A, + const bool do_iso_expansion, const GB_void *scalarx, // CUDA stream and launch parameters: cudaStream_t stream, @@ -50,6 +51,6 @@ GrB_Info GB_cuda_apply_bind2nd_jit //-------------------------------------------------------------------------- GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ; - return (GB_jit_kernel (Cx, A, scalarx, stream, gridsz, blocksz, - &GB_callback)) ; + return (GB_jit_kernel (Cx, A, do_iso_expansion, scalarx, stream, + gridsz, blocksz, &GB_callback)) ; } diff --git a/CUDA/GB_cuda_apply_binop.cpp b/CUDA/GB_cuda_apply_binop.cpp index 603b06bad4..068b845ba2 100644 --- a/CUDA/GB_cuda_apply_binop.cpp +++ b/CUDA/GB_cuda_apply_binop.cpp @@ -24,6 +24,7 @@ GrB_Info GB_cuda_apply_binop const GrB_Type ctype, const GrB_BinaryOp op, const GrB_Matrix A, + const bool do_iso_expansion, const GB_void *scalarx, const bool bind1st ) @@ -68,12 +69,12 @@ GrB_Info GB_cuda_apply_binop if (bind1st) { GB_OK (GB_cuda_apply_bind1st_jit (Cx, ctype, op, A, - scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ; + do_iso_expansion, scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ; } else { GB_OK (GB_cuda_apply_bind2nd_jit (Cx, ctype, op, A, - scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ; + do_iso_expansion, scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ; } GB_FREE_WORKSPACE ; diff --git a/CUDA/GB_cuda_apply_unop.cpp b/CUDA/GB_cuda_apply_unop.cpp index 9b294307b5..5843bbab9f 100644 --- a/CUDA/GB_cuda_apply_unop.cpp +++ b/CUDA/GB_cuda_apply_unop.cpp @@ -25,6 +25,7 @@ GrB_Info GB_cuda_apply_unop const GB_Operator op, const bool flipij, const GrB_Matrix A, + const bool do_iso_expansion, const GB_void *ythunk ) { @@ -61,7 +62,7 @@ GrB_Info GB_cuda_apply_unop int32_t gridsz = std::min (raw_gridsz, (int64_t) (number_of_sms * 256)) ; GB_OK (GB_cuda_apply_unop_jit (Cx, ctype, op, flipij, A, - ythunk_cuda, stream, gridsz, BLOCK_SIZE)) ; + do_iso_expansion, ythunk_cuda, stream, gridsz, BLOCK_SIZE)) ; GB_FREE_WORKSPACE ; return GrB_SUCCESS ; diff --git a/CUDA/GB_cuda_apply_unop_jit.cpp b/CUDA/GB_cuda_apply_unop_jit.cpp index fccbf6746f..c0430e04bc 100644 --- a/CUDA/GB_cuda_apply_unop_jit.cpp +++ b/CUDA/GB_cuda_apply_unop_jit.cpp @@ -15,6 +15,7 @@ GrB_Info GB_cuda_apply_unop_jit const GB_Operator op, const bool flipij, const GrB_Matrix A, + const bool do_iso_expansion, const GB_void *ythunk, // CUDA stream and launch parameters: cudaStream_t stream, @@ -51,6 +52,6 @@ GrB_Info GB_cuda_apply_unop_jit //-------------------------------------------------------------------------- GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ; - return (GB_jit_kernel (Cx, A, ythunk, stream, gridsz, blocksz, - &GB_callback)) ; + return (GB_jit_kernel (Cx, A, do_iso_expansion, ythunk, stream, gridsz, + blocksz, &GB_callback)) ; } diff --git a/CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu b/CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu index 9e050e2661..f4aa246764 100644 --- a/CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu +++ b/CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu @@ -5,6 +5,7 @@ using namespace cooperative_groups ; __global__ void GB_cuda_apply_bind1st_kernel ( GB_void *Cx_out, + const bool do_iso_expansion, const GB_void *scalarx, GrB_Matrix B ) @@ -19,11 +20,22 @@ __global__ void GB_cuda_apply_bind1st_kernel GB_B_NHELD (nvals) ; + GB_C_TYPE iso_val ; + if (do_iso_expansion) + { + iso_val = Cx [0] ; + } + int tid = blockDim.x * blockIdx.x + threadIdx.x; int nthreads = blockDim.x * gridDim.x ; for (int64_t p = tid ; p < nvals ; p += nthreads) { + if (do_iso_expansion) + { + Cx [p] = iso_val ; + } + if (!GBb_B (Bb, p)) { continue ; } GB_DECLAREB (bij) ; GB_GETB (bij, Bx, p, false) ; @@ -45,9 +57,15 @@ GB_JIT_CUDA_KERNEL_APPLY_BIND1ST_PROTO (GB_jit_kernel) GB_B_NHELD (nvals) ; if (nvals == 0) return (GrB_SUCCESS) ; + if (do_iso_expansion) + { + ASSERT (!B->iso) ; + } + CUDA_OK (cudaGetLastError ( )) ; CUDA_OK (cudaStreamSynchronize (stream)) ; - GB_cuda_apply_bind1st_kernel <<>> (Cx, scalarx, B) ; + GB_cuda_apply_bind1st_kernel <<>> (Cx, + do_iso_expansion, scalarx, B) ; CUDA_OK (cudaGetLastError ( )) ; CUDA_OK (cudaStreamSynchronize (stream)) ; diff --git a/CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu b/CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu index ff896ac1bc..b5e348f9ae 100644 --- a/CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu +++ b/CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu @@ -6,6 +6,7 @@ __global__ void GB_cuda_apply_bind2nd_kernel ( GB_void *Cx_out, GrB_Matrix A, + const bool do_iso_expansion, const GB_void *scalarx ) { @@ -19,11 +20,22 @@ __global__ void GB_cuda_apply_bind2nd_kernel GB_A_NHELD (nvals) ; + GB_C_TYPE iso_val ; + if (do_iso_expansion) + { + iso_val = Cx [0] ; + } + int tid = blockDim.x * blockIdx.x + threadIdx.x; int nthreads = blockDim.x * gridDim.x ; for (int64_t p = tid ; p < nvals ; p += nthreads) { + if (do_iso_expansion) + { + Cx [p] = iso_val ; + } + if (!GBb_A (Ab, p)) { continue ; } GB_DECLAREA (aij) ; GB_GETA (aij, Ax, p, false) ; @@ -45,9 +57,15 @@ GB_JIT_CUDA_KERNEL_APPLY_BIND2ND_PROTO (GB_jit_kernel) GB_A_NHELD (nvals) ; if (nvals == 0) return (GrB_SUCCESS) ; + if (do_iso_expansion) + { + ASSERT (!A->iso) ; + } + CUDA_OK (cudaGetLastError ( )) ; CUDA_OK (cudaStreamSynchronize (stream)) ; - GB_cuda_apply_bind2nd_kernel <<>> (Cx, A, scalarx) ; + GB_cuda_apply_bind2nd_kernel <<>> (Cx, A, + do_iso_expansion, scalarx) ; CUDA_OK (cudaGetLastError ( )) ; CUDA_OK (cudaStreamSynchronize (stream)) ; diff --git a/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu b/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu index 598451fdbe..6f81e7bfeb 100644 --- a/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu +++ b/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu @@ -10,6 +10,7 @@ using namespace cooperative_groups ; __global__ void GB_cuda_apply_unop_kernel ( GB_void *Cx_out, + const bool do_iso_expansion, const GB_void *thunk, GrB_Matrix A ) @@ -46,6 +47,12 @@ __global__ void GB_cuda_apply_unop_kernel GB_Y_TYPE thunk_value = * ((GB_Y_TYPE *) thunk) ; #endif + GB_C_TYPE iso_val ; + if (do_iso_expansion) + { + iso_val = Cx [0] ; + } + #if ( GB_A_IS_BITMAP || GB_A_IS_FULL ) // bitmap/full case int tid = blockDim.x * blockIdx.x + threadIdx.x ; @@ -61,6 +68,11 @@ __global__ void GB_cuda_apply_unop_kernel #if ( GB_DEPENDS_ON_J ) int64_t col_idx = p / A->vlen ; #endif + + if (do_iso_expansion) + { + Cx [p] = iso_val ; + } GB_UNOP (Cx, p, Ax, p, A_iso, row_idx, col_idx, thunk_value) ; } @@ -84,6 +96,11 @@ __global__ void GB_cuda_apply_unop_kernel int64_t k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ; int64_t col_idx = GBh_A (Ah, k) ; + if (do_iso_expansion) + { + Cx [p] = iso_val ; + } + #if ( GB_DEPENDS_ON_I ) int64_t row_idx = GBi_A (Ai, p_final, A->vlen) ; #endif @@ -122,9 +139,15 @@ GB_JIT_CUDA_KERNEL_APPLY_UNOP_PROTO (GB_jit_kernel) GB_A_NHELD (anz) ; if (anz == 0) return (GrB_SUCCESS) ; + if (do_iso_expansion) + { + ASSERT (!A->iso) ; + } + CUDA_OK (cudaGetLastError ( )) ; CUDA_OK (cudaStreamSynchronize (stream)) ; - GB_cuda_apply_unop_kernel <<>> (Cx, ythunk, A) ; + GB_cuda_apply_unop_kernel <<>> (Cx, + do_iso_expansion, ythunk, A) ; CUDA_OK (cudaGetLastError ( )) ; CUDA_OK (cudaStreamSynchronize (stream)) ; diff --git a/Source/apply/GB_apply.c b/Source/apply/GB_apply.c index 12195f388a..af521fd955 100644 --- a/Source/apply/GB_apply.c +++ b/Source/apply/GB_apply.c @@ -295,7 +295,7 @@ GrB_Info GB_apply // C = accum (C, op(A)) or op(A') { // expand C to non-iso; initialize C->x unless the op // is positional - info = GB_convert_any_to_non_iso (C, !op_is_positional) ; + info = GB_convert_any_to_non_iso (C, false) ; } if (info == GrB_SUCCESS) { diff --git a/Source/apply/GB_apply_op.c b/Source/apply/GB_apply_op.c index 914a0c7464..9e4d1313dd 100644 --- a/Source/apply/GB_apply_op.c +++ b/Source/apply/GB_apply_op.c @@ -62,13 +62,25 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A) //-------------------------------------------------------------------------- // A->x is not const since the operator might be applied in-place, if - // C is aliased to C. + // A is aliased to C. GB_void *Ax = (GB_void *) A->x ; // A->x has type A->type const int8_t *Ab = A->b ; // only if A is bitmap const GrB_Type Atype = A->type ; // type of A->x const int64_t anz = GB_nnz_held (A) ; // size of A->x and Cx #define GB_A_IS_BITMAP (Ab != NULL) + bool do_iso_expansion = false ; + + if ((C_code_iso == GB_NON_ISO) && + (Cx == Ax) && + !GB_OPCODE_IS_POSITIONAL (op_in->opcode)) + { + // We should have already realloc'd C->x using + // GB_convert_any_to_non_iso + ASSERT (!A->iso) ; + // Expand the iso value to all of C->x + do_iso_expansion = true ; + } //-------------------------------------------------------------------------- // determine the maximum number of threads to use @@ -186,10 +198,12 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A) if (GB_cuda_apply_unop_branch (ctype, A, op)) { info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A, - (GB_void *) &thunk) ; + false, (GB_void *) &thunk) ; } #endif + // No iso expansion needed as this is a non-user positional op + //---------------------------------------------------------------------- // positional op via the CPU factory kernel //---------------------------------------------------------------------- @@ -411,10 +425,18 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A) #if defined ( GRAPHBLAS_HAS_CUDA ) if (GB_cuda_apply_unop_branch (ctype, A, op)) { - info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A, NULL) ; + info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A, + do_iso_expansion, NULL) ; } #endif + if ((info == GrB_NO_VALUE) && do_iso_expansion) + { + // This will just do the GB_iso_expand, since C->x + // has already been realloc'd to the right size + GB_convert_any_to_non_iso (A, true) ; + } + //---------------------------------------------------------------------- // unary op via the factory kernel //---------------------------------------------------------------------- @@ -556,10 +578,15 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A) if (GB_cuda_apply_binop_branch (ctype, (GrB_BinaryOp) op, A)) { info = GB_cuda_apply_binop (Cx, ctype, (GrB_BinaryOp) op, A, - scalarx, true) ; + do_iso_expansion, scalarx, true) ; } #endif + if ((info == GrB_NO_VALUE) && do_iso_expansion) + { + GB_convert_any_to_non_iso (A, true) ; + } + //------------------------------------------------------------------ // binary op (bind 1st) via the CPU factory kernel //------------------------------------------------------------------ @@ -622,10 +649,15 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A) if (GB_cuda_apply_binop_branch (ctype, (GrB_BinaryOp) op, A)) { info = GB_cuda_apply_binop (Cx, ctype, (GrB_BinaryOp) op, A, - scalarx, false) ; + do_iso_expansion, scalarx, false) ; } #endif + if ((info == GrB_NO_VALUE) && do_iso_expansion) + { + GB_convert_any_to_non_iso (A, true) ; + } + //------------------------------------------------------------------ // binary op (bind 2nd) via the CPU factory kernel //------------------------------------------------------------------ @@ -758,10 +790,16 @@ GrB_Info GB_apply_op // apply a unary op, idxunop, or binop, Cx = op (A) #if defined ( GRAPHBLAS_HAS_CUDA ) if (GB_cuda_apply_unop_branch (ctype, A, op)) { - info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A, ythunk) ; + info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A, + do_iso_expansion, ythunk) ; } #endif + if ((info == GrB_NO_VALUE) && do_iso_expansion) + { + GB_convert_any_to_non_iso (A, true) ; + } + //---------------------------------------------------------------------- // user-defined index-unary op via the JIT or PreJIT kernel //---------------------------------------------------------------------- diff --git a/Source/apply/GB_shallow_op.c b/Source/apply/GB_shallow_op.c index 4774d85d48..78b641e11f 100644 --- a/Source/apply/GB_shallow_op.c +++ b/Source/apply/GB_shallow_op.c @@ -186,6 +186,7 @@ GrB_Info GB_shallow_op // create shallow matrix and apply operator //-------------------------------------------------------------------------- // allocate new space for the numerical values of C; use calloc if bitmap + // TODO: This is bad for CUDA C->x = GB_XALLOC_MEMORY (GB_IS_BITMAP (C), C_iso, anz, C->type->size, &(C->x_size)) ; C->x_shallow = false ; // free C->x when freeing C diff --git a/Source/gateway/GB_cuda_gateway.h b/Source/gateway/GB_cuda_gateway.h index 764333fcd6..b97878bfb7 100644 --- a/Source/gateway/GB_cuda_gateway.h +++ b/Source/gateway/GB_cuda_gateway.h @@ -189,6 +189,7 @@ GrB_Info GB_cuda_apply_unop const GB_Operator op, const bool flipij, const GrB_Matrix A, + const bool do_iso_expansion, const GB_void *ythunk ) ; @@ -197,7 +198,8 @@ GrB_Info GB_cuda_apply_binop GB_void *Cx, const GrB_Type ctype, const GrB_BinaryOp op, - const GrB_Matrix A, + const GrB_Matrix A, + const bool do_iso_expansion, const GB_void *scalarx, const bool bind1st ) ; diff --git a/Source/jit_kernels/include/GB_jit_kernel_proto.h b/Source/jit_kernels/include/GB_jit_kernel_proto.h index f7216b6863..3a2349c441 100644 --- a/Source/jit_kernels/include/GB_jit_kernel_proto.h +++ b/Source/jit_kernels/include/GB_jit_kernel_proto.h @@ -786,6 +786,7 @@ GrB_Info GB_jit_kernel_colscale \ GrB_Info GB_jit_kernel_apply_bind1st \ ( \ GB_void *Cx, \ + const bool do_iso_expansion, \ const GB_void *scalarx, \ GrB_Matrix B, \ cudaStream_t stream, \ @@ -799,6 +800,7 @@ GrB_Info GB_jit_kernel_apply_bind2nd \ ( \ GB_void *Cx, \ GrB_Matrix A, \ + const bool do_iso_expansion, \ const GB_void *scalarx, \ cudaStream_t stream, \ int32_t gridsz, \ @@ -811,6 +813,7 @@ GrB_Info GB_jit_kernel_apply_unop \ ( \ GB_void *Cx, \ GrB_Matrix A, \ + const bool do_iso_expansion, \ const GB_void *ythunk, \ cudaStream_t stream, \ int32_t gridsz, \ diff --git a/Source/transpose/GB_transpose.c b/Source/transpose/GB_transpose.c index 8b689ab08e..fa4798746c 100644 --- a/Source/transpose/GB_transpose.c +++ b/Source/transpose/GB_transpose.c @@ -992,7 +992,7 @@ GrB_Info GB_transpose // C=A', C=(ctype)A' or C=op(A') { if (C->iso) { - // If C was constructed as iso; it needs to be expanded first, + // If C was constructed as iso, it needs to be expanded first, // but do not initialize the values. These are computed by // GB_apply_op below. GB_OK (GB_convert_any_to_non_iso (C, false)) ; @@ -1008,9 +1008,10 @@ GrB_Info GB_transpose // C=A', C=(ctype)A' or C=op(A') if (C->iso) { - // If C was constructed as iso; it needs to be expanded and - // initialized first. - GB_OK (GB_convert_any_to_non_iso (C, true)) ; + // If C was constructed as iso, it needs to be expanded + // and the values initialized to the iso value. The initialization + // is handled by GB_apply_op, so don't do it here. + GB_OK (GB_convert_any_to_non_iso (C, false)) ; } if (C->type == op->ztype)