DrTimothyAldenDavis · VidithM · Mar 12, 2025 · Mar 12, 2025
diff --git a/CUDA/GB_cuda_apply.hpp b/CUDA/GB_cuda_apply.hpp
@@ -20,6 +20,7 @@ GrB_Info GB_cuda_apply_unop_jit
     const GB_Operator op,
     const bool flipij,
     const GrB_Matrix A,
+    const bool do_iso_expansion,
     const GB_void *ythunk,
     // CUDA stream and launch parameters:
     cudaStream_t stream,
@@ -35,6 +36,7 @@ GrB_Info GB_cuda_apply_bind1st_jit
     const GrB_Type ctype,
     const GrB_BinaryOp op,
     const GrB_Matrix A,
+    const bool do_iso_expansion,
     const GB_void *xscalar,
     // CUDA stream and launch parameters:
     cudaStream_t stream,
@@ -50,6 +52,7 @@ GrB_Info GB_cuda_apply_bind2nd_jit
     const GrB_Type ctype,
     const GrB_BinaryOp op,
     const GrB_Matrix A,
+    const bool do_iso_expansion,
     const GB_void *xscalar,
     // CUDA stream and launch parameters:
     cudaStream_t stream,

diff --git a/CUDA/GB_cuda_apply_bind1st_jit.cpp b/CUDA/GB_cuda_apply_bind1st_jit.cpp
@@ -14,6 +14,7 @@ GrB_Info GB_cuda_apply_bind1st_jit
     const GrB_Type ctype,
     const GrB_BinaryOp binaryop,
     const GrB_Matrix B,
+    const bool do_iso_expansion,
     const GB_void *scalarx,
     // CUDA stream and launch parameters:
     cudaStream_t stream,
@@ -50,6 +51,6 @@ GrB_Info GB_cuda_apply_bind1st_jit
     //--------------------------------------------------------------------------
 
     GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ;
-    return (GB_jit_kernel (Cx, scalarx, B, stream, gridsz, blocksz,
-        &GB_callback)) ;
+    return (GB_jit_kernel (Cx, do_iso_expansion, scalarx, B, stream,
+        gridsz, blocksz, &GB_callback)) ;
 }
diff --git a/CUDA/GB_cuda_apply_bind2nd_jit.cpp b/CUDA/GB_cuda_apply_bind2nd_jit.cpp
@@ -14,6 +14,7 @@ GrB_Info GB_cuda_apply_bind2nd_jit
     const GrB_Type ctype,
     const GrB_BinaryOp binaryop,
     const GrB_Matrix A,
+    const bool do_iso_expansion,
     const GB_void *scalarx,
     // CUDA stream and launch parameters:
     cudaStream_t stream,
@@ -50,6 +51,6 @@ GrB_Info GB_cuda_apply_bind2nd_jit
     //--------------------------------------------------------------------------
 
     GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ;
-    return (GB_jit_kernel (Cx, A, scalarx, stream, gridsz, blocksz,
-        &GB_callback)) ;
+    return (GB_jit_kernel (Cx, A, do_iso_expansion, scalarx, stream,
+        gridsz, blocksz, &GB_callback)) ;
 }
diff --git a/CUDA/GB_cuda_apply_binop.cpp b/CUDA/GB_cuda_apply_binop.cpp
@@ -24,6 +24,7 @@ GrB_Info GB_cuda_apply_binop
     const GrB_Type ctype,
     const GrB_BinaryOp op,
     const GrB_Matrix A,
+    const bool do_iso_expansion,
     const GB_void *scalarx,
     const bool bind1st
 )
@@ -68,12 +69,12 @@ GrB_Info GB_cuda_apply_binop
     if (bind1st)
     {
         GB_OK (GB_cuda_apply_bind1st_jit (Cx, ctype, op, A, 
-            scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ;
+            do_iso_expansion, scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ;
     }
     else
     {
         GB_OK (GB_cuda_apply_bind2nd_jit (Cx, ctype, op, A,
-            scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ;
+            do_iso_expansion, scalarx_cuda, stream, gridsz, BLOCK_SIZE)) ;
     }
 
     GB_FREE_WORKSPACE ;

diff --git a/CUDA/GB_cuda_apply_unop.cpp b/CUDA/GB_cuda_apply_unop.cpp
@@ -25,6 +25,7 @@ GrB_Info GB_cuda_apply_unop
     const GB_Operator op,
     const bool flipij,
     const GrB_Matrix A,
+    const bool do_iso_expansion,
     const GB_void *ythunk
 )
 {
@@ -61,7 +62,7 @@ GrB_Info GB_cuda_apply_unop
     int32_t gridsz = std::min (raw_gridsz, (int64_t) (number_of_sms * 256)) ;
 
     GB_OK (GB_cuda_apply_unop_jit (Cx, ctype, op, flipij, A,
-        ythunk_cuda, stream, gridsz, BLOCK_SIZE)) ;
+        do_iso_expansion, ythunk_cuda, stream, gridsz, BLOCK_SIZE)) ;
 
     GB_FREE_WORKSPACE ;
     return GrB_SUCCESS ;

diff --git a/CUDA/GB_cuda_apply_unop_jit.cpp b/CUDA/GB_cuda_apply_unop_jit.cpp
@@ -15,6 +15,7 @@ GrB_Info GB_cuda_apply_unop_jit
     const GB_Operator op,
     const bool flipij,
     const GrB_Matrix A,
+    const bool do_iso_expansion,
     const GB_void *ythunk,
     // CUDA stream and launch parameters:
     cudaStream_t stream,
@@ -51,6 +52,6 @@ GrB_Info GB_cuda_apply_unop_jit
     //--------------------------------------------------------------------------
 
     GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ;
-    return (GB_jit_kernel (Cx, A, ythunk, stream, gridsz, blocksz,
-        &GB_callback)) ;
+    return (GB_jit_kernel (Cx, A, do_iso_expansion, ythunk, stream, gridsz,
+        blocksz, &GB_callback)) ;
 }
diff --git a/CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu b/CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu
@@ -5,6 +5,7 @@ using namespace cooperative_groups ;
 __global__ void GB_cuda_apply_bind1st_kernel
 (
     GB_void *Cx_out,
+    const bool do_iso_expansion,
     const GB_void *scalarx,
     GrB_Matrix B
 )
@@ -19,11 +20,22 @@ __global__ void GB_cuda_apply_bind1st_kernel
 
     GB_B_NHELD (nvals) ;
 
+    GB_C_TYPE iso_val ;
+    if (do_iso_expansion)
+    {
+        iso_val = Cx [0] ;
+    }
+
     int tid = blockDim.x * blockIdx.x + threadIdx.x;
     int nthreads = blockDim.x * gridDim.x ;
 
     for (int64_t p = tid ; p < nvals ; p += nthreads)
     {
+        if (do_iso_expansion)
+        {
+            Cx [p] = iso_val ;
+        }
+
         if (!GBb_B (Bb, p)) { continue ; }
         GB_DECLAREB (bij) ;
         GB_GETB (bij, Bx, p, false) ;
@@ -45,9 +57,15 @@ GB_JIT_CUDA_KERNEL_APPLY_BIND1ST_PROTO (GB_jit_kernel)
     GB_B_NHELD (nvals) ;
     if (nvals == 0) return (GrB_SUCCESS) ;
 
+    if (do_iso_expansion)
+    {
+        ASSERT (!B->iso) ;
+    }
+
     CUDA_OK (cudaGetLastError ( )) ;
     CUDA_OK (cudaStreamSynchronize (stream)) ;
-    GB_cuda_apply_bind1st_kernel <<<grid, block, 0, stream>>> (Cx, scalarx, B) ;
+    GB_cuda_apply_bind1st_kernel <<<grid, block, 0, stream>>> (Cx,
+        do_iso_expansion, scalarx, B) ;
     CUDA_OK (cudaGetLastError ( )) ;
     CUDA_OK (cudaStreamSynchronize (stream)) ;
 

diff --git a/CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu b/CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu
@@ -6,6 +6,7 @@ __global__ void GB_cuda_apply_bind2nd_kernel
 (
     GB_void *Cx_out,
     GrB_Matrix A,
+    const bool do_iso_expansion,
     const GB_void *scalarx
 )
 {
@@ -19,11 +20,22 @@ __global__ void GB_cuda_apply_bind2nd_kernel
 
     GB_A_NHELD (nvals) ;
 
+    GB_C_TYPE iso_val ;
+    if (do_iso_expansion)
+    {
+        iso_val = Cx [0] ;
+    }
+
     int tid = blockDim.x * blockIdx.x + threadIdx.x;
     int nthreads = blockDim.x * gridDim.x ;
 
     for (int64_t p = tid ; p < nvals ; p += nthreads)
     {
+        if (do_iso_expansion)
+        {
+            Cx [p] = iso_val ;
+        }
+
         if (!GBb_A (Ab, p)) { continue ; }
         GB_DECLAREA (aij) ;
         GB_GETA (aij, Ax, p, false) ;
@@ -45,9 +57,15 @@ GB_JIT_CUDA_KERNEL_APPLY_BIND2ND_PROTO (GB_jit_kernel)
     GB_A_NHELD (nvals) ;
     if (nvals == 0) return (GrB_SUCCESS) ;
 
+    if (do_iso_expansion)
+    {
+        ASSERT (!A->iso) ;
+    }
+
     CUDA_OK (cudaGetLastError ( )) ;
     CUDA_OK (cudaStreamSynchronize (stream)) ;
-    GB_cuda_apply_bind2nd_kernel <<<grid, block, 0, stream>>> (Cx, A, scalarx) ;
+    GB_cuda_apply_bind2nd_kernel <<<grid, block, 0, stream>>> (Cx, A,
+        do_iso_expansion, scalarx) ;
     CUDA_OK (cudaGetLastError ( )) ;
     CUDA_OK (cudaStreamSynchronize (stream)) ;
 

diff --git a/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu b/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu
@@ -10,6 +10,7 @@ using namespace cooperative_groups ;
 __global__ void GB_cuda_apply_unop_kernel
 (
     GB_void *Cx_out,
+    const bool do_iso_expansion,
     const GB_void *thunk,
     GrB_Matrix A
 )
@@ -46,6 +47,12 @@ __global__ void GB_cuda_apply_unop_kernel
         GB_Y_TYPE thunk_value = * ((GB_Y_TYPE *) thunk) ;
     #endif
 
+    GB_C_TYPE iso_val ;
+    if (do_iso_expansion)
+    {
+        iso_val = Cx [0] ;
+    }
+
     #if ( GB_A_IS_BITMAP || GB_A_IS_FULL )
         // bitmap/full case
         int tid = blockDim.x * blockIdx.x + threadIdx.x ;
@@ -61,6 +68,11 @@ __global__ void GB_cuda_apply_unop_kernel
             #if ( GB_DEPENDS_ON_J )
             int64_t col_idx = p / A->vlen ;
             #endif
+
+            if (do_iso_expansion)
+            {
+                Cx [p] = iso_val ;
+            }
 
             GB_UNOP (Cx, p, Ax, p, A_iso, row_idx, col_idx, thunk_value) ;
         }
@@ -84,6 +96,11 @@ __global__ void GB_cuda_apply_unop_kernel
                         int64_t k = GB_cuda_ek_slice_entry<GB_Ap_TYPE> (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ;
                         int64_t col_idx = GBh_A (Ah, k) ;
 
+                        if (do_iso_expansion)
+                        {
+                            Cx [p] = iso_val ;
+                        }
+
                         #if ( GB_DEPENDS_ON_I )
                         int64_t row_idx = GBi_A (Ai, p_final, A->vlen) ;
                         #endif
@@ -122,9 +139,15 @@ GB_JIT_CUDA_KERNEL_APPLY_UNOP_PROTO (GB_jit_kernel)
     GB_A_NHELD (anz) ;
     if (anz == 0) return (GrB_SUCCESS) ;
 
+    if (do_iso_expansion)
+    {
+        ASSERT (!A->iso) ;
+    }
+
     CUDA_OK (cudaGetLastError ( )) ;
     CUDA_OK (cudaStreamSynchronize (stream)) ;
-    GB_cuda_apply_unop_kernel <<<grid, block, 0, stream>>> (Cx, ythunk, A) ;
+    GB_cuda_apply_unop_kernel <<<grid, block, 0, stream>>> (Cx, 
+        do_iso_expansion, ythunk, A) ;
     CUDA_OK (cudaGetLastError ( )) ;
     CUDA_OK (cudaStreamSynchronize (stream)) ;
 

diff --git a/Source/apply/GB_apply.c b/Source/apply/GB_apply.c
@@ -295,7 +295,7 @@ GrB_Info GB_apply                   // C<M> = accum (C, op(A)) or op(A')
             { 
                 // expand C to non-iso; initialize C->x unless the op
                 // is positional
-                info = GB_convert_any_to_non_iso (C, !op_is_positional) ;
+                info = GB_convert_any_to_non_iso (C, false) ;
             }
             if (info == GrB_SUCCESS)
             { 

diff --git a/Source/apply/GB_apply_op.c b/Source/apply/GB_apply_op.c
@@ -62,13 +62,25 @@ GrB_Info GB_apply_op        // apply a unary op, idxunop, or binop, Cx = op (A)
     //--------------------------------------------------------------------------
 
     // A->x is not const since the operator might be applied in-place, if
-    // C is aliased to C.
+    // A is aliased to C.
 
     GB_void *Ax = (GB_void *) A->x ;        // A->x has type A->type
     const int8_t *Ab = A->b ;               // only if A is bitmap
     const GrB_Type Atype = A->type ;        // type of A->x
     const int64_t anz = GB_nnz_held (A) ;   // size of A->x and Cx
     #define GB_A_IS_BITMAP (Ab != NULL)
+    bool do_iso_expansion = false ;
+
+    if ((C_code_iso == GB_NON_ISO) &&
+        (Cx == Ax) &&
+        !GB_OPCODE_IS_POSITIONAL (op_in->opcode))
+    {
+        // We should have already realloc'd C->x using
+        // GB_convert_any_to_non_iso
+        ASSERT (!A->iso) ;
+        // Expand the iso value to all of C->x
+        do_iso_expansion = true ;
+    }
 
     //--------------------------------------------------------------------------
     // determine the maximum number of threads to use
@@ -186,10 +198,12 @@ GrB_Info GB_apply_op        // apply a unary op, idxunop, or binop, Cx = op (A)
         if (GB_cuda_apply_unop_branch (ctype, A, op))
         {
             info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A,
-                (GB_void *) &thunk) ;
+                false, (GB_void *) &thunk) ;
         }
         #endif
 
+        // No iso expansion needed as this is a non-user positional op
+
         //----------------------------------------------------------------------
         // positional op via the CPU factory kernel
         //----------------------------------------------------------------------
@@ -411,10 +425,18 @@ GrB_Info GB_apply_op        // apply a unary op, idxunop, or binop, Cx = op (A)
         #if defined ( GRAPHBLAS_HAS_CUDA )
         if (GB_cuda_apply_unop_branch (ctype, A, op))
         {
-            info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A, NULL) ;
+            info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A,
+                do_iso_expansion, NULL) ;
         }
         #endif
 
+        if ((info == GrB_NO_VALUE) && do_iso_expansion)
+        {
+            // This will just do the GB_iso_expand, since C->x
+            // has already been realloc'd to the right size
+            GB_convert_any_to_non_iso (A, true) ;
+        }
+
         //----------------------------------------------------------------------
         // unary op via the factory kernel
         //----------------------------------------------------------------------
@@ -556,10 +578,15 @@ GrB_Info GB_apply_op        // apply a unary op, idxunop, or binop, Cx = op (A)
             if (GB_cuda_apply_binop_branch (ctype, (GrB_BinaryOp) op, A))
             {
                 info = GB_cuda_apply_binop (Cx, ctype, (GrB_BinaryOp) op, A,
-                    scalarx, true) ;
+                    do_iso_expansion, scalarx, true) ;
             }
             #endif
 
+            if ((info == GrB_NO_VALUE) && do_iso_expansion)
+            {
+                GB_convert_any_to_non_iso (A, true) ;
+            }
+
             //------------------------------------------------------------------
             // binary op (bind 1st) via the CPU factory kernel
             //------------------------------------------------------------------
@@ -622,10 +649,15 @@ GrB_Info GB_apply_op        // apply a unary op, idxunop, or binop, Cx = op (A)
             if (GB_cuda_apply_binop_branch (ctype, (GrB_BinaryOp) op, A))
             {
                 info = GB_cuda_apply_binop (Cx, ctype, (GrB_BinaryOp) op, A,
-                scalarx, false) ;
+                    do_iso_expansion, scalarx, false) ;
             }
             #endif
 
+            if ((info == GrB_NO_VALUE) && do_iso_expansion)
+            {
+                GB_convert_any_to_non_iso (A, true) ;
+            }
+
             //------------------------------------------------------------------
             // binary op (bind 2nd) via the CPU factory kernel
             //------------------------------------------------------------------
@@ -758,10 +790,16 @@ GrB_Info GB_apply_op        // apply a unary op, idxunop, or binop, Cx = op (A)
         #if defined ( GRAPHBLAS_HAS_CUDA )
         if (GB_cuda_apply_unop_branch (ctype, A, op))
         {
-            info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A, ythunk) ;
+            info = GB_cuda_apply_unop (Cx, ctype, op, flipij, A,
+                do_iso_expansion, ythunk) ;
         }
         #endif
 
+        if ((info == GrB_NO_VALUE) && do_iso_expansion)
+        {
+            GB_convert_any_to_non_iso (A, true) ;
+        }
+
         //----------------------------------------------------------------------
         // user-defined index-unary op via the JIT or PreJIT kernel
         //----------------------------------------------------------------------