Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ export OBJS := $(OBJS) $(SRCDIR)/PETSc_Helperk.o \
$(SRCDIR)/MatDiagDomk.o \
$(SRCDIR)/PMISR_Modulek.o \
$(SRCDIR)/DDC_Modulek.o \
$(SRCDIR)/Gmres_Polyk.o
$(SRCDIR)/Gmres_Polyk.o \
$(SRCDIR)/SAI_Zk.o
endif

OBJS := $(OBJS) $(SRCDIR)/PETSc_Helper.o \
Expand Down
6 changes: 3 additions & 3 deletions docs/new_methods.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ PCPFLAREINV contains methods for computing approximate inverses, most of which c
| newton_no_extra | PFLAREINV_NEWTON_NO_EXTRA | GMRES polynomial, applied as a Newton polynomial, with roots computed with an Arnoldi method and with no extra roots added | Matrix-free: Yes Assembled: No |
| neumann | PFLAREINV_NEUMANN | Neumann polynomial | Yes |
| sai | PFLAREINV_SAI | Sparse approximate inverse | No |
| isai | PFLAREINV_ISAI | Incomplete sparse approximate inverse (equivalent to a one-level RAS) | No |
| isai | PFLAREINV_ISAI | Incomplete sparse approximate inverse (equivalent to a one-level RAS) | Yes |
| wjacobi | PFLAREINV_WJACOBI | Weighted Jacobi | Partial |
| jacobi | PFLAREINV_JACOBI | Jacobi | Yes |

Expand All @@ -27,9 +27,9 @@ PCAIR contains different types of reduction multigrids. PCAIR can be used with t
| product | power, arnoldi or newton | AIRG | Yes |
| product | neumann | nAIR with Neumann smoothing | Yes |
| product | sai | SAI reduction multigrid | No |
| product | isai | ISAI reduction multigrid | No |
| product | isai | ISAI reduction multigrid | Yes |
| product | wjacobi or jacobi | Distance 0 reduction multigrid | Yes |
| lair | wjacobi or jacobi | lAIR | No |
| lair | wjacobi or jacobi | lAIR | Yes |
| lair_sai | wjacobi or jacobi | SAI version of lAIR | No |

Different combinations of these types can also be used, e.g., ``-pc_air_z_type lair -pc_air_inverse_type power`` uses a lAIR grid transfer operator and GMRES polynomial smoothing with the power basis.
Expand Down
21 changes: 21 additions & 0 deletions include/kokkos_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include <Kokkos_Core.hpp>
#include <Kokkos_DualView.hpp>
#include <KokkosSparse_spadd.hpp>
#include <Kokkos_NestedSort.hpp>
#include <KokkosBatched_Gesv.hpp>

using DefaultExecutionSpace = Kokkos::DefaultExecutionSpace;
using DefaultMemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
Expand Down Expand Up @@ -125,4 +127,23 @@ namespace Kokkos {
};
}

// Binary search for target in a sorted array, returns the index or -1 if not found.
//
// sorted_view : Kokkos view (or any object with operator()(PetscInt)) whose first
//               `size` entries are sorted in ascending order
// size        : number of valid entries in sorted_view
// target      : value to locate
//
// Returns the index of target in [0, size), or -1 when absent. With duplicate
// entries, any one matching index may be returned (not necessarily the first).
// Marked KOKKOS_INLINE_FUNCTION so it is callable from device kernels.
template <typename ViewType>
KOKKOS_INLINE_FUNCTION
PetscInt binary_search_sorted(const ViewType &sorted_view, const PetscInt size, const PetscInt target)
{
   PetscInt lo = 0, hi = size - 1;
   while (lo <= hi)
   {
      // Overflow-safe midpoint: the naive (lo + hi) / 2 can overflow PetscInt
      // when lo + hi exceeds the representable range for very large views
      const PetscInt mid = lo + (hi - lo) / 2;
      if (sorted_view(mid) == target)
         return mid;
      else if (sorted_view(mid) < target)
         lo = mid + 1;
      else
         hi = mid - 1;
   }
   // Not found
   return -1;
}

#endif
19 changes: 16 additions & 3 deletions src/C_PETSc_Interfaces.F90
Original file line number Diff line number Diff line change
Expand Up @@ -550,10 +550,23 @@ subroutine mat_mult_powers_share_sparsity_kokkos(A_array, poly_order, poly_spars
integer(c_int), value :: reuse_int_cmat, reuse_int_reuse_mat
end subroutine mat_mult_powers_share_sparsity_kokkos

end interface
end interface

interface

! C-interoperable declaration of the Kokkos SAI/ISAI Z computation implemented
! in C++ (bound to the C symbol "calculate_and_build_sai_z_kokkos").
! The matrix arguments are PETSc Mat handles passed as raw 64-bit addresses
! (the %v component of the Fortran tMat type), by reference so the C side can
! update them. NOTE(review): these handle arguments intentionally lack the
! `value` attribute — the callee writes new handles back (e.g. z_array).
subroutine calculate_and_build_sai_z_kokkos(A_ff_array, A_cf_array, sparsity_array, &
reuse_int_reuse_mat, reuse_array, z_array) &
bind(c, name="calculate_and_build_sai_z_kokkos")
use iso_c_binding
! Input matrix handles: A_ff, A_cf and the CF sparsity pattern matrix
integer(c_long_long) :: A_ff_array, A_cf_array, sparsity_array
! In/out handles: reuse structure and the output Z matrix
integer(c_long_long) :: reuse_array, z_array
! Flag (0/1) indicating whether reuse_array holds a valid matrix to reuse;
! passed by value as it is input-only
integer(c_int), value :: reuse_int_reuse_mat
end subroutine calculate_and_build_sai_z_kokkos

end interface

interface

interface

subroutine mat_duplicate_copy_plus_diag_kokkos(A_array, reuse_int, B_array) &
bind(c, name="mat_duplicate_copy_plus_diag_kokkos")
use iso_c_binding
Expand Down
140 changes: 135 additions & 5 deletions src/SAI_Z.F90
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ module sai_z
use binary_tree, only: itree, itree2vector, flush_tree
use sorting, only: create_knuth_shuffle_tree_array, intersect_pre_sorted_indices_only, &
merge_pre_sorted, sorted_binary_search
use c_petsc_interfaces, only: mat_mat_symbolic_c
use petsc_helper, only: generate_identity
use c_petsc_interfaces, only: mat_mat_symbolic_c, calculate_and_build_sai_z_kokkos
use petsc_helper, only: generate_identity, kokkos_debug, destroy_matrix_reuse, MatAXPYWrapper
use pflare_parameters, only: AIR_Z_PRODUCT, AIR_Z_LAIR, AIR_Z_LAIR_SAI, PFLAREINV_SAI, PFLAREINV_ISAI

#include "petsc/finclude/petscksp.h"
Expand All @@ -18,7 +18,8 @@ module sai_z

! -------------------------------------------------------------------------------------------------------------------------------

subroutine calculate_and_build_sai_z(A_ff_input, A_cf, sparsity_mat_cf, incomplete, reuse_mat, reuse_submatrices, z)
subroutine calculate_and_build_sai_z_cpu(A_ff_input, A_cf, sparsity_mat_cf, incomplete, &
reuse_mat, reuse_submatrices, z, no_approx_solve)

! Computes an approximation to z using sai/isai
! If incomplete is true then this is lAIR
Expand All @@ -30,6 +31,7 @@ subroutine calculate_and_build_sai_z(A_ff_input, A_cf, sparsity_mat_cf, incomple
logical, intent(in) :: incomplete
type(tMat), intent(inout) :: reuse_mat, z
type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices
logical, intent(in), optional :: no_approx_solve

! Local variables
PetscInt :: local_rows, local_cols, ncols, global_row_start, global_row_end_plus_one, ncols_sparsity_max
Expand All @@ -55,7 +57,7 @@ subroutine calculate_and_build_sai_z(A_ff_input, A_cf, sparsity_mat_cf, incomple
type(itree) :: i_rows_tree
PetscReal, dimension(:), allocatable :: work
type(tVec) :: solution, rhs, diag_vec
logical :: approx_solve
logical :: approx_solve, disable_approx_solve
type(tMat) :: Ao, Ad, temp_mat
type(tKSP) :: ksp
type(tPC) :: pc
Expand All @@ -73,6 +75,9 @@ subroutine calculate_and_build_sai_z(A_ff_input, A_cf, sparsity_mat_cf, incomple

! ~~~~~~

disable_approx_solve = .FALSE.
if (present(no_approx_solve)) disable_approx_solve = no_approx_solve

call PetscObjectGetComm(A_ff_input, MPI_COMM_MATRIX, ierr)
! Get the comm size
call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode)
Expand Down Expand Up @@ -375,7 +380,7 @@ subroutine calculate_and_build_sai_z(A_ff_input, A_cf, sparsity_mat_cf, incomple
end if

! If we have a big system to solve, do its iteratively
if (size(i_rows) > 40 .OR. j_size > 40) approx_solve = .TRUE.
if ((size(i_rows) > 40 .OR. j_size > 40) .AND. .NOT. disable_approx_solve) approx_solve = .TRUE.

! This determines the indices of J^* in I*
! Both i_rows and j_rows are global indices
Expand Down Expand Up @@ -631,6 +636,131 @@ subroutine calculate_and_build_sai_z(A_ff_input, A_cf, sparsity_mat_cf, incomple
call MatAssemblyBegin(z, MAT_FINAL_ASSEMBLY, ierr)
call MatAssemblyEnd(z, MAT_FINAL_ASSEMBLY, ierr)

end subroutine calculate_and_build_sai_z_cpu

! -------------------------------------------------------------------------------------------------------------------------------

subroutine calculate_and_build_sai_z(A_ff_input, A_cf, sparsity_mat_cf, incomplete, reuse_mat, reuse_submatrices, z)

! Wrapper around calculate_and_build_sai_z_cpu and calculate_and_build_sai_z_kokkos
! Dispatches to Kokkos for the incomplete (lAIR & isai) case with aijkokkos matrices
!
! Arguments:
!   A_ff_input, A_cf   - F-F and C-F blocks of the system matrix (read only)
!   sparsity_mat_cf    - matrix whose sparsity pattern Z is constrained to
!   incomplete         - if true compute the incomplete (ISAI/lAIR) variant
!   reuse_mat          - reuse structure; non-null handle triggers reuse of a
!                        previous symbolic setup
!   reuse_submatrices  - CPU-path reuse submatrices (untouched on Kokkos path)
!   z                  - output approximation; if null on entry it is created
!                        with the sparsity of sparsity_mat_cf

! ~~~~~~
type(tMat), intent(in) :: A_ff_input, A_cf, sparsity_mat_cf
logical, intent(in) :: incomplete
type(tMat), intent(inout) :: reuse_mat, z
type(tMat), dimension(:), pointer, intent(inout) :: reuse_submatrices

#if defined(PETSC_HAVE_KOKKOS)
! Raw 64-bit handles for the C interop call
integer(c_long_long) :: A_ff_arr, A_cf_arr, sparsity_arr, reuse_arr, z_arr
integer :: errorcode, reuse_int
PetscErrorCode :: ierr
MatType :: mat_type
logical :: reuse_triggered
type(tMat) :: temp_mat, temp_mat_reuse, temp_mat_compare
type(tMat) :: reuse_mat_cpu
type(tMat), dimension(:), pointer :: reuse_submatrices_cpu
PetscScalar :: normy
type(tVec) :: max_vec
PetscInt :: row_loc
#endif
! ~~~~~~

#if defined(PETSC_HAVE_KOKKOS)

! Only dispatch to the Kokkos implementation for aijkokkos matrix types
! AND the incomplete (ISAI/lAIR) variant; everything else falls through
! to the CPU implementation below
call MatGetType(A_ff_input, mat_type, ierr)
if ((mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. &
mat_type == MATAIJKOKKOS) .AND. incomplete) then

! We're enforcing the same sparsity
! If not re-using
if (PetscObjectIsNull(z)) then
call MatDuplicate(sparsity_mat_cf, MAT_DO_NOT_COPY_VALUES, z, ierr)
end if

! We know we will never have non-zero locations outside of the sparsity
call MatSetOption(z, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE, ierr)
call MatSetOption(z, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE, ierr)
call MatSetOption(z, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE, ierr)

! Extract opaque pointers for C interop
A_ff_arr = A_ff_input%v
A_cf_arr = A_cf%v
sparsity_arr = sparsity_mat_cf%v
z_arr = z%v

! A non-null reuse_mat handle signals that the symbolic setup from a
! previous call should be reused
reuse_triggered = .NOT. PetscObjectIsNull(reuse_mat)
reuse_int = 0
if (reuse_triggered) reuse_int = 1

! Call the Kokkos implementation
call calculate_and_build_sai_z_kokkos(A_ff_arr, A_cf_arr, sparsity_arr, &
reuse_int, reuse_arr, z_arr)

! Copy back the opaque pointers (the C side may have replaced them)
reuse_mat%v = reuse_arr
z%v = z_arr

! If debugging do a comparison between CPU and Kokkos results
if (kokkos_debug()) then

! If we're doing reuse and debug, then we have to always output the result
! from the cpu version, as it will have coo preallocation structures set
if (reuse_triggered) then
temp_mat = z
call MatConvert(z, MATSAME, MAT_INITIAL_MATRIX, temp_mat_compare, ierr)
else
! NOTE(review): in this branch temp_mat is never assigned before being
! passed as z to the CPU routine below — relies on tMat default
! initialisation being a null handle so the CPU routine creates it; confirm
temp_mat_compare = z
end if

! We send in an empty reuse_mat_cpu here always, as we can't pass through
! the same one Kokkos uses as it now only gets out the non-local rows we need
! We also disable the approximate solve if the size of any of the dense
! matrices are greater than 40x40, as the kokkos code always does direct solves
reuse_submatrices_cpu => null()
call calculate_and_build_sai_z_cpu(A_ff_input, A_cf, sparsity_mat_cf, incomplete, &
reuse_mat_cpu, reuse_submatrices_cpu, temp_mat, &
no_approx_solve = .TRUE.)
call destroy_matrix_reuse(reuse_mat_cpu, reuse_submatrices_cpu)

! Duplicate the CPU result so we can form the difference without
! destroying it
call MatConvert(temp_mat, MATSAME, MAT_INITIAL_MATRIX, &
temp_mat_reuse, ierr)

! temp_mat_reuse := temp_mat_reuse - temp_mat_compare (CPU minus Kokkos)
call MatAXPYWrapper(temp_mat_reuse, -1d0, temp_mat_compare)
! Find the biggest entry in the difference
call MatCreateVecs(temp_mat_reuse, PETSC_NULL_VEC, max_vec, ierr)
call MatGetRowMaxAbs(temp_mat_reuse, max_vec, PETSC_NULL_INTEGER_POINTER, ierr)
call VecMax(max_vec, row_loc, normy, ierr)
call VecDestroy(max_vec, ierr)

! There is floating point compute in these inverses, so we have to be a
! bit more tolerant to rounding differences
! normy/=normy is a NaN check (NaN is the only value not equal to itself)
if (normy .gt. 4d-11 .OR. normy/=normy) then
print *, "Diff Kokkos and CPU calculate_and_build_sai_z", normy, "row", row_loc

! Hard abort in debug mode if the two implementations disagree
call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode)
end if
call MatDestroy(temp_mat_reuse, ierr)
! In debug mode the CPU result (temp_mat) is what we return in z, so
! destroy whichever handle is not being returned
if (.NOT. reuse_triggered) then
call MatDestroy(z, ierr)
else
call MatDestroy(temp_mat_compare, ierr)
end if
z = temp_mat
end if

else

! Non-Kokkos matrix type or not the incomplete variant: CPU path
call calculate_and_build_sai_z_cpu(A_ff_input, A_cf, sparsity_mat_cf, incomplete, &
reuse_mat, reuse_submatrices, z)

end if
#else
! PETSc built without Kokkos: always use the CPU implementation
call calculate_and_build_sai_z_cpu(A_ff_input, A_cf, sparsity_mat_cf, incomplete, &
reuse_mat, reuse_submatrices, z)
#endif

end subroutine calculate_and_build_sai_z

! -------------------------------------------------------------------------------------------------------------------------------
Expand Down
Loading
Loading