diff --git a/Makefile b/Makefile index 8a976078..56b3a825 100644 --- a/Makefile +++ b/Makefile @@ -105,7 +105,10 @@ ifeq ($(PETSC_HAVE_KOKKOS),1) export OBJS := $(OBJS) $(SRCDIR)/PETSc_Helperk.o \ $(SRCDIR)/Grid_Transferk.o \ $(SRCDIR)/VecISCopyLocalk.o \ - $(SRCDIR)/PMISR_DDCk.o \ + $(SRCDIR)/Device_Datak.o \ + $(SRCDIR)/MatDiagDomk.o \ + $(SRCDIR)/PMISR_Modulek.o \ + $(SRCDIR)/DDC_Modulek.o \ $(SRCDIR)/Gmres_Polyk.o endif @@ -116,9 +119,13 @@ OBJS := $(OBJS) $(SRCDIR)/PETSc_Helper.o \ $(SRCDIR)/AIR_MG_Stats.o \ $(SRCDIR)/SAI_Z.o \ $(SRCDIR)/Constrain_Z_or_W.o \ - $(SRCDIR)/PMISR_DDC.o \ + $(SRCDIR)/MatDiagDom.o \ + $(SRCDIR)/SAbs.o \ + $(SRCDIR)/DDC_Module.o \ + $(SRCDIR)/PMISR_Module.o \ $(SRCDIR)/Aggregation.o \ $(SRCDIR)/CF_Splitting.o \ + $(SRCDIR)/MatDiagDomSubmatrix.o \ $(SRCDIR)/Repartition.o \ $(SRCDIR)/Timers.o \ $(SRCDIR)/Weighted_Jacobi.o \ diff --git a/Makefile.deps b/Makefile.deps index 9add80f2..26e17b6e 100644 --- a/Makefile.deps +++ b/Makefile.deps @@ -7,15 +7,18 @@ src/AIR_Operators_Setup.o : src/AIR_Operators_Setup.F90 src/PETSc_Helper.o src/S src/Approx_Inverse_Setup.o : src/Approx_Inverse_Setup.F90 src/Matshell_Data_Type.o src/PETSc_Helper.o src/Repartition.o src/SAI_Z.o src/Weighted_Jacobi.o src/Neumann_Poly.o src/Gmres_Poly_Newton.o src/Gmres_Poly.o src/Pflare_Parameters.o src/TSQR.o src/Binary_Tree.o : src/Binary_Tree.F90 src/C_Fortran_Bindings.o : src/C_Fortran_Bindings.F90 src/AIR_Data_Type_Routines.o src/CF_Splitting.o src/Approx_Inverse_Setup.o src/PCAIR_Shell.o src/PCAIR_Data_Type.o -src/CF_Splitting.o : src/CF_Splitting.F90 src/PETSc_Helper.o src/Aggregation.o src/C_PETSc_Interfaces.o src/PMISR_DDC.o src/Pflare_Parameters.o +src/CF_Splitting.o : src/CF_Splitting.F90 src/PETSc_Helper.o src/Aggregation.o src/C_PETSc_Interfaces.o src/SAbs.o src/DDC_Module.o src/PMISR_Module.o src/Pflare_Parameters.o src/Constrain_Z_or_W.o : src/Constrain_Z_or_W.F90 src/PETSc_Helper.o src/C_PETSc_Interfaces.o src/C_PETSc_Interfaces.o : 
src/C_PETSc_Interfaces.F90 +src/DDC_Module.o : src/DDC_Module.F90 src/MatDiagDom.o src/Pflare_Parameters.o src/PMISR_Module.o src/SAbs.o src/C_PETSc_Interfaces.o src/PETSc_Helper.o src/FC_Smooth.o : src/FC_Smooth.F90 src/Matshell_Data_Type.o src/PETSc_Helper.o src/AIR_Data_Type.o src/C_PETSc_Interfaces.o src/Gmres_Poly_Data_Type.o : src/Gmres_Poly_Data_Type.F90 src/TSQR.o src/Gmres_Poly.o : src/Gmres_Poly.F90 src/PETSc_Helper.o src/Gmres_Poly_Data_Type.o src/TSQR.o src/Matshell_Data_Type.o src/Pflare_Parameters.o src/C_PETSc_Interfaces.o src/Sorting.o src/Gmres_Poly_Newton.o : src/Gmres_Poly_Newton.F90 src/C_PETSc_Interfaces.o src/Gmres_Poly.o src/Grid_Transfer.o : src/Grid_Transfer.F90 src/PETSc_Helper.o src/C_PETSc_Interfaces.o src/Grid_Transfer_Improve.o : src/Grid_Transfer_Improve.F90 src/PETSc_Helper.o src/Pflare_Parameters.o src/Timers.o +src/MatDiagDom.o : src/MatDiagDom.F90 src/Pflare_Parameters.o src/C_PETSc_Interfaces.o src/PETSc_Helper.o +src/MatDiagDomSubmatrix.o : src/MatDiagDomSubmatrix.F90 src/Pflare_Parameters.o src/PETSc_Helper.o src/CF_Splitting.o src/Matshell_Data_Type.o : src/Matshell_Data_Type.F90 src/Pflare_Parameters.o src/AIR_Data_Type.o src/Neumann_Poly.o : src/Neumann_Poly.F90 src/Pflare_Parameters.o src/TSQR.o src/Matshell_Data_Type.o src/PETSc_Helper.o src/Gmres_Poly.o src/PCAIR_C_Fortran_Bindings.o : src/PCAIR_C_Fortran_Bindings.F90 src/PCAIR_Interfaces.o @@ -26,8 +29,9 @@ src/PCPFLAREINV_Interfaces.o : src/PCPFLAREINV_Interfaces.F90 src/PETSc_Helper.o : src/PETSc_Helper.F90 src/C_PETSc_Interfaces.o src/PFLARE.o : src/PFLARE.F90 src/Pflare_Parameters.o src/PCAIR_Interfaces.o src/PCPFLAREINV_Interfaces.o src/Pflare_Parameters.o : src/Pflare_Parameters.F90 -src/PMISR_DDC.o : src/PMISR_DDC.F90 src/Pflare_Parameters.o src/C_PETSc_Interfaces.o src/PETSc_Helper.o +src/PMISR_Module.o : src/PMISR_Module.F90 src/Pflare_Parameters.o src/C_PETSc_Interfaces.o src/PETSc_Helper.o src/Repartition.o : src/Repartition.F90 src/PETSc_Helper.o 
src/C_PETSc_Interfaces.o +src/SAbs.o : src/SAbs.F90 src/PETSc_Helper.o src/SAI_Z.o : src/SAI_Z.F90 src/Pflare_Parameters.o src/PETSc_Helper.o src/C_PETSc_Interfaces.o src/Sorting.o src/Binary_Tree.o src/Sorting.o : src/Sorting.F90 src/Binary_Tree.o src/Timers.o : src/Timers.F90 src/Pflare_Parameters.o diff --git a/README.md b/README.md index 9037b05d..1d3d4259 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ PFLARE adds new methods to PETSc, including: 1) Polynomial approximate inverses, e.g., GMRES and Neumann polynomials 2) Reduction multigrids, e.g., AIRG, nAIR and lAIR 3) CF splittings, e.g., PMISR DDC +4) Extracting diagonally dominant submatrices ## Quick start @@ -46,12 +47,13 @@ For details about PFLARE, please see: | [docs/gpus.md](docs/gpus.md) | Using GPUs with PFLARE | | [docs/reuse.md](docs/reuse.md) | Re-using components of PFLARE | | [docs/options.md](docs/options.md) | List of the options available in PFLARE | +| [docs/faq.md](docs/faq.md) | Frequently asked questions and help! | and the Jupyter notebooks: | Path | Contents | |---|---| -| [notebooks/01_getting_started.ipynb](notebooks/01_getting_started.ipynb) | Introduce PFLARE | +| [notebooks/01_getting_started.ipynb](notebooks/01_getting_started.ipynb) | Introduction to PFLARE | | [notebooks/02_pcpflareinv.ipynb](notebooks/02_pcpflareinv.ipynb) | Examine some of the approximate inverses found in PCPFLAREINV | | [notebooks/03_cf_splitting.ipynb](notebooks/03_cf_splitting.ipynb) | Visualise the C/F splitting and explore the PMISR-DDC algorithm | | [notebooks/04_pcair.ipynb](notebooks/04_pcair.ipynb) | Introduction to PCAIR and the AIRG method | diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 00000000..c45b0501 --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,170 @@ +# Frequently asked questions + +Listed below are some answers to frequently asked questions when using PFLARE. 
The Jupyter notebooks in the `notebooks/` directory are also a good place to start for understanding the methods in PFLARE. + +## How to improve convergence with `PCAIR` + +Below are some of the factors to consider when trying to improve convergence with `PCAIR`. This is not an exhaustive list, but covers the most common improvements. + +Reduction multigrid methods like AIR can become direct solvers as the approximation to $A_{ff}^{-1}$ improves; this means there is almost always a route to better convergence. In practice, of course, this limit is not the most performant or memory efficient, especially in parallel. The goal is typically to strike a balance between convergence, memory usage and parallel performance. + +### Quick fix + +If the default options do not give good convergence, try: + +`-pc_air_diag_scale_polys -pc_air_coarsest_diag_scale_polys -pc_air_a_lump -pc_air_cf_splitting_type diag_dom -pc_air_strong_threshold 0.3 -pc_air_a_drop 1e-6 -pc_air_r_drop 1e-3` + +If that did not work or for further understanding of these parameters, please see below. + +### 1) Diagonal scaling + +AIR methods work best when the rows of the matrix are all scaled roughly the same. Applying a diagonal scaling before solving with `PCAIR` can help improve convergence. + +If it is difficult to manually apply a diagonal scaling (e.g., inside a non-linear or time-stepping loop), the options `-pc_air_diag_scale_polys` and `-pc_air_coarsest_diag_scale_polys` can be used to scale the diagonals of $A_{ff}$ on each level during the multigrid setup. This can often restore good convergence when poor scaling is the issue. + +For DG FEM discretisations, a block diagonal scaling is typically required, where each block is the inverse of the element matrix. This is particularly important when using high-order basis functions and must be applied prior to solving with `PCAIR`. 
+ +If you suspect scaling is still responsible for poor convergence, try using an algebraic approximate inverse method which is agnostic to scalings. For example, the ISAI (Incomplete Sparse Approximate Inverse) can be used with `-pc_air_inverse_type isai` instead of GMRES polynomials. + +### 2) Lumping + +A relative row-wise drop tolerance is applied to the coarse matrix on each level of the multigrid hierarchy. By default, dropped entries are simply discarded. + +Most discretisations benefit from lumping dropped entries to the diagonal instead, which can be enabled with `-pc_air_a_lump`. + +### 3) Coarsening rate + +Slowing the coarsening rate improves the diagonal dominance of $A_{ff}$ and hence improves the quality of the approximate inverses used to build the hierarchy. The quality of the CF splitting is central to how well reduction multigrids work: the goal is to find a large $A_{ff}$ submatrix whose inverse can be well approximated by a sparse (or low-order polynomial) approximation. + +Several parameters control how fast the default CF splitting (PMISR DDC) coarsens. + +**Strong threshold:** The most impactful parameter is the strong threshold, set with `-pc_air_strong_threshold` (default 0.5). This determines which connections between unknowns are considered "strong". Decreasing it slows the coarsening and should improve convergence. + +Internally, PMISR computes a maximal independent set in the symmetrized strong connections ($S + S^T$). This ensures that no two F-points are strongly connected, keeping large off-diagonal entries out of $A_{ff}$. + +**DDC parameters:** The second pass of PMISR DDC performs a diagonal dominance cleanup (DDC), which converts the least diagonally dominant F-points to C-points. The number of DDC iterations can be increased with `-pc_air_ddc_its` (default 1) and the fraction of local F-points changed per iteration can be modified with `-pc_air_ddc_fraction` (default 0.1, i.e., 10% of F-points per iteration). 
It is often helpful to do more iterations but change fewer F-points per iteration (e.g., `-pc_air_ddc_its 3 -pc_air_ddc_fraction 0.01`). + +**Direct diagonal dominance control:** Modifying and balancing the strong threshold, DDC iterations and DDC fraction to get optimal performance can be difficult. The easiest way to automate this is to control the diagonal dominance ratio of $A_{ff}$ directly. + +Setting `-pc_air_cf_splitting_type diag_dom` switches to a CF splitting that directly enforces a maximum row-wise diagonal dominance ratio. With this splitting, `-pc_air_strong_threshold` controls the maximum allowed dominance ratio in every row. Decreasing this value from the default 0.5 will slow the coarsening and improve convergence, e.g., test 0.4, 0.3, 0.2, 0.1. + +The `-pc_air_ddc_its` and `-pc_air_ddc_fraction` options are ignored with this CF splitting. This splitting is more expensive to compute than the default PMISR DDC, but can give more reliable and direct control over convergence. + +### 4) Drop tolerances + +Relative row-wise drop tolerances are applied to both the coarse matrix and the approximate ideal grid-transfer operators at every level in the multigrid hierarchy. + +The tolerance on the coarse matrix is controlled with `-pc_air_a_drop` (default 1e-4). + +The tolerance on the grid-transfer operators is controlled with `-pc_air_r_drop` (default 1e-2). + +Decreasing both of these values improves convergence but requires more memory. Decreasing `-pc_air_a_drop` is often more impactful. For example, when solving advection problems, values of `-pc_air_a_drop 1e-6` and `-pc_air_r_drop 1e-3` have been found to give scalable results up to hundreds of billions of unknowns. + +### 5) Further options + +In rough order of importance, additional options that can improve convergence include: + +- Add C-point smoothing with `-pc_air_smooth_type fc`, or multiple iterations of smoothing, e.g., FCF smoothing with `-pc_air_smooth_type fcf`. 
Note that C-point smoothing requires storing more matrices on each level and hence uses more memory. +- Increase the non-zeros retained in the assembled approximate inverse with `-pc_air_inverse_sparsity_order 2` (default 1). The sparsity order controls how much fill-in is allowed. Higher orders give a better approximation to $A_{ff}^{-1}$ but use more memory and have a more expensive setup. +- Use an approximate ideal prolongator instead of the default classical one-point prolongator with `-pc_air_one_point_classical_prolong false`. +- Improve the approximate ideal operators with Richardson iterations, e.g., `-pc_air_improve_z_its 1 -pc_air_improve_w_its 1`. + +## How to improve scalability with `PCAIR` + +If the iteration count is growing during weak scaling studies, it can usually be improved by following the convergence steps above. Experience with `PCAIR` has shown that starting with options that give a well-converging solve (e.g., solving to a relative tolerance of 1e-10 in 5 or 6 iterations) will typically lead to low growth in iteration count with refinement. + +Starting from a solve that requires 10-20 (or more) iterations almost never leads to scalable iteration counts. Think of reduction multigrid smoothers more like solvers than smoothers: you want all error modes to go to zero, not just a subset. If the solve is not converging well on a coarse problem, it will only get worse at scale. + +## How to improve parallel performance with `PCAIR` + +Once good convergence (and a scalable iteration count) has been achieved, the focus can turn to ensuring the solve time itself is scalable. Growth in the run time in parallel (separate from the iteration count) is typically caused by too many levels in the hierarchy. On coarse levels, the amount of work decreases but the relative amount of communication increases, creating bottlenecks. + +`PCAIR` provides several tools to address this. The strategies below can also be combined. 
+ +### 1) Processor agglomeration + +Processor agglomeration decreases the number of active MPI ranks on lower levels and enables repartitioning with ParMETIS. This is enabled with `-pc_air_processor_agglom` (on by default). + +Repartitioning the coarse grids is important for performance. It improves the ratio of local to non-local work on lower levels, which directly affects the cost of both the matrix-matrix products in the setup and the matrix-vector products in the solve. Without repartitioning, the setup time can grow very quickly, as the matrix-matrix products used to compute the restrictor and coarse grid matrix become communication bound in the middle of the hierarchy. + +This is typically visible in the cost of SpGEMMs during the setup increasing considerably in the middle of the hierarchy. Using the option `-pc_air_print_stats_timings` outputs the cumulative timers across the setup and the Python script `tools/parse_pflare_output.py` can be used to extract timings from the saved terminal output. + +The repartitioning is triggered when the average ratio of local to non-local non-zeros drops below `-pc_air_processor_agglom_ratio` (default 2). When triggered, the number of active MPI ranks is halved (matching the expected coarsening rate of around 1/2 in advection-type problems). + +### 2) Truncating the hierarchy + +The multigrid hierarchy can be truncated early and an iterative coarse-grid solver used instead, often without any change in iteration count. This is one of the most effective ways to improve parallel performance, both on CPUs and GPUs. The key insight is that the lower levels of the hierarchy have very little local work but still incur communication costs (and kernel launch overheads on GPUs). Replacing these levels with an iterative solver that only requires matrix-vector products avoids these bottlenecks entirely. 
+ +**Automatic truncation:** The option `-pc_air_auto_truncate_start_level 1` enables automatic truncation of the multigrid hierarchy. It starts from the specified level and tests if the coarse grid solver can solve the current coarse problem to a given tolerance. If so, the hierarchy is truncated there. + +The tolerance is controlled with `-pc_air_auto_truncate_tol` (default 1e-14). A surprisingly loose tolerance often works well; for advection problems, values between 1e-1 and 1e-3 have been found to allow heavy truncation while giving the same iteration count as no truncation. + +If the iteration count increases with truncation, reduce this tolerance. If the current tolerance is performing well, try increasing it to allow more aggressive truncation and fewer levels. + +**Estimating the start level:** Testing the coarse solver at every level starting from the top of the hierarchy can be expensive and requires more memory. For production runs, the start level should be set based on a rough estimate from smaller problems. Given that the coarsening rate in advection problems is around 1/2, the number of levels can be estimated from the number of unknowns. + +**Choosing a coarse grid solver:** A high-order GMRES polynomial in Newton form applied matrix-free is well suited as a coarse grid solver, particularly on GPUs. For example: + +`-pc_air_coarsest_poly_order 100 -pc_air_coarsest_inverse_type newton -pc_air_coarsest_matrix_free_polys` + +This only requires matrix-vector products (no dot products) and uses asynchronous communication, making it well suited to coarse levels. + +**Impact on cycle complexity:** Truncation increases the cycle complexity (amount of work per V-cycle) because the coarse grid solver does more work. However, this extra work is all matrix-vector products, which often costs less wall time than the communication-bound operations on the truncated levels. 
In GPU runs, the cycle complexity can increase by 2x or more while the solve time stays the same or even decreases. As you weak scale and add more levels, the relative cost of the coarse grid solve decreases and the cycle complexity with truncation approaches that without. + +### 3) OpenMP with Kokkos on CPUs + +The parallel performance of the SpGEMMs `PCAIR` uses to form much of the hierarchy can depend on the MPI decomposition. Having more unknowns per MPI rank can improve performance and scalability. + +On CPUs, rather than using many MPI ranks per CPU (e.g., one per core), hybrid MPI/OpenMP can be beneficial. This is supported in `PCAIR` through Kokkos with an OpenMP backend. This can also reduce memory use, as the halo sizes throughout the hierarchy decrease. + +## How to get faster solves with GPUs in parallel with `PCAIR` + +Several techniques can improve the solve time on GPUs. Many of these take advantage of the fact that GPUs are well suited to trading extra FLOPS for less communication and less memory. + +### 1) Many unknowns per GPU + +GPU throughput improves significantly with many unknowns per MPI rank (e.g., tens of millions per GPU). This keeps the GPU saturated and ensures good performance on the top levels of the hierarchy. However, strong scaling studies still show reductions in wall time down to around 1M DOFs/rank. + +### 2) Matrix-free smoothing + +GPUs are well suited to repeatedly applying the same matrix-vector product. Turning on the option `-pc_air_matrix_free_polys` applies the GMRES polynomial smoother matrix-free. This uses no extra memory (only the polynomial coefficients are stored) and gives a better approximation to $A_{ff}^{-1}$ than the assembled fixed-sparsity version, at the cost of more SpMVs per smooth. + +In practice, the extra SpMVs are memory-bandwidth bound rather than FLOP bound, so GPUs handle them well. 
For example, with 6th-order polynomials applied matrix-free, the per V-cycle time is only about 20% more than using the assembled approximation (which requires only one SpMV), despite requiring six SpMVs. The iteration count also typically improves. + +### 3) Truncating the hierarchy + +Truncating the hierarchy and applying a matrix-free coarse grid solver (see above) is often very effective on GPUs. GPUs are poorly suited to the small solves on the bottom levels of a multigrid hierarchy, where kernel launch overheads become significant and there is not enough work to hide communication. Truncation can give large speed-ups in the solve (e.g., 4x) and also reduces memory use. + +## Resolving out-of-memory errors with `PCAIR` + +AIR multigrid methods for asymmetric linear systems require considerable memory, particularly compared with classical multigrid methods for elliptic problems. They often require more memory than a direct LU factorisation, but the benefit is that scalable solves are possible in parallel. + +Typical storage complexity (the sum of non-zeros across all matrices needed in the solve, relative to the top grid matrix) is around 5-6 on unstructured meshes and 10-13 on structured meshes. The higher values on structured meshes are due to the slower coarsening required. For comparison, elliptic multigrid methods typically require much less. + +Besides decreasing the number of unknowns, there are several ways to decrease the memory required. + +### 1) Matrix-free smoothing + +Using `-pc_air_matrix_free_polys` avoids assembling and storing the approximate inverse on each level. This saves significant memory, as only the polynomial coefficients (a few scalars per level) need to be stored instead of a full sparse matrix. + +### 2) F-point only smoothing + +The default F-point only smoothing (`-pc_air_smooth_type f`) avoids storing $A_{cc}$ and $A_{cf}$ on each level. 
Switching to FC or FCF smoothing can nearly double the storage complexity, so only use it if the convergence improvement justifies the memory cost. + +### 3) Truncating the hierarchy + +Truncating the hierarchy (see above) reduces the total number of levels and hence the total amount of storage required. + +### 4) Processor agglomeration + +In parallel, the processor agglomeration (through ParMETIS) can require considerable memory when repartitioning. Disabling it with `-pc_air_processor_agglom 0` reduces peak memory, but can increase the run time of the setup and solve. + +Instead of disabling it entirely, `-pc_air_processor_agglom_ratio` can control at what level it is first triggered. By lowering the value (e.g., `-pc_air_processor_agglom_ratio 0.1`), processor agglomeration will only be triggered further down the hierarchy, where the coarse grid matrices are smaller and less memory is needed. + +### 5) Drop tolerances + +The drop tolerances `-pc_air_a_drop` and `-pc_air_r_drop` can be increased to make the hierarchy more sparse, but this can negatively affect convergence. + +### 6) Coarsening rate + +Faster coarsenings (higher strong threshold) give fewer levels and lower memory use. The tradeoff is that faster coarsenings can hurt convergence. diff --git a/docs/new_methods.md b/docs/new_methods.md index 33c0abb7..2a3f8181 100644 --- a/docs/new_methods.md +++ b/docs/new_methods.md @@ -45,11 +45,12 @@ There are several features used to improve the parallel performance of PCAIR [2- ### CF splittings -The CF splittings in PFLARE are used within PCAIR to form the multigrid hierarchy. They can also be called independently from PCAIR. The CF splitting type within PCAIR can be specified with ``-pc_air_cf_splitting_type``: +The CF splittings in PFLARE are used within PCAIR to form the multigrid hierarchy. 
The CF splitting type within PCAIR can be specified with ``-pc_air_cf_splitting_type``: | Command line type | Flag | Description | GPU setup | | ------------- | -- | ------------- | -- | | pmisr_ddc | CF_PMISR_DDC | Two-pass splitting giving diagonally dominant $\mathbf{A}_\textrm{ff}$ | Yes | + | diag_dom | CF_DIAG_DOM | Two-pass splitting enforcing fixed diagonal dominance ratio (set by strong_threshold) in $\mathbf{A}_\textrm{ff}$ | Yes | | pmis | CF_PMIS | PMIS method with symmetrised strength matrix | Yes | | pmis_dist2 | CF_PMIS_DIST2 | Distance 2 PMIS method with strength matrix formed by S'S + S and then symmetrised | Partial | | agg | CF_AGG | Aggregation method with root-nodes as C points. In parallel this is processor local aggregation | No | @@ -66,9 +67,6 @@ in Fortran: int :: ddc_its = 1 ! Fraction of F points to convert to C per ddc it PetscReal :: ddc_fraction = 0.1 - ! If not 0, keep doing ddc its until this diagonal dominance - ! ratio is hit - PetscReal :: max_dd_ratio = 0.0 ! As many steps as needed int :: max_luby_steps = -1 ! PMISR DDC @@ -82,7 +80,6 @@ in Fortran: algorithm, & ddc_its, & ddc_fraction, & - max_dd_ratio, & is_fine, is_coarse) or in C: @@ -94,9 +91,6 @@ or in C: int ddc_its = 1; // Fraction of F points to convert to C per ddc it PetscReal ddc_fraction = 0.1; - // If not 0, keep doing ddc its until this diagonal dominance - // ratio is hit - PetscReal max_dd_ratio = 0.0; // As many steps as needed int max_luby_steps = -1; // PMISR DDC @@ -110,7 +104,6 @@ or in C: algorithm, \ ddc_its, \ ddc_fraction, \ - max_dd_ratio, \ &is_fine, &is_coarse); or in Python with petsc4py: @@ -121,9 +114,6 @@ or in Python with petsc4py: ddc_its = 1 # Fraction of F points to convert to C per ddc it ddc_fraction = 0.1 - # If not 0, keep doing ddc its until this diagonal dominance - # ratio is hit - max_dd_ratio = 0.0 # As many steps as needed max_luby_steps = -1 # PMISR DDC @@ -131,10 +121,49 @@ or in Python with petsc4py: # Is the matrix symmetric? 
symmetric = False - [is_fine, is_coarse] = pflare.pflare_defs.compute_cf_splitting(A, \ + [is_fine, is_coarse] = pflare.compute_cf_splitting(A, \ symmetric, \ strong_threshold, max_luby_steps, \ algorithm, \ ddc_its, \ - ddc_fraction, \ - max_dd_ratio) \ No newline at end of file + ddc_fraction) + +To enforce a fixed diagonal dominance ratio, set `-pc_air_cf_splitting_type diag_dom` (or `algorithm = CF_DIAG_DOM`) and use `-pc_air_strong_threshold` as the target row-wise ratio in $\mathbf{A}_{ff}$; the section below details a convenience wrapper provided for this purpose. + +### Diagonally dominant submatrix extraction + +PFLARE also provides a standalone convenience routine to extract a diagonally dominant submatrix from a PETSc matrix, enforced to have a row-wise diagonal dominance ratio of less than `max_dd_ratio`. This works in serial, parallel and with Kokkos (and hence GPUs). + +For a row `i` of the extracted submatrix, define the row-wise ratio as + +$$ +r_i = \frac{\sum_{j \neq i} |a_{ij}|}{|a_{ii}|}. +$$ + +Then the extracted submatrix is required to satisfy + +$$ +r_i < \texttt{max\_dd\_ratio} \quad \text{for all rows } i. +$$ + +If a row has zero/missing diagonal, the ratio is treated as `0.0`. The parameter `max_dd_ratio` must satisfy `0.0 < max_dd_ratio < 1.0`. 
For example, to extract a diagonally dominant submatrix from a PETSc matrix `A`: + +in Fortran: + + type(tMat) :: A_dd + PetscReal :: max_dd_ratio = 0.5 + + call compute_diag_dom_submatrix(A, max_dd_ratio, A_dd) + +or in C: + + Mat A_dd; + PetscReal max_dd_ratio = 0.5; + + compute_diag_dom_submatrix(A, max_dd_ratio, &A_dd); + +or in Python with petsc4py: + + max_dd_ratio = 0.5 + + A_dd = pflare.compute_diag_dom_submatrix(A, max_dd_ratio) \ No newline at end of file diff --git a/docs/options.md b/docs/options.md index ba34ec59..8b4029ad 100644 --- a/docs/options.md +++ b/docs/options.md @@ -45,10 +45,9 @@ All options can be set either through command line arguments or programmatically | ------------- | -- | ------------- | --- | | ``-pc_air_cf_splitting_type`` | PCAIRGetCFSplittingType PCAIRSetCFSplittingType | The type of CF splitting to use, given above | pmisr_ddc | | ``-pc_air_strong_threshold`` | PCAIRGetStrongThreshold PCAIRSetStrongThreshold | The strong threshold to use in the CF splitting | 0.5 | - | ``-pc_air_max_luby_steps`` | PCAIRGetMaxLubySteps PCAIRSetMaxLubySteps | If using CF splitting type pmisr_ddc, pmis, or pmis_dist2, this is the maximum number of Luby steps to use. If negative, use as many steps as necessary | -1 | + | ``-pc_air_max_luby_steps`` | PCAIRGetMaxLubySteps PCAIRSetMaxLubySteps | If using CF splitting type pmisr_ddc, diag_dom, pmis, or pmis_dist2, this is the maximum number of Luby steps to use. If negative, use as many steps as necessary | -1 | | ``-pc_air_ddc_its`` | PCAIRGetDDCIts PCAIRSetDDCIts | If using CF splitting type pmisr_ddc, this is the number of iterations of DDC performed | 1 | | ``-pc_air_ddc_fraction`` | PCAIRGetDDCFraction PCAIRSetDDCFraction | If using CF splitting type pmisr_ddc, this is the local fraction of F points to convert to C points based on diagonal dominance. 
If negative, any row which has a diagonal dominance ratio less than the absolute value will be converted from F to C | 0.1 | - | ``-pc_air_max_dd_ratio`` | PCAIRGetMaxDDRatio PCAIRSetMaxDDRatio | If using CF splitting type pmisr_ddc, do as many DDC iterations as necessary to hit this diagonal dominance ratio. If 0.0 do the number in -pc_air_ddc_its | 0.0 | #### Approximate inverse options diff --git a/include/kokkos_helper.hpp b/include/kokkos_helper.hpp index 01072626..0c9d9b6b 100644 --- a/include/kokkos_helper.hpp +++ b/include/kokkos_helper.hpp @@ -33,12 +33,19 @@ using ViewPetscIntPtr = std::shared_ptr; PETSC_INTERN void mat_duplicate_copy_plus_diag_kokkos(Mat *, int, Mat *); PETSC_INTERN void rewrite_j_global_to_local(PetscInt, PetscInt&, PetscIntKokkosView, PetscInt**); +PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, PetscIntKokkosView &is_local_d); +PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int); +PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int); +PETSC_INTERN void copy_diag_dom_ratio_d2h(PetscReal *diag_dom_ratio_local); +PETSC_INTERN void delete_device_diag_dom_ratio(); // Define array of shared pointers representing fine and coarse IS's // on each level on the device extern ViewPetscIntPtr* IS_fine_views_local; extern ViewPetscIntPtr* IS_coarse_views_local; extern int max_levels; +extern intKokkosView cf_markers_local_d; +extern PetscScalarKokkosView diag_dom_ratio_local_d; // ~~~~~~~~~~~~~~~~~~ // Some custom reductions we use diff --git a/include/pflare.h b/include/pflare.h index c12e7e11..2a9d303d 100644 --- a/include/pflare.h +++ b/include/pflare.h @@ 
-21,6 +21,7 @@ typedef enum { } PCAIRZType; typedef enum { CF_PMISR_DDC, + CF_DIAG_DOM, CF_PMIS, CF_PMIS_DIST2, CF_AGG, @@ -41,7 +42,8 @@ typedef enum { PETSC_EXTERN void PCRegister_PFLARE(); /* Can call the CF splitting separate to everything */ -PETSC_EXTERN void compute_cf_splitting(Mat, int, double, int, int, int, double, double, IS*, IS*); +PETSC_EXTERN void compute_cf_splitting(Mat, int, double, int, int, int, double, IS*, IS*); +PETSC_EXTERN void compute_diag_dom_submatrix(Mat, double, Mat*); /* Define PCPFLAREINV get routines */ PETSC_EXTERN PetscErrorCode PCPFLAREINVGetPolyOrder(PC, PetscInt *); @@ -74,7 +76,6 @@ PETSC_EXTERN PetscErrorCode PCAIRGetProcessEqLimit(PC, PetscInt *); PETSC_EXTERN PetscErrorCode PCAIRGetSubcomm(PC, PetscBool *); PETSC_EXTERN PetscErrorCode PCAIRGetStrongThreshold(PC, PetscReal *); PETSC_EXTERN PetscErrorCode PCAIRGetDDCIts(PC, PetscInt *); -PETSC_EXTERN PetscErrorCode PCAIRGetMaxDDRatio(PC, PetscReal *); PETSC_EXTERN PetscErrorCode PCAIRGetDDCFraction(PC, PetscReal *); PETSC_EXTERN PetscErrorCode PCAIRGetCFSplittingType(PC, CFSplittingType *); PETSC_EXTERN PetscErrorCode PCAIRGetMaxLubySteps(PC, PetscInt *); @@ -123,7 +124,6 @@ PETSC_EXTERN PetscErrorCode PCAIRSetProcessEqLimit(PC, PetscInt); PETSC_EXTERN PetscErrorCode PCAIRSetSubcomm(PC, PetscBool); PETSC_EXTERN PetscErrorCode PCAIRSetStrongThreshold(PC, PetscReal); PETSC_EXTERN PetscErrorCode PCAIRSetDDCIts(PC, PetscInt); -PETSC_EXTERN PetscErrorCode PCAIRSetMaxDDRatio(PC, PetscReal); PETSC_EXTERN PetscErrorCode PCAIRSetDDCFraction(PC, PetscReal); PETSC_EXTERN PetscErrorCode PCAIRSetCFSplittingType(PC, CFSplittingType); PETSC_EXTERN PetscErrorCode PCAIRSetMaxLubySteps(PC, PetscInt); diff --git a/notebooks/03_cf_splitting.ipynb b/notebooks/03_cf_splitting.ipynb index 3c80c583..b2ab1250 100644 --- a/notebooks/03_cf_splitting.ipynb +++ b/notebooks/03_cf_splitting.ipynb @@ -116,7 +116,7 @@ "outputs": [], "source": [ "def run_cf_splitting(A, strong_threshold=0.5, ddc_its=1):\n", 
- " is_fine, is_coarse = pflare.pflare_defs.compute_cf_splitting(\n", + " is_fine, is_coarse = pflare.compute_cf_splitting(\n", " A,\n", " False,\n", " strong_threshold,\n", @@ -124,7 +124,6 @@ " pflare.CF_PMISR_DDC,\n", " ddc_its,\n", " 0.1,\n", - " 0.0,\n", " )\n", " f_idx = set(is_fine.getIndices().tolist())\n", " c_idx = set(is_coarse.getIndices().tolist())\n", @@ -364,7 +363,7 @@ "n2 = A2.getSize()[0]\n", "\n", "is_fine2, is_coarse2 = pflare.pflare_defs.compute_cf_splitting(\n", - " A2, False, 0.5, -1, pflare.CF_PMISR_DDC, 1, 0.1, 0.0,\n", + " A2, False, 0.5, -1, pflare.CF_PMISR_DDC, 1, 0.1,\n", " )\n", "f2 = set(is_fine2.getIndices().tolist())\n", "c2 = set(is_coarse2.getIndices().tolist())\n", @@ -668,6 +667,46 @@ " pass" ] }, + { + "cell_type": "markdown", + "id": "20f48af0", + "metadata": {}, + "source": [ + "## Targeting a specific DD ratio with `CF_DIAG_DOM`\n", + "\n", + "To enforce a specific diagonal-dominance target in CF splitting, use `CF_DIAG_DOM` and set `strong_threshold` to the desired worst-row DD ratio.\n", + "\n", + "The tradeoff is cost: this can require more setup work than a fixed number of PMISR-DDC cleanup iterations.\n", + "\n", + "For direct extraction of a diagonally-dominant submatrix from a matrix, PFLARE also provides the convenience wrapper `compute_diag_dom_submatrix`, which takes a `max_dd_ratio` argument.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "591370d1", + "metadata": {}, + "outputs": [], + "source": [ + "# Demonstrate the convenience wrapper and verify the achieved DD ratio\n", + "target_max_dd_ratio = 0.35\n", + "\n", + "A2_dd = pflare.compute_diag_dom_submatrix(A2, target_max_dd_ratio)\n", + "n_dd = A2_dd.getSize()[0]\n", + "\n", + "# Reuse the notebook's existing DD-ratio helper on the full extracted matrix.\n", + "all_dd_rows = set(range(n_dd))\n", + "dd_vals = compute_aff_dd_ratios(A2_dd, all_dd_rows)\n", + "max_observed_dd = float(np.max(dd_vals)) if dd_vals.size else 0.0\n", 
+ "\n", + "print(f\"Target max_dd_ratio = {target_max_dd_ratio:.6e}\")\n", + "print(f\"Observed max row dd ratio = {max_observed_dd:.6e}\")\n", + "print(f\"Extracted submatrix size = {n_dd} x {n_dd}\")\n", + "\n", + "A2_dd.destroy()" + ] + }, { "cell_type": "markdown", "id": "cd1c4184", @@ -676,6 +715,7 @@ "## Summary\n", "\n", "- Decreasing `strong_threshold` or increasing `ddc_its` (and `ddc_fraction`) reduces the coarsening rate but yields a more diagonally dominant $A_{ff}$.\n", + "- `CF_DIAG_DOM` uses `strong_threshold` to enforce a target diagonal dominance ratio during CF splitting.\n", "- Enhancing the diagonal dominance of $A_{ff}$ directly improves the convergence of GMRES polynomial solvers.\n", "\n", "## 8. What's next?\n", diff --git a/python/Makefile b/python/Makefile index 07f5c554..54edefba 100644 --- a/python/Makefile +++ b/python/Makefile @@ -55,9 +55,9 @@ run_tests: echo "Test single level GMRES polynomial preconditioning with Python in parallel"; \ $(MPIEXEC) -n 2 $(PYTHON) ex2.py -pc_type pflareinv -pc_pflareinv_type power; \ echo ""; \ - echo "Test PMISR DDC CF splitting with Python"; \ + echo "Test PMISR DDC CF splitting and diagonal dominance extract with Python"; \ $(PYTHON) ex2_cf_splitting.py; \ - echo "Test PMISR DDC CF splitting with Python in parallel"; \ + echo "Test PMISR DDC CF splitting and diagonal dominance extract with Python in parallel"; \ $(MPIEXEC) -n 2 $(PYTHON) ex2_cf_splitting.py; \ echo ""; \ echo "Test PCAIR direct option set/get API with Python"; \ diff --git a/python/ex2_cf_splitting.py b/python/ex2_cf_splitting.py index f86c317f..a0fa137f 100644 --- a/python/ex2_cf_splitting.py +++ b/python/ex2_cf_splitting.py @@ -157,9 +157,6 @@ ddc_its = 1 # Fraction of F points to convert to C per ddc it ddc_fraction = 0.1 -# If not 0, keep doing ddc its until this diagonal dominance -# ratio is hit -max_dd_ratio = 0.0 # As many steps as needed max_luby_steps = -1 # PMISR DDC @@ -167,13 +164,12 @@ # Is the matrix symmetric? 
symmetric = False -[is_fine, is_coarse] = pflare.pflare_defs.compute_cf_splitting(A,\ +[is_fine, is_coarse] = pflare.compute_cf_splitting(A,\ symmetric,\ strong_threshold, max_luby_steps,\ algorithm,\ ddc_its, \ - ddc_fraction, \ - max_dd_ratio) + ddc_fraction) # ~~~~~~~~~~~~~~~ # ~~~~~~~~~~~~~~~ @@ -187,4 +183,11 @@ PETSc.Sys.Print("- OK",comm=comm) else: PETSc.Sys.Print("- NOT OK",comm=comm) - sys.exit(1) \ No newline at end of file + sys.exit(1) + +# ~~~~~~~~~~~~~~ +# Compute a diagonally dominant submatrix +# ~~~~~~~~~~~~~~ + +A_dd = pflare.compute_diag_dom_submatrix(A, 0.5) +A_dd.destroy() \ No newline at end of file diff --git a/python/pflare.py b/python/pflare.py index 736f4d56..f7b962c9 100644 --- a/python/pflare.py +++ b/python/pflare.py @@ -5,10 +5,11 @@ # CF splitting type constants CF_PMISR_DDC = 0 -CF_PMIS = 1 -CF_PMIS_DIST2 = 2 -CF_AGG = 3 -CF_PMIS_AGG = 4 +CF_DIAG_DOM = 1 +CF_PMIS = 2 +CF_PMIS_DIST2 = 3 +CF_AGG = 4 +CF_PMIS_AGG = 5 # Approximate inverse type constants (PCPFLAREINVType / PCAIRInverseType) PFLAREINV_POWER = 0 @@ -33,6 +34,10 @@ COEFFS_INV_ACC = 2 # Inverse of the coarse-coarse block A_cc COEFFS_INV_COARSE = 3 # Inverse on the coarsest grid +# Standalone matrix utility wrappers +compute_cf_splitting = pflare_defs.compute_cf_splitting +compute_diag_dom_submatrix = pflare_defs.compute_diag_dom_submatrix + # ----------------------------------------------------------------------- # PCAIR Get functions # The exact Python names for all pflare functions are listed here in @@ -52,7 +57,6 @@ pcair_get_subcomm = pflare_defs.pcair_get_subcomm pcair_get_strong_threshold = pflare_defs.pcair_get_strong_threshold pcair_get_ddc_its = pflare_defs.pcair_get_ddc_its -pcair_get_max_dd_ratio = pflare_defs.pcair_get_max_dd_ratio pcair_get_ddc_fraction = pflare_defs.pcair_get_ddc_fraction pcair_get_cf_splitting_type = pflare_defs.pcair_get_cf_splitting_type pcair_get_max_luby_steps = pflare_defs.pcair_get_max_luby_steps @@ -104,7 +108,6 @@ pcair_set_subcomm = 
pflare_defs.pcair_set_subcomm pcair_set_strong_threshold = pflare_defs.pcair_set_strong_threshold pcair_set_ddc_its = pflare_defs.pcair_set_ddc_its -pcair_set_max_dd_ratio = pflare_defs.pcair_set_max_dd_ratio pcair_set_ddc_fraction = pflare_defs.pcair_set_ddc_fraction pcair_set_cf_splitting_type = pflare_defs.pcair_set_cf_splitting_type pcair_set_max_luby_steps = pflare_defs.pcair_set_max_luby_steps diff --git a/python/pflare_defs.pyx b/python/pflare_defs.pyx index 35fc3d7a..0d2ca3fe 100644 --- a/python/pflare_defs.pyx +++ b/python/pflare_defs.pyx @@ -18,7 +18,8 @@ cdef extern from "petsc.h": cdef extern: void PCRegister_PFLARE() - void compute_cf_splitting_c(PetscMat *A, int symmetric_int, double strong_threshold, int max_luby_steps, int cf_splitting_type, int ddc_its, double fraction_swap, double max_dd_ratio, PetscIS* is_fine, PetscIS* is_coarse) + void compute_cf_splitting_c(PetscMat *A, int symmetric_int, double strong_threshold, int max_luby_steps, int cf_splitting_type, int ddc_its, double fraction_swap, PetscIS* is_fine, PetscIS* is_coarse) + void compute_diag_dom_submatrix_c(PetscMat *A, double max_dd_ratio, PetscMat *output_mat) # ----------------------------------------------------------------------- # PCAIR Get routines @@ -39,7 +40,6 @@ cdef extern: void PCAIRGetSubcomm_c(PetscPC *pc, unsigned char *subcomm) void PCAIRGetStrongThreshold_c(PetscPC *pc, PetscReal *thresh) void PCAIRGetDDCIts_c(PetscPC *pc, PetscInt *its) - void PCAIRGetMaxDDRatio_c(PetscPC *pc, PetscReal *ratio) void PCAIRGetDDCFraction_c(PetscPC *pc, PetscReal *frac) void PCAIRGetCFSplittingType_c(PetscPC *pc, int *algo) void PCAIRGetMaxLubySteps_c(PetscPC *pc, PetscInt *steps) @@ -97,7 +97,6 @@ cdef extern: void PCAIRSetSubcomm_c(PetscPC *pc, int subcomm) void PCAIRSetStrongThreshold_c(PetscPC *pc, PetscReal thresh) void PCAIRSetDDCIts_c(PetscPC *pc, PetscInt its) - void PCAIRSetMaxDDRatio_c(PetscPC *pc, PetscReal ratio) void PCAIRSetDDCFraction_c(PetscPC *pc, PetscReal frac) void 
PCAIRSetCFSplittingType_c(PetscPC *pc, int algo) void PCAIRSetMaxLubySteps_c(PetscPC *pc, PetscInt steps) @@ -167,14 +166,20 @@ cdef extern: cpdef py_PCRegister_PFLARE(): PCRegister_PFLARE() -cpdef compute_cf_splitting(Mat A, bint symmetric, double strong_threshold, int max_luby_steps, int cf_splitting_type, int ddc_its, double fraction_swap, double max_dd_ratio): +cpdef compute_cf_splitting(Mat A, bint symmetric, double strong_threshold, int max_luby_steps, int cf_splitting_type, int ddc_its, double fraction_swap): cdef IS is_fine cdef IS is_coarse is_fine = IS() is_coarse = IS() - compute_cf_splitting_c(&(A.mat), symmetric, strong_threshold, max_luby_steps, cf_splitting_type, ddc_its, fraction_swap, max_dd_ratio, &(is_fine.iset), &(is_coarse.iset)) + compute_cf_splitting_c(&(A.mat), symmetric, strong_threshold, max_luby_steps, cf_splitting_type, ddc_its, fraction_swap, &(is_fine.iset), &(is_coarse.iset)) return is_fine, is_coarse +cpdef compute_diag_dom_submatrix(Mat A, double max_dd_ratio): + cdef Mat output_mat + output_mat = Mat() + compute_diag_dom_submatrix_c(&(A.mat), max_dd_ratio, &(output_mat.mat)) + return output_mat + # ----------------------------------------------------------------------- # PCAIR Get wrappers # ----------------------------------------------------------------------- @@ -245,11 +250,6 @@ cpdef int pcair_get_ddc_its(PC pc): PCAIRGetDDCIts_c(&(pc.pc), &result) return result -cpdef double pcair_get_max_dd_ratio(PC pc): - cdef PetscReal result = 0.0 - PCAIRGetMaxDDRatio_c(&(pc.pc), &result) - return result - cpdef double pcair_get_ddc_fraction(PC pc): cdef PetscReal result = 0.0 PCAIRGetDDCFraction_c(&(pc.pc), &result) @@ -491,9 +491,6 @@ cpdef pcair_set_strong_threshold(PC pc, double thresh): cpdef pcair_set_ddc_its(PC pc, int its): PCAIRSetDDCIts_c(&(pc.pc), its) -cpdef pcair_set_max_dd_ratio(PC pc, double ratio): - PCAIRSetMaxDDRatio_c(&(pc.pc), ratio) - cpdef pcair_set_ddc_fraction(PC pc, double frac): PCAIRSetDDCFraction_c(&(pc.pc), 
frac) diff --git a/python/run_parse_tests.py b/python/run_parse_tests.py index 22e8a166..605c6dfa 100644 --- a/python/run_parse_tests.py +++ b/python/run_parse_tests.py @@ -53,12 +53,13 @@ 3, ), ( - "adv_diff_fd max_dd_ratio 0.9 (8x8)", + "adv_diff_fd diag_dom threshold 0.9 (8x8)", [ "./adv_diff_fd", "-da_grid_x", "8", "-da_grid_y", "8", "-pc_type", "air", "-ksp_max_it", "3", - "-pc_air_max_dd_ratio", "0.9", + "-pc_air_cf_splitting_type", "diag_dom", + "-pc_air_strong_threshold", "0.9", "-ksp_monitor", "-pc_air_print_stats_timings", "-second_solve", diff --git a/src/AIR_Data_Type.F90 b/src/AIR_Data_Type.F90 index 5684cb3f..87a50131 100644 --- a/src/AIR_Data_Type.F90 +++ b/src/AIR_Data_Type.F90 @@ -87,9 +87,6 @@ module air_data_type ! How many passes of DDC to do ! -pc_air_ddc_its integer :: ddc_its = 1 - ! The max diagonal dominace ratio we want in ddc iterations - ! -pc_air_max_dd_ratio - PetscReal :: max_dd_ratio = 0.0 ! Second pass in the PMISR DDC CF splitting converts ! this fraction of local F points to C based on diagonal dominance ! 
-pc_air_ddc_fraction diff --git a/src/AIR_Data_Type_Routines.F90 b/src/AIR_Data_Type_Routines.F90 index d6ce2561..10cbb14d 100644 --- a/src/AIR_Data_Type_Routines.F90 +++ b/src/AIR_Data_Type_Routines.F90 @@ -315,7 +315,6 @@ subroutine destroy_air_data(air_data) air_data%options%strong_threshold = 0.5 air_data%options%ddc_its = 1 air_data%options%ddc_fraction = 0.1 - air_data%options%max_dd_ratio = 0.0 air_data%options%cf_splitting_type = 0 air_data%options%max_luby_steps = -1 diff --git a/src/AIR_MG_Setup.F90 b/src/AIR_MG_Setup.F90 index ba44c4a0..244c9788 100644 --- a/src/AIR_MG_Setup.F90 +++ b/src/AIR_MG_Setup.F90 @@ -273,7 +273,6 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) air_data%options%cf_splitting_type, & air_data%options%ddc_its, & air_data%options%ddc_fraction, & - air_data%options%max_dd_ratio, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level)) air_data%allocated_is(our_level) = .TRUE. end if diff --git a/src/CF_Splitting.F90 b/src/CF_Splitting.F90 index 1a8228bc..a566e4d3 100644 --- a/src/CF_Splitting.F90 +++ b/src/CF_Splitting.F90 @@ -2,8 +2,10 @@ module cf_splitting use petscmat use pflare_parameters, only: C_POINT, F_POINT - use pmisr_ddc, only: pmisr, ddc - use c_petsc_interfaces, only: create_cf_is_kokkos, delete_device_cf_markers + use pmisr_module, only: pmisr + use ddc_module, only: ddc + use sabs, only: generate_sabs + use c_petsc_interfaces, only: create_cf_is_kokkos, delete_device_cf_markers, delete_device_diag_dom_ratio use aggregation, only: generate_serial_aggregation use petsc_helper, only: MatAXPYWrapper, MatSetAllValues, kokkos_debug, remove_small_from_sparse @@ -13,10 +15,11 @@ module cf_splitting public PetscEnum, parameter :: CF_PMISR_DDC=0 - PetscEnum, parameter :: CF_PMIS=1 - PetscEnum, parameter :: CF_PMIS_DIST2=2 - PetscEnum, parameter :: CF_AGG=3 - PetscEnum, parameter :: CF_PMIS_AGG=4 + PetscEnum, parameter :: CF_DIAG_DOM=1 + PetscEnum, parameter :: CF_PMIS=2 + PetscEnum, parameter :: 
CF_PMIS_DIST2=3 + PetscEnum, parameter :: CF_AGG=4 + PetscEnum, parameter :: CF_PMIS_AGG=5 contains @@ -77,126 +80,6 @@ subroutine create_cf_is(input_mat, cf_markers_local, is_fine, is_coarse) end subroutine create_cf_is -!------------------------------------------------------------------------------------------------------------------------ - - subroutine generate_sabs(input_mat, strong_threshold, symmetrize, square, output_mat, allow_drop_diagonal) - - ! Generate strength of connection matrix with absolute value - ! Output has no diagonal entries - - ! ~~~~~~~~~~ - ! Input - type(tMat), intent(in) :: input_mat - type(tMat), intent(inout) :: output_mat - PetscReal, intent(in) :: strong_threshold - logical, intent(in) :: symmetrize, square - logical, intent(in), optional :: allow_drop_diagonal - - PetscInt :: ifree - PetscInt :: local_rows, local_cols, global_rows, global_cols - PetscInt :: global_row_start, global_row_end_plus_one - PetscInt :: global_col_start, global_col_end_plus_one, counter - integer :: errorcode, comm_size - PetscErrorCode :: ierr - PetscInt, parameter :: nz_ignore = -1, one=1, zero=0 - MPIU_Comm :: MPI_COMM_MATRIX - type(tMat) :: transpose_mat - type(tIS) :: zero_diags - PetscInt, dimension(:), pointer :: zero_diags_pointer - logical :: drop_diag - - ! ~~~~~~~~~~ - - drop_diag = .TRUE. - if (present(allow_drop_diagonal)) drop_diag = allow_drop_diagonal - - call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) - ! Get the comm size - call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) - - ! Get the local sizes - call MatGetLocalSize(input_mat, local_rows, local_cols, ierr) - call MatGetSize(input_mat, global_rows, global_cols, ierr) - ! This returns the global index of the local portion of the matrix - call MatGetOwnershipRange(input_mat, global_row_start, global_row_end_plus_one, ierr) - call MatGetOwnershipRangeColumn(input_mat, global_col_start, global_col_end_plus_one, ierr) - - ! 
Drop entries smaller than the strong_threshold, with a relative tolerance measured - ! against the biggest abs non-diagonal entry, don't lump and always drop the diagonal - call remove_small_from_sparse(input_mat, strong_threshold, output_mat, & - relative_max_row_tol_int = -1, lump=.FALSE., drop_diagonal_int=-1) - - ! Now symmetrize if desired - if (symmetrize) then - - ! We could just do a symbolic transpose and add the two sets of indices together, - ! but its so much simpler to just add the two together - and the symbolic will be the expensive part - ! anyway - call MatTranspose(output_mat, MAT_INITIAL_MATRIX, transpose_mat, ierr) - ! Kokkos + MPI doesn't have a gpu mataxpy yet, so we have a wrapper around our own version - call MatAXPYWrapper(output_mat, 1d0, transpose_mat) - - ! Don't forget to destroy the explicit transpose - call MatDestroy(transpose_mat, ierr) - - end if - - ! Square the strength matrix to aggressively coarsen (gives a distance 2 MIS) - if (square) then - - if (symmetrize) then - call MatMatMult(output_mat, output_mat, & - MAT_INITIAL_MATRIX, 1d0, transpose_mat, ierr) - else - call MatTransposeMatMult(output_mat, output_mat, & - MAT_INITIAL_MATRIX, 1d0, transpose_mat, ierr) - endif - - ! Also have to add in the original distance 1 connections to the square - ! as the dist 1 strength matrix has had the diagonals removed, so the square won't - ! have the dist 1 connetions in it - call MatAXPYWrapper(transpose_mat, 1d0, output_mat) - call MatDestroy(output_mat, ierr) - - ! Can end up with diagonal entries we have to remove - ! Let's get the diagonals that are zero or unassigned - call MatFindZeroDiagonals(transpose_mat, zero_diags, ierr) - call ISGetIndices(zero_diags, zero_diags_pointer, ierr) - ! Then let's just set every other row to have a zero diagonal - ! as we know they're already preallocated - counter = 1 - do ifree = 1, local_rows - - if (counter .le. size(zero_diags_pointer)) then - ! 
Skip over any rows that don't have diagonals or are already zero - if (zero_diags_pointer(counter) - global_row_start + 1 == ifree) then - counter = counter + 1 - cycle - end if - end if - - ! Set the diagonal to 0 - call MatSetValue(transpose_mat, ifree - 1 + global_row_start, ifree - 1 + global_row_start, 0d0, INSERT_VALUES, ierr) - end do - - call ISRestoreIndices(zero_diags, zero_diags_pointer, ierr) - - call MatAssemblyBegin(transpose_mat, MAT_FINAL_ASSEMBLY, ierr) - call MatAssemblyEnd(transpose_mat, MAT_FINAL_ASSEMBLY, ierr) - - ! Could call MatEliminateZeros in later versions of petsc, but for here - ! given we know the entries are ==1, we will just create a copy with "small" stuff removed - ! ie the zero diagonal - call remove_small_from_sparse(transpose_mat, 1d-100, output_mat, drop_diagonal_int = 1) - call MatDestroy(transpose_mat, ierr) - - end if - - ! Reset the entries in the strength matrix back to 1 - if (symmetrize .OR. square) call MatSetAllValues(output_mat, 1d0) - - end subroutine generate_sabs - ! ------------------------------------------------------------------------------------------------------------------------------- subroutine first_pass_splitting(input_mat, symmetric, strong_threshold, max_luby_steps, cf_splitting_type, cf_markers_local) @@ -252,6 +135,16 @@ subroutine first_pass_splitting(input_mat, symmetric, strong_threshold, max_luby ! Note we are symmetrizing the strength matrix here call generate_sabs(input_mat, strong_threshold, .TRUE., .FALSE., strength_mat) + else if (cf_splitting_type == CF_DIAG_DOM) then + ! Only symmetrize if not already symmetric + + ! Tried to generate a strength matrix based on the relative size compared to the + ! diagonal, but it produces a worse initial coarsening when fed to PMISR, making + ! the DDC cleanup take a lot more work + !call generate_sabs(input_mat, strong_threshold, .NOT. symmetric, .FALSE., strength_mat, & + ! allow_diag_strength = .TRUE.) 
+ call generate_sabs(input_mat, strong_threshold, .NOT. symmetric, .FALSE., strength_mat) + ! PMISR DDC and Aggregation else ! Only symmetrize if not already symmetric @@ -263,7 +156,7 @@ subroutine first_pass_splitting(input_mat, symmetric, strong_threshold, max_luby ! ~~~~~~~~~~~~ ! PMISR - if (cf_splitting_type == CF_PMISR_DDC) then + if (cf_splitting_type == CF_PMISR_DDC .OR. cf_splitting_type == CF_DIAG_DOM) then call pmisr(strength_mat, max_luby_steps, .FALSE., cf_markers_local) @@ -338,7 +231,7 @@ end subroutine first_pass_splitting subroutine compute_cf_splitting(input_mat, symmetric, & strong_threshold, max_luby_steps, & - cf_splitting_type, ddc_its, fraction_swap, max_dd_ratio, & + cf_splitting_type, ddc_its, fraction_swap, & is_fine, is_coarse) ! Computes a CF splitting and returns the F and C point ISs @@ -346,7 +239,7 @@ subroutine compute_cf_splitting(input_mat, symmetric, & ! ~~~~~~ type(tMat), target, intent(in) :: input_mat logical, intent(in) :: symmetric - PetscReal, intent(in) :: strong_threshold, max_dd_ratio + PetscReal, intent(in) :: strong_threshold integer, intent(in) :: max_luby_steps, cf_splitting_type, ddc_its PetscReal, intent(in) :: fraction_swap type(tIS), intent(inout) :: is_fine, is_coarse @@ -396,23 +289,25 @@ subroutine compute_cf_splitting(input_mat, symmetric, & ! Only do the DDC pass if we're doing PMISR_DDC ! and if we haven't requested an exact independent set, ie strong threshold is not zero ! as this gives diagonal Aff) - if (strong_threshold /= 0d0 .AND. cf_splitting_type == CF_PMISR_DDC) then + if (strong_threshold /= 0d0 .AND. & + (cf_splitting_type == CF_PMISR_DDC .OR. cf_splitting_type == CF_DIAG_DOM)) then - ! Do a set number of ddc iterations, unless we are aiming for a set diagonal - ! dominance ratio, in which case we do as many iterations as necessary + ! Do a set number of DDC iterations for PMISR_DDC. + ! For CF_DIAG_DOM, iterate until the requested strong_threshold ratio is reached. 
ddc_its_max = ddc_its - if (max_dd_ratio > 0) ddc_its_max = huge(ddc_its_max) + if (cf_splitting_type == CF_DIAG_DOM) ddc_its_max = huge(ddc_its_max) ddc_its_loop: do its = 1, ddc_its_max ! Do the second pass cleanup - this will directly modify the values in cf_markers_local ! (or the equivalent device cf_markers, is_fine is ignored if on the device) - max_dd_ratio_achieved = max_dd_ratio + max_dd_ratio_achieved = 0d0 + if (cf_splitting_type == CF_DIAG_DOM) max_dd_ratio_achieved = strong_threshold call ddc(input_mat, is_fine, fraction_swap, max_dd_ratio_achieved, cf_markers_local) ! If we did anything in our ddc second pass and hence need to rebuild ! the is_fine and is_coarse - if ((fraction_swap /= 0d0 .OR. max_dd_ratio_achieved /= max_dd_ratio) & + if ((fraction_swap /= 0d0 .OR. max_dd_ratio_achieved /= 0d0) & .AND. need_intermediate_is) then ! These are now outdated @@ -424,7 +319,7 @@ subroutine compute_cf_splitting(input_mat, symmetric, & end if ! Terminate if we've reached the ratio - if (max_dd_ratio > 0 .AND. max_dd_ratio_achieved < max_dd_ratio) exit ddc_its_loop + if (cf_splitting_type == CF_DIAG_DOM .AND. max_dd_ratio_achieved < strong_threshold) exit ddc_its_loop end do ddc_its_loop end if @@ -443,6 +338,7 @@ subroutine compute_cf_splitting(input_mat, symmetric, & if (cf_splitting_type == CF_PMIS_AGG) then ! Destroys the device cf_markers_local call delete_device_cf_markers() + call delete_device_diag_dom_ratio() end if ! Aggregation is not on the device at all @@ -483,6 +379,7 @@ subroutine compute_cf_splitting(input_mat, symmetric, & ! 
Destroys the device cf_markers_local call delete_device_cf_markers() + call delete_device_diag_dom_ratio() end if end if diff --git a/src/C_Fortran_Bindings.F90 b/src/C_Fortran_Bindings.F90 index c0ab6983..4c005ef1 100644 --- a/src/C_Fortran_Bindings.F90 +++ b/src/C_Fortran_Bindings.F90 @@ -6,6 +6,7 @@ module c_fortran_bindings use pcair_shell, only: PCReset_AIR_Shell, create_pc_air_shell use approx_inverse_setup, only: calculate_and_build_approximate_inverse, reset_inverse_mat use cf_splitting, only: compute_cf_splitting + use matdiagdomsubmatrix, only: compute_diag_dom_submatrix use air_data_type_routines, only: create_air_data #include "petsc/finclude/petscksp.h" @@ -219,7 +220,7 @@ end subroutine reset_inverse_mat_c subroutine compute_cf_splitting_c(input_mat_ptr, symmetric_int, & strong_threshold, max_luby_steps, & - cf_splitting_type, ddc_its, fraction_swap, max_dd_ratio, & + cf_splitting_type, ddc_its, fraction_swap, & is_fine_ptr, is_coarse_ptr) & bind(C,name='compute_cf_splitting_c') @@ -228,7 +229,7 @@ subroutine compute_cf_splitting_c(input_mat_ptr, symmetric_int, & ! ~~~~~~~~ integer(c_long_long), intent(in) :: input_mat_ptr integer(c_int), value, intent(in) :: symmetric_int, max_luby_steps, cf_splitting_type, ddc_its - real(c_double), value, intent(in) :: strong_threshold, fraction_swap, max_dd_ratio + real(c_double), value, intent(in) :: strong_threshold, fraction_swap integer(c_long_long), intent(inout) :: is_fine_ptr, is_coarse_ptr type(tMat) :: input_mat @@ -244,7 +245,7 @@ subroutine compute_cf_splitting_c(input_mat_ptr, symmetric_int, & if (symmetric_int == 1) symmetric = .TRUE. call compute_cf_splitting(input_mat, symmetric, & strong_threshold, max_luby_steps, & - cf_splitting_type, ddc_its, fraction_swap, max_dd_ratio, & + cf_splitting_type, ddc_its, fraction_swap, & is_fine, is_coarse) ! 
Pass out the IS's @@ -255,5 +256,30 @@ end subroutine compute_cf_splitting_c !------------------------------------------------------------------------------------------------------------------------ + subroutine compute_diag_dom_submatrix_c(input_mat_ptr, max_dd_ratio, output_mat_ptr) & + bind(C,name='compute_diag_dom_submatrix_c') + + ! Computes a diagonally dominant submatrix + + ! ~~~~~~~~ + integer(c_long_long), intent(in) :: input_mat_ptr + real(c_double), value, intent(in) :: max_dd_ratio + integer(c_long_long), intent(inout) :: output_mat_ptr + + type(tMat) :: input_mat, output_mat + ! ~~~~~~~~ + + ! Copy the input matrix pointer into the Fortran PETSc handle wrapper + input_mat%v = input_mat_ptr + + call compute_diag_dom_submatrix(input_mat, max_dd_ratio, output_mat) + + ! Pass out the resulting submatrix handle + output_mat_ptr = output_mat%v + + end subroutine compute_diag_dom_submatrix_c + + !------------------------------------------------------------------------------------------------------------------------ + end module c_fortran_bindings diff --git a/src/C_PETSc_Interfaces.F90 b/src/C_PETSc_Interfaces.F90 index 2619d6e5..c91ddd86 100644 --- a/src/C_PETSc_Interfaces.F90 +++ b/src/C_PETSc_Interfaces.F90 @@ -286,7 +286,8 @@ end subroutine generate_identity_is_kokkos interface subroutine remove_small_from_sparse_kokkos(A_array, tol, B_array, & - relative_max_row_tolerance_int, lump_int, allow_drop_diagonal_int) & + relative_max_row_tolerance_int, lump_int, allow_drop_diagonal_int, & + allow_diag_strength_int) & bind(c, name="remove_small_from_sparse_kokkos") use iso_c_binding integer(c_long_long) :: A_array @@ -294,7 +295,7 @@ subroutine remove_small_from_sparse_kokkos(A_array, tol, B_array, & integer(c_long_long) :: B_array integer(c_int), value :: relative_max_row_tolerance_int integer(c_int), value :: lump_int - integer(c_int), value :: allow_drop_diagonal_int + integer(c_int), value :: allow_drop_diagonal_int, allow_diag_strength_int end subroutine 
remove_small_from_sparse_kokkos end interface @@ -404,12 +405,28 @@ end subroutine pmisr_kokkos interface - subroutine ddc_kokkos(A_array, fraction_swap, max_dd_ratio) & + subroutine MatDiagDomRatio_kokkos(A_array, max_dd_ratio_achieved, local_rows_aff) & + bind(c, name="MatDiagDomRatio_kokkos") + use iso_c_binding + integer(c_long_long) :: A_array + real(PFLARE_PETSCREAL_C_KIND) :: max_dd_ratio_achieved + integer(PFLARE_PETSCINT_C_KIND) :: local_rows_aff + end subroutine MatDiagDomRatio_kokkos + + end interface + + interface + + subroutine ddc_kokkos(A_array, fraction_swap, max_dd_ratio, max_dd_ratio_achieved, Aff_array, & + random_numbers_ptr) & bind(c, name="ddc_kokkos") use iso_c_binding integer(c_long_long) :: A_array real(PFLARE_PETSCREAL_C_KIND), value :: fraction_swap - real(PFLARE_PETSCREAL_C_KIND) :: max_dd_ratio + real(PFLARE_PETSCREAL_C_KIND), value :: max_dd_ratio + real(PFLARE_PETSCREAL_C_KIND), value :: max_dd_ratio_achieved + integer(c_long_long) :: Aff_array + type(c_ptr), value :: random_numbers_ptr end subroutine ddc_kokkos end interface @@ -424,6 +441,16 @@ end subroutine copy_cf_markers_d2h end interface + interface + + subroutine copy_diag_dom_ratio_d2h(diag_dom_ratio_local) & + bind(c, name="copy_diag_dom_ratio_d2h") + use iso_c_binding + type(c_ptr), value :: diag_dom_ratio_local + end subroutine copy_diag_dom_ratio_d2h + + end interface + interface subroutine delete_device_cf_markers() & @@ -433,6 +460,15 @@ end subroutine delete_device_cf_markers end interface + interface + + subroutine delete_device_diag_dom_ratio() & + bind(c, name="delete_device_diag_dom_ratio") + use iso_c_binding + end subroutine delete_device_diag_dom_ratio + + end interface + interface subroutine compute_P_from_W_kokkos(A_array, global_row_start, indices_fine, & diff --git a/src/DDC_Module.F90 b/src/DDC_Module.F90 new file mode 100644 index 00000000..4204f3fa --- /dev/null +++ b/src/DDC_Module.F90 @@ -0,0 +1,480 @@ +module ddc_module + + use iso_c_binding + use 
petscmat + use petsc_helper, only: kokkos_debug, remove_small_from_sparse, MatCreateSubMatrixWrapper + use c_petsc_interfaces, only: copy_cf_markers_d2h, copy_diag_dom_ratio_d2h, ddc_kokkos, & + MatDiagDomRatio_kokkos, create_cf_is_kokkos, & + vecscatter_mat_begin_c, vecscatter_mat_end_c, vecscatter_mat_restore_c, MatSeqAIJGetArrayF90_mine + use pmisr_module, only: pmisr_existing_measure_cf_markers, pmisr_existing_measure_implicit_transpose + use pflare_parameters, only: C_POINT, F_POINT + use matdiagdom, only: MatDiagDomRatio + +#include "petsc/finclude/petscmat.h" +#include "finclude/PETSc_ISO_Types.h" + + implicit none + + public + + contains + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine ddc(input_mat, is_fine, fraction_swap, max_dd_ratio, cf_markers_local) + + ! Second pass diagonal dominance cleanup + ! Flips the F definitions to C based on least diagonally dominant local rows + ! If fraction_swap = 0 this does nothing + ! If fraction_swap < 0 it uses abs(fraction_swap) to be a threshold + ! for swapping C to F based on row-wise diagonal dominance (ie alpha_diag) + ! If fraction_swap > 0 it uses fraction_swap as the local fraction of worst C points to swap to F + ! though it won't hit that fraction exactly as we bin the diag dom ratios for speed, it will be close to the fraction + + ! 
~~~~~~ + type(tMat), target, intent(in) :: input_mat + type(tIS), intent(in) :: is_fine + PetscReal, intent(in) :: fraction_swap + PetscReal, intent(inout) :: max_dd_ratio + integer, dimension(:), allocatable, target, intent(inout) :: cf_markers_local + + type(tMat) :: Aff_ddc + PetscErrorCode :: ierr + logical :: trigger_dd_ratio_compute_local + PetscInt :: local_rows, local_cols + PetscReal :: max_dd_ratio_achieved + integer :: seed_size_ddc, comm_rank_ddc, errorcode_ddc, i_loc + integer, dimension(:), allocatable :: seed_ddc + PetscReal, dimension(:), allocatable :: diag_dom_ratio + PetscReal, dimension(:), allocatable, target :: diag_dom_ratio_random + type(c_ptr) :: random_numbers_ptr + MPIU_Comm :: MPI_COMM_MATRIX + +#if defined(PETSC_HAVE_KOKKOS) + integer(c_long_long) :: A_array, Aff_array, is_fine_array, is_coarse_array + MatType :: mat_type + type(c_ptr) :: cf_markers_local_ptr + integer :: errorcode + !integer :: kfree + integer, dimension(:), allocatable :: cf_markers_local_two + type(tIS) :: is_fine_temp, is_coarse_temp +#endif + ! ~~~~~~ + + trigger_dd_ratio_compute_local = max_dd_ratio > 0 + + ! If we don't need to swap anything, return + if (fraction_swap == 0d0 .AND. .NOT. trigger_dd_ratio_compute_local) then + return + end if + + ! Compute the diagonal dominance ratio - either returned in diag_dom_ratio + ! or stored in a device copy for kokkos + ! max_dd_ratio_achieved is always returned and is the max diag dom ratio across + ! all ranks + call MatDiagDomRatio(input_mat, is_fine, cf_markers_local, diag_dom_ratio, max_dd_ratio_achieved) + + ! If we have hit the required diagonal dominance ratio, return + if (trigger_dd_ratio_compute_local .AND. max_dd_ratio_achieved < max_dd_ratio) then + max_dd_ratio = max_dd_ratio_achieved + if (allocated(diag_dom_ratio)) deallocate(diag_dom_ratio) + return + end if + + ! Generate random numbers for the PMIS tie-breaking in the trigger path + ! 
These are generated here so both CPU and Kokkos use the same randoms + random_numbers_ptr = c_null_ptr + if (trigger_dd_ratio_compute_local) then + + call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) + call MPI_Comm_rank(MPI_COMM_MATRIX, comm_rank_ddc, errorcode_ddc) + call MatGetLocalSize(input_mat, local_rows, local_cols, ierr) + ! We allocate randoms here to be the size of input, rather than + ! just F points as if we are on the device the is_fine won't be allocated + ! on the host yet + allocate(diag_dom_ratio_random(local_rows)) + + call random_seed(size=seed_size_ddc) + allocate(seed_ddc(seed_size_ddc)) + + do i_loc = 1, seed_size_ddc + seed_ddc(i_loc) = comm_rank_ddc + 1 + i_loc + end do + call random_seed(put=seed_ddc) + call random_number(diag_dom_ratio_random) + + deallocate(seed_ddc) + random_numbers_ptr = c_loc(diag_dom_ratio_random) + end if + +#if defined(PETSC_HAVE_KOKKOS) + + call MatGetType(input_mat, mat_type, ierr) + if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & + mat_type == MATAIJKOKKOS) then + + ! Kokkos path: only extract Aff if trigger_dd_ratio_compute + ! as the kokkos ddc computes diag dominance ratio without needing Aff + Aff_array = 0 + A_array = input_mat%v + if (trigger_dd_ratio_compute_local) then + + ! Create the host is_fine and is_coarse based on device cf_markers + call create_cf_is_kokkos(A_array, is_fine_array, is_coarse_array) + is_fine_temp%v = is_fine_array + is_coarse_temp%v = is_coarse_array + + call MatCreateSubMatrixWrapper(input_mat, & + is_fine_temp, is_fine_temp, MAT_INITIAL_MATRIX, & + Aff_ddc) + + Aff_array = Aff_ddc%v + call ISDestroy(is_fine_temp, ierr) + call ISDestroy(is_coarse_temp, ierr) + end if + + cf_markers_local_ptr = c_loc(cf_markers_local) + + ! If debugging do a comparison between CPU and Kokkos results + if (kokkos_debug()) then + allocate(cf_markers_local_two(size(cf_markers_local))) + cf_markers_local_two = cf_markers_local + end if + + ! 
Modifies the existing device cf_markers created by the pmisr + call ddc_kokkos(A_array, fraction_swap, max_dd_ratio, max_dd_ratio_achieved, Aff_array, & + random_numbers_ptr) + + ! If debugging do a comparison between CPU and Kokkos results + if (kokkos_debug()) then + + ! Kokkos DDC by default now doesn't copy back to the host, as any subsequent ddc calls + ! use the existing device data + call copy_cf_markers_d2h(cf_markers_local_ptr) + if (trigger_dd_ratio_compute_local) then + call ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio, max_dd_ratio_achieved, & + diag_dom_ratio, cf_markers_local_two, Aff=Aff_ddc, & + diag_dom_ratio_random=diag_dom_ratio_random) + else + call ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio, max_dd_ratio_achieved, & + diag_dom_ratio, cf_markers_local_two) + end if + + if (any(cf_markers_local /= cf_markers_local_two)) then + + ! do kfree = 1, size(cf_markers_local) + ! if (cf_markers_local(kfree) /= cf_markers_local_two(kfree)) then + ! print *, kfree-1, "no match", cf_markers_local(kfree), cf_markers_local_two(kfree) + ! end if + ! end do + print *, "Kokkos and CPU versions of ddc do not match" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + deallocate(cf_markers_local_two) + end if + + ! Cleanup + if (trigger_dd_ratio_compute_local) then + call MatDestroy(Aff_ddc, ierr) + end if + + else + ! CPU path: only extract Aff if trigger_dd_ratio_compute + if (trigger_dd_ratio_compute_local) then + call MatCreateSubMatrix(input_mat, & + is_fine, is_fine, MAT_INITIAL_MATRIX, & + Aff_ddc, ierr) + call ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio, max_dd_ratio_achieved, & + diag_dom_ratio, cf_markers_local, & + Aff=Aff_ddc, & + diag_dom_ratio_random=diag_dom_ratio_random) + call MatDestroy(Aff_ddc, ierr) + else + call ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio, max_dd_ratio_achieved, & + diag_dom_ratio, cf_markers_local) + end if + end if +#else + ! 
CPU path: only extract Aff if trigger_dd_ratio_compute + if (trigger_dd_ratio_compute_local) then + call MatCreateSubMatrix(input_mat, & + is_fine, is_fine, MAT_INITIAL_MATRIX, & + Aff_ddc, ierr) + call ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio, max_dd_ratio_achieved, & + diag_dom_ratio, cf_markers_local, & + Aff=Aff_ddc, & + diag_dom_ratio_random=diag_dom_ratio_random) + call MatDestroy(Aff_ddc, ierr) + else + call ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio, max_dd_ratio_achieved, & + diag_dom_ratio, cf_markers_local) + end if +#endif + + if (allocated(diag_dom_ratio_random)) deallocate(diag_dom_ratio_random) + if (allocated(diag_dom_ratio)) deallocate(diag_dom_ratio) + + end subroutine ddc + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio, max_dd_ratio_achieved, diag_dom_ratio, & + cf_markers_local, Aff, diag_dom_ratio_random) + + ! Second pass diagonal dominance cleanup + ! Flips the F definitions to C based on least diagonally dominant local rows + ! If fraction_swap = 0 this does nothing + ! If fraction_swap < 0 it uses abs(fraction_swap) to be a threshold + ! for swapping C to F based on row-wise diagonal dominance (ie alpha_diag) + ! If fraction_swap > 0 it uses fraction_swap as the local fraction of worst C points to swap to F + ! though it won't hit that fraction exactly as we bin the diag dom ratios for speed, it will be close to the fraction + + ! 
~~~~~~ + type(tMat), target, intent(in) :: input_mat + type(tIS), intent(in) :: is_fine + PetscReal, intent(in) :: fraction_swap + PetscReal, intent(in) :: max_dd_ratio + PetscReal, intent(in) :: max_dd_ratio_achieved + PetscReal, dimension(:), intent(in) :: diag_dom_ratio + integer, dimension(:), allocatable, intent(inout) :: cf_markers_local + type(tMat), intent(in), optional :: Aff + PetscReal, dimension(:), intent(in), optional :: diag_dom_ratio_random + + ! Local + PetscInt :: local_rows, one=1 + PetscInt :: ifree + PetscInt :: input_row_start, input_row_end_plus_one + PetscInt :: idx, search_size, fine_size, frac_size + integer :: bin_sum, bin_boundary, bin, errorcode + integer :: max_luby_steps + PetscErrorCode :: ierr + PetscReal, dimension(:), allocatable :: diag_dom_ratio_measure + integer, dimension(:), allocatable :: cf_markers_local_aff + PetscInt, dimension(:), pointer :: is_pointer + real(c_double) :: swap_dom_val + integer, dimension(1000) :: dom_bins + MPIU_Comm :: MPI_COMM_MATRIX + logical :: trigger_dd_ratio_compute + + ! ~~~~~~ + + ! Get the communicator + call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) + + ! The indices are the numbering in the local fine row set + call ISGetIndices(is_fine, is_pointer, ierr) + call ISGetLocalSize(is_fine, fine_size, ierr) + + trigger_dd_ratio_compute = max_dd_ratio > 0 + + ! Trigger path requires Aff and pre-generated random numbers + if (trigger_dd_ratio_compute) then + if (.NOT. present(Aff) .OR. .NOT. present(diag_dom_ratio_random)) then + print *, "ddc_cpu missing Aff/randoms for trigger path" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + end if + + ! Do a fixed alpha_diag + if (fraction_swap < 0) then + ! We have to look through all the local rows + search_size = fine_size + + ! Or pick alpha_diag based on the worst % of rows + else + ! Only need to go through the biggest % of indices + frac_size = int(dble(fine_size) * fraction_swap) + + ! 
If we are trying to hit a given max_dd_ratio, then we need to continue coarsening, even + ! if we only change one dof at a time, otherwise we could get stuck + if (trigger_dd_ratio_compute) then + search_size = max(one, frac_size) + ! If we're not trying to hit a given max_dd_ratio, then if fraction_swap is small + ! we allow it to just not swap anything if the number of local rows is small + ! This stops many lower levels in parallel where we are only changing one dof at a time + else + search_size = frac_size + end if + end if + + call MatGetOwnershipRange(input_mat, input_row_start, input_row_end_plus_one, ierr) + + local_rows = fine_size + dom_bins = 0 + + ! ~~~~~~~~ + ! Trigger path: use PMIS-based independent set to swap F points + ! ~~~~~~~~ + if (trigger_dd_ratio_compute) then + ! ~~~~~~~~ + ! If we haven't hit the required diagonal dominance ratio, + ! then we need to swap some F points to C points, and we will do that with a + ! PMIS style algorithm + ! This lets us swap many points at once, without just picking every point that + ! is above the max ratio and swapping all of them - this would coarsen faster than + ! necessary as the removal of any one F point changes the diag dom of any connected + ! F points + ! We also don't want to do it one F point at a time as that would be slow + ! Hence the independent set is the best of both worlds and very parallel + ! + ! We go over all existing F points and compute an independent set + ! in Aff + Aff^T with a measure given by the diagonal dominance ratio + ! This will build an independent set of the biggest diagonal dominance ratio + ! We then swap all of those to C points and then the outer loop outside + ! this routine can recompute the diagonal dominace ratio and decide if we + ! want to do this again + ! If there are F points > max_ratio but with neighbours all < max_ratio this + ! point will be swapped + ! If there are F points > max_ratio and with some neighbours > max_ratio then + ! 
only one of those points in the neighbourhood will be swapped, namely the one + ! with the worst diagonal dominance ratio (this is a heuristic). + ! MacLachlan & Saad (2007) page 2120 for example multiply the diagonal dominance ratio + ! by the 1 on the number of neighbours in S - this would prioritise swapping bad entries + ! with many F-F connections (ie keeping Aff sparse) + ! The ratio of all neighbouring + ! rows will change and may be below the max_ratio after the swap + ! That will be picked up in the next outer loop. + ! ~~~~~~~~ + allocate(diag_dom_ratio_measure(local_rows)) + + ! Use the random numbers passed in from the wrapper + ! so that CPU and Kokkos use the same randoms for PMIS tie-breaking + diag_dom_ratio_measure = diag_dom_ratio_random(1:local_rows) + + ! ~~~~~~~~ + ! pmisr_existing_measure_cf_markers tags the points with the smallest + ! measure as F points + ! So if we feed in a measure that is like 10 - diag_dom_ratio, it will + ! pick the points with the biggest diagonal dominace ratio + ! If a point is already below the requested ratio, we set it to be + ! PETSC_MAX_REAL so it will never be picked + ! ~~~~~~~~ + + ! Now we take the existing random number and scale it down + ! to break ties but not change the diagonal dominance very much + ! PMISR sets the smallest measure as F points (which is what + ! we're going to use to denote points that need to swap in the loop below) + ! We feed in only F points and a zero cf_markers_local_aff and then + ! after the PMISR we take any points tagged as "F" from that result + ! and swap them. + ! The reason we feed in something like 10 - diag_dominance_ratio is not only + ! as we want it to pick the biggest diag dominance ratio but also + ! we have to ensure abs(measure) .ge. 1 + ! as the PMISR has a step where it sets anything with measure < 1 as F directly + ! 
given PMISR is normally called with the measure being the number of strong neighbours + diag_dom_ratio_measure = max(10d0, max_dd_ratio_achieved*2d0) - (diag_dom_ratio - diag_dom_ratio_measure/1d10) + + allocate(cf_markers_local_aff(local_rows)) + cf_markers_local_aff = 0 + + ! And then any points with diagonal dominance ratio already below + ! the minimum, we set the measure to PETSC_MAX_REAL and assign them as "C" already + ! so they won't be swapped + do ifree = 1, local_rows + ! Check against the diag_dom_ratio that we haven't modified + if (diag_dom_ratio(ifree) < max_dd_ratio) then + diag_dom_ratio_measure(ifree) = PETSC_MAX_REAL + cf_markers_local_aff(ifree) = C_POINT + end if + end do + + ! Call PMISR with as many steps as necessary + ! Uses the implicit transpose version which takes Aff directly + ! and handles Aff+Aff^T internally without forming the explicit sum + max_luby_steps = -1 + call pmisr_existing_measure_implicit_transpose(Aff, max_luby_steps, .FALSE., & + diag_dom_ratio_measure, cf_markers_local_aff) + + ! Let's go and swap the badly diagonally dominant rows to F points + do ifree = 1, local_rows + + ! The pmisr_existing_measure_cf_markers marked the points we want to swap as F + if (cf_markers_local_aff(ifree) == F_POINT) then + ! This is the actual numbering in A, rather than Aff + ! Careful here to minus away the row_start of A, not Aff + ! as cf_markers_local is as big as A + idx = is_pointer(ifree) - input_row_start + 1 + + ! Swap by multiplying by -1 + cf_markers_local(idx) = cf_markers_local(idx) * (-1) + end if + end do + + deallocate(cf_markers_local_aff, diag_dom_ratio_measure) + call ISRestoreIndices(is_fine, is_pointer, ierr) + + ! Return as we're done + return + end if + + ! ~~~~~~~~~~~~~ + ! If we got here then the user doesn't want us to hit a given diagonal dominance ratio + ! So we just swap a fixed fraction of the worst F points to C + ! ~~~~~~~~~~~~~ + + ! If we have local points to swap + if (search_size > 0) then + + ! 
If we reach here then we want to swap some local F points to C points + + do ifree = 1, local_rows + + ! Bin the entries between 0 and 1 + ! The top bin has entries greater than 0.9 (including greater than 1) + bin = min(floor(diag_dom_ratio(ifree) * size(dom_bins)) + 1, size(dom_bins)) + ! If the diagonal dominance ratio is really large the expression above will overflow + ! the int to negative, so we just stick that in the top bin + if (bin < 0) then + bin = size(dom_bins) + end if + dom_bins(bin) = dom_bins(bin) + 1 + + end do + + ! Do a fixed alpha_diag + if (fraction_swap< 0) then + swap_dom_val = -fraction_swap + + ! Otherwise swap everything bigger than a fixed fraction + else + + ! In order to reduce the size of the sort required, we have binned the entries into 1000 bins + ! Let's count backwards from the biggest entries to find which bin we know the nth_element is in + ! and then we only include those bins and higher into the sort + bin_sum = 0 + do bin_boundary = size(dom_bins), 1, -1 + bin_sum = bin_sum + dom_bins(bin_boundary) + if (bin_sum .ge. search_size) exit + end do + ! Now bin_boundary holds the bin whose lower boundary is guaranteed to be <= the n_th element + + ! Rather than do any type of sort, just swap everything above that bin boundary + ! This will give a fraction_swap that is very close to that passed in as long as the + ! size of the bins is small + swap_dom_val = dble(bin_boundary-1)/dble(size(dom_bins)) + + end if + + ! Let's go and swap F points to C points + do ifree = 1, local_rows + + ! If this row only has a single diagonal entry, or is below the threshold we swap, skip + if (diag_dom_ratio(ifree) == 0 .OR. diag_dom_ratio(ifree) < swap_dom_val) cycle + + ! This is the actual numbering in A, rather than Aff + ! Careful here to minus away the row_start of A, not Aff, as cf_markers_local is as big as A + idx = is_pointer(ifree) - input_row_start + 1 + + ! 
Swap by multiplying by -1 + cf_markers_local(idx) = cf_markers_local(idx) * (-1) + end do + end if + + call ISRestoreIndices(is_fine, is_pointer, ierr) + + end subroutine ddc_cpu + +! ------------------------------------------------------------------------------------------------------------------------------- + +end module ddc_module + diff --git a/src/DDC_Modulek.kokkos.cxx b/src/DDC_Modulek.kokkos.cxx new file mode 100644 index 00000000..61fc89b5 --- /dev/null +++ b/src/DDC_Modulek.kokkos.cxx @@ -0,0 +1,197 @@ +// Our petsc kokkos definitions - has to go first +#include "kokkos_helper.hpp" +#include +#include <../src/mat/impls/aij/seq/aij.h> +#include <../src/mat/impls/aij/mpi/mpiaij.h> + +// The definition of the device copy of the cf markers on a given level +// is stored in Device_Datak.kokkos.cxx and imported as extern from +// kokkos_helper.hpp + +//------------------------------------------------------------------------------------------------------------------------ + +// ddc cleanup but on the device - uses the global variable cf_markers_local_d +// This no longer copies back to the host pointer cf_markers_local at the end +// You have to explicitly call copy_cf_markers_d2h(cf_markers_local) to do this +PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, const PetscReal max_dd_ratio, const PetscReal max_dd_ratio_achieved, Mat *aff, PetscReal *random_numbers) +{ + // Can't use the global directly within the parallel + // regions on the device + intKokkosView cf_markers_d = cf_markers_local_d; + PetscScalarKokkosView diag_dom_ratio_d = diag_dom_ratio_local_d; + PetscIntKokkosView is_fine_local_d; + + const int match_cf = -1; // F_POINT == -1 + create_cf_is_device_kokkos(input_mat, match_cf, is_fine_local_d); + PetscInt local_rows_aff = is_fine_local_d.extent(0); + + bool trigger_dd_ratio_compute = max_dd_ratio > 0; + auto exec = PetscGetKokkosExecutionSpace(); + + // Do a fixed alpha_diag + PetscInt search_size; + if (fraction_swap 
< 0) { + // We have to look through all the local rows + search_size = local_rows_aff; + } + // Or pick alpha_diag based on the worst % of rows + else { + // Only need to go through the biggest % of indices + PetscInt one = 1; + + // If we are trying to hit a given max_dd_ratio, then we need to continue coarsening, even + // if we only change one dof at a time + if (trigger_dd_ratio_compute) + { + search_size = std::max(one, static_cast(double(local_rows_aff) * fraction_swap)); + } + // If we're not trying to hit a given max_dd_ratio, then if fraction_swap is small + // we allow it to just not swap anything if the number of local rows is small + // This stops many lower levels in parallel where we are only changing one dof at a time + else + { + search_size = static_cast(double(local_rows_aff) * fraction_swap); + } + } + + if (trigger_dd_ratio_compute) + { + // ~~~~~~~~~~~~~~~ + // Ratio not met - use PMIS-based independent set to swap F points + // This mirrors the CPU ddc_cpu logic when trigger_dd_ratio_compute is true + // We build an independent set in Aff + Aff^T with a measure given by the + // diagonal dominance ratio, swap those to C points, and let the outer loop + // recompute + // ~~~~~~~~~~~~~~~ + { + // Create measure and cf_markers for Aff + PetscScalarKokkosView measure_d("measure_d", local_rows_aff); + intKokkosView cf_markers_aff_d("cf_markers_aff_d", local_rows_aff); + Kokkos::deep_copy(cf_markers_aff_d, 0); + + // Copy the random numbers from host to device + // These are generated in the Fortran wrapper so CPU and Kokkos use the same randoms + PetscScalarKokkosViewHost random_h(random_numbers, local_rows_aff); + PetscScalarKokkosView random_d("random_d", local_rows_aff); + Kokkos::deep_copy(random_d, random_h); + PetscCallVoid(PetscLogCpuToGpu(local_rows_aff * sizeof(PetscReal))); + + const PetscReal max_scale = std::max(10.0, max_dd_ratio_achieved * 2.0); + const PetscReal target_ratio = max_dd_ratio; + + // Build the measure: + // 
pmisr_existing_measure_cf_markers tags the smallest measure as F points + // So we feed in measure = max(10, max_achieved*2) - (diag_dom_ratio - random/1e10) + // which picks the biggest diagonal dominance ratio + // We have to ensure abs(measure) >= 1 as PMISR sets anything with measure < 1 as F directly + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows_aff), KOKKOS_LAMBDA(PetscInt i) { + + // Scale: measure = max(10, max_achieved*2) - (diag_dom_ratio - random/1e10) + measure_d(i) = max_scale - (diag_dom_ratio_d(i) - random_d(i) / 1e10); + + // Points already below threshold: set measure to max and mark as C + // so they won't be swapped + if (diag_dom_ratio_d(i) < target_ratio) { + measure_d(i) = PETSC_MAX_REAL; + cf_markers_aff_d(i) = 1; // C_POINT + } + }); + exec.fence(); + + // Call PMISR with implicit transpose - takes Aff directly, handles Aff+Aff^T internally + // pmis_int=0 means PMISR, zero_measure_c_point_int=0 + pmisr_existing_measure_implicit_transpose_kokkos(aff, -1, 0, measure_d, cf_markers_aff_d, 0); + + // Swap F-tagged points back into cf_markers_d + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows_aff), KOKKOS_LAMBDA(PetscInt i) { + if (cf_markers_aff_d(i) == -1) { // F_POINT + PetscInt idx = is_fine_local_d(i); + cf_markers_d(idx) *= -1; + } + }); + exec.fence(); + } + return; + } + + // Can't put this above because of collective operations in parallel (namely the MatDiagDomRatio_kokkos) + // If we have local points to swap + if (search_size > 0) + { + // If we reach here then we want to swap some local F points to C points + + // Create device memory for bins + auto dom_bins_d = PetscIntKokkosView("dom_bins_d", 1000); + Kokkos::deep_copy(dom_bins_d, 0); + + // Bin the diagonal dominance ratio + if (fraction_swap > 0) + { + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows_aff), KOKKOS_LAMBDA(PetscInt i) { + + // Let's bin the entry + int bin; + int test_bin = floor(diag_dom_ratio_d(i) * 
double(dom_bins_d.extent(0))) + 1; + if (test_bin < int(dom_bins_d.extent(0)) && test_bin >= 0) { + bin = test_bin; + } + else { + bin = dom_bins_d.extent(0); + } + // Has to be atomic as many threads from different rows + // may be writing to the same bin + Kokkos::atomic_add(&dom_bins_d(bin - 1), 1); + }); + } + + PetscReal swap_dom_val; + // Do a fixed alpha_diag + if (fraction_swap < 0){ + swap_dom_val = -fraction_swap; + } + // Otherwise swap everything bigger than a fixed fraction + else{ + + // Parallel scan to inclusive sum the number of entries we have in + // the bins + Kokkos::parallel_scan(dom_bins_d.extent(0), KOKKOS_LAMBDA (const PetscInt i, PetscInt& update, const bool final) { + // Inclusive scan + update += dom_bins_d(i); + if (final) { + dom_bins_d(i) = update; // only update array on final pass + } + }); + + // Now if we reduce how many are > the search_size, we know the bin boundary we want + int bin_boundary = 0; + Kokkos::parallel_reduce ("ReductionBin", dom_bins_d.extent(0), KOKKOS_LAMBDA (const int i, int& update) { + if (dom_bins_d(i) > dom_bins_d(dom_bins_d.extent(0)-1) - search_size) update++; + }, bin_boundary); + + bin_boundary = dom_bins_d.extent(0) - bin_boundary; + swap_dom_val = double(bin_boundary) / double(dom_bins_d.extent(0)); + + } + + // Go and swap F points to C points + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows_aff), KOKKOS_LAMBDA(PetscInt i) { + + if (diag_dom_ratio_d(i) != 0.0 && diag_dom_ratio_d(i) >= swap_dom_val) + { + // This is the actual numbering in A, rather than Aff + PetscInt idx = is_fine_local_d(i); + cf_markers_d(idx) *= -1; + } + }); + // Ensure we're done before we exit + exec.fence(); + } + + return; +} + +//------------------------------------------------------------------------------------------------------------------------ \ No newline at end of file diff --git a/src/Device_Datak.kokkos.cxx b/src/Device_Datak.kokkos.cxx new file mode 100644 index 00000000..8799ceb6 --- /dev/null +++ 
b/src/Device_Datak.kokkos.cxx @@ -0,0 +1,185 @@ +// Our petsc kokkos definitions - has to go first +#include "kokkos_helper.hpp" +#include +#include <../src/mat/impls/aij/seq/aij.h> +#include <../src/mat/impls/aij/mpi/mpiaij.h> + +// This is a device copy of the cf markers on a given level +// to save having to copy it to/from the host between pmisr and ddc calls +intKokkosView cf_markers_local_d; +// Device copy of local fine-point diagonal-dominance ratios for DDC +PetscScalarKokkosView diag_dom_ratio_local_d; + +//------------------------------------------------------------------------------------------------------------------------ + +// Copy the global cf_markers_local_d back to the host +PETSC_INTERN void copy_cf_markers_d2h(int *cf_markers_local) +{ + // Host wrapper for cf_markers_local + intKokkosViewHost cf_markers_local_h(cf_markers_local, cf_markers_local_d.extent(0)); + + // Now copy device cf_markers_local_d back to host + Kokkos::deep_copy(cf_markers_local_h, cf_markers_local_d); + // Log copy with petsc + size_t bytes = cf_markers_local_d.extent(0) * sizeof(int); + PetscCallVoid(PetscLogGpuToCpu(bytes)); + + return; +} + +//------------------------------------------------------------------------------------------------------------------------ + +// Copy the global diag_dom_ratio_local_d back to the host +PETSC_INTERN void copy_diag_dom_ratio_d2h(PetscReal *diag_dom_ratio_local) +{ + // Host wrapper for diag_dom_ratio_local + PetscScalarKokkosViewHost diag_dom_ratio_h(diag_dom_ratio_local, diag_dom_ratio_local_d.extent(0)); + + // Copy device diag_dom_ratio_local_d back to host + Kokkos::deep_copy(diag_dom_ratio_h, diag_dom_ratio_local_d); + // Log copy with petsc + size_t bytes = diag_dom_ratio_local_d.extent(0) * sizeof(PetscReal); + PetscCallVoid(PetscLogGpuToCpu(bytes)); + + return; +} + +//------------------------------------------------------------------------------------------------------------------------ + +// Delete the global 
cf_markers_local_d +PETSC_INTERN void delete_device_cf_markers() +{ + // Delete the device view - this assigns an empty view + // and hence the old view has its ref counter decremented + cf_markers_local_d = intKokkosView(); + + return; +} + +//------------------------------------------------------------------------------------------------------------------------ + +// Delete the global diag_dom_ratio_local_d +PETSC_INTERN void delete_device_diag_dom_ratio() +{ + // Delete the device view - this assigns an empty view + // and hence the old view has its ref counter decremented + diag_dom_ratio_local_d = PetscScalarKokkosView(); + + return; +} + +//------------------------------------------------------------------------------------------------------------------------ + +// Creates the device local indices for F or C points based on the global cf_markers_local_d +PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, PetscIntKokkosView &is_local_d) +{ + PetscInt local_rows, local_cols; + PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); + auto exec = PetscGetKokkosExecutionSpace(); + + // Can't use the global directly within the parallel + // regions on the device + intKokkosView cf_markers_d = cf_markers_local_d; + + // ~~~~~~~~~~~~ + // Get the F point local indices from cf_markers_local_d + // ~~~~~~~~~~~~ + PetscIntKokkosView point_offsets_d("point_offsets_d", local_rows+1); + + // Doing an exclusive scan to get the offsets for our local indices + // Doing one larger so we can get the total number of points + Kokkos::parallel_scan("point_offsets_d_scan", + Kokkos::RangePolicy<>(0, local_rows+1), + KOKKOS_LAMBDA(const PetscInt i, PetscInt& update, const bool final_pass) { + bool is_f_point = false; + if (i < local_rows) { // Predicate is based on original data up to local_rows-1 + is_f_point = (cf_markers_d(i) == match_cf); // is this point match_cf + } + if (final_pass) { + point_offsets_d(i) = update; + } + if 
(is_f_point) { + update++; + } + } + ); + + // The last entry in point_offsets_d is the total number of points that match match_cf + PetscInt local_rows_row = 0; + Kokkos::deep_copy(local_rows_row, Kokkos::subview(point_offsets_d, local_rows)); + + // This will be equivalent to is_fine - global_row_start, ie the local indices + is_local_d = PetscIntKokkosView("is_local_d", local_rows_row); + + // ~~~~~~~~~~~~ + // Write the local indices + // ~~~~~~~~~~~~ + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { + // Is this point match_cf + if (cf_markers_d(i) == match_cf) { + // point_offsets_d(i) gives the correct local index + is_local_d(point_offsets_d(i)) = i; + } + }); + // Ensure we're done before we exit + exec.fence(); +} + +//------------------------------------------------------------------------------------------------------------------------ + +// Creates the host IS is_fine and is_coarse based on the global cf_markers_local_d +PETSC_INTERN void create_cf_is_kokkos(Mat *input_mat, IS *is_fine, IS *is_coarse) +{ + PetscIntKokkosView is_fine_local_d, is_coarse_local_d; + MPI_Comm MPI_COMM_MATRIX; + PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); + + // Create the local f point indices + const int match_fine = -1; // F_POINT == -1 + create_cf_is_device_kokkos(input_mat, match_fine, is_fine_local_d); + + // Create the local C point indices + const int match_coarse = 1; // C_POINT == 1 + create_cf_is_device_kokkos(input_mat, match_coarse, is_coarse_local_d); + + // Now convert them back to global indices + PetscInt global_row_start, global_row_end_plus_one; + PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); + + // Convert F points + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, is_fine_local_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { + + is_fine_local_d(i) += global_row_start; + }); + // Convert C points + Kokkos::parallel_for( + 
Kokkos::RangePolicy<>(0, is_coarse_local_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { + + is_coarse_local_d(i) += global_row_start; + }); + + // Create some host space for the indices + PetscInt *is_fine_array = nullptr, *is_coarse_array = nullptr; + PetscInt n_fine = is_fine_local_d.extent(0); + PetscCallVoid(PetscMalloc1(n_fine, &is_fine_array)); + PetscIntKokkosViewHost is_fine_h = PetscIntKokkosViewHost(is_fine_array, is_fine_local_d.extent(0)); + PetscInt n_coarse = is_coarse_local_d.extent(0); + PetscCallVoid(PetscMalloc1(n_coarse, &is_coarse_array)); + PetscIntKokkosViewHost is_coarse_h = PetscIntKokkosViewHost(is_coarse_array, n_coarse); + + // Copy over the indices to the host + Kokkos::deep_copy(is_fine_h, is_fine_local_d); + Kokkos::deep_copy(is_coarse_h, is_coarse_local_d); + // Log copy with petsc + size_t bytes_fine = is_fine_local_d.extent(0) * sizeof(PetscInt); + size_t bytes_coarse = is_coarse_local_d.extent(0) * sizeof(PetscInt); + PetscCallVoid(PetscLogGpuToCpu(bytes_fine + bytes_coarse)); + + // Now we can create the IS objects + PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, is_fine_local_d.extent(0), is_fine_array, PETSC_OWN_POINTER, is_fine)); + PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, is_coarse_local_d.extent(0), is_coarse_array, PETSC_OWN_POINTER, is_coarse)); +} + +//------------------------------------------------------------------------------------------------------------------------ diff --git a/src/MatDiagDom.F90 b/src/MatDiagDom.F90 new file mode 100644 index 00000000..21bfea18 --- /dev/null +++ b/src/MatDiagDom.F90 @@ -0,0 +1,263 @@ +module matdiagdom + + use iso_c_binding + use petscmat + use petsc_helper, only: kokkos_debug + use c_petsc_interfaces, only: copy_diag_dom_ratio_d2h, MatDiagDomRatio_kokkos, & + vecscatter_mat_begin_c, vecscatter_mat_end_c, vecscatter_mat_restore_c, MatSeqAIJGetArrayF90_mine + use pflare_parameters, only: C_POINT, F_POINT + +#include "petsc/finclude/petscmat.h" +#include "finclude/PETSc_ISO_Types.h" 
+ + implicit none + + PetscReal, parameter :: dd_ratio_abs_tol = 1d-12 + PetscReal, parameter :: dd_ratio_rel_tol = 1d-10 + + public + + contains + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine MatDiagDomRatio(input_mat, is_fine, cf_markers_local, diag_dom_ratio, max_dd_ratio_achieved) + + ! Wrapper for diagonal-dominance ratio computation. + ! Chooses Kokkos or CPU implementation and optionally compares the + ! resulting host ratios in debug mode. + + type(tMat), target, intent(in) :: input_mat + type(tIS), intent(in) :: is_fine + integer, dimension(:), intent(in) :: cf_markers_local + PetscReal, dimension(:), allocatable, target, intent(out) :: diag_dom_ratio + PetscReal, intent(out) :: max_dd_ratio_achieved + + PetscErrorCode :: ierr + MPIU_Comm :: MPI_COMM_MATRIX + +#if defined(PETSC_HAVE_KOKKOS) + integer :: errorcode, ifree + MatType :: mat_type + PetscReal :: tol, diff + integer(c_long_long) :: A_array + PetscInt :: local_rows_aff_kokkos + type(c_ptr) :: diag_dom_ratio_ptr + PetscReal, dimension(:), allocatable :: diag_dom_ratio_cpu + PetscReal :: max_dd_ratio_cpu +#endif + + call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) + +#if defined(PETSC_HAVE_KOKKOS) + call MatGetType(input_mat, mat_type, ierr) + if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. 
& + mat_type == MATAIJKOKKOS) then + + A_array = input_mat%v + local_rows_aff_kokkos = 0 + max_dd_ratio_achieved = 0d0 + call MatDiagDomRatio_kokkos(A_array, max_dd_ratio_achieved, local_rows_aff_kokkos) + + if (kokkos_debug()) then + allocate(diag_dom_ratio(local_rows_aff_kokkos)) + if (local_rows_aff_kokkos > 0) then + diag_dom_ratio_ptr = c_loc(diag_dom_ratio) + call copy_diag_dom_ratio_d2h(diag_dom_ratio_ptr) + end if + + call MatDiagDomRatio_cpu(input_mat, is_fine, cf_markers_local, diag_dom_ratio_cpu, max_dd_ratio_cpu) + + tol = dd_ratio_abs_tol + dd_ratio_rel_tol * max(abs(max_dd_ratio_cpu), abs(max_dd_ratio_achieved)) + if (abs(max_dd_ratio_cpu - max_dd_ratio_achieved) > tol) then + print *, "Kokkos and CPU MatDiagDomRatio global max do not match" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + + do ifree = 1, size(diag_dom_ratio) + diff = abs(diag_dom_ratio(ifree) - diag_dom_ratio_cpu(ifree)) + tol = dd_ratio_abs_tol + dd_ratio_rel_tol * max(abs(diag_dom_ratio(ifree)), abs(diag_dom_ratio_cpu(ifree))) + if (diff > tol) then + print *, "Kokkos and CPU MatDiagDomRatio entries do not match" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + end do + + deallocate(diag_dom_ratio_cpu) + end if + else + call MatDiagDomRatio_cpu(input_mat, is_fine, cf_markers_local, diag_dom_ratio, max_dd_ratio_achieved) + end if +#else + call MatDiagDomRatio_cpu(input_mat, is_fine, cf_markers_local, diag_dom_ratio, max_dd_ratio_achieved) +#endif + + end subroutine MatDiagDomRatio + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine MatDiagDomRatio_cpu(input_mat, is_fine, cf_markers_local, diag_dom_ratio, max_dd_ratio_achieved) + + ! Compute diagonal dominance ratio over local fine rows of input_mat + ! without extracting Aff. This mirrors the Kokkos MatDiagDomRatio path: + ! sum abs(F-neighbour off-diagonals) / abs(F-diagonal), with nonlocal + ! 
F markers obtained from the matrix halo scatter. + + ! ~~~~~~ + + type(tMat), target, intent(in) :: input_mat + type(tIS), intent(in) :: is_fine + integer, dimension(:), intent(in) :: cf_markers_local + PetscReal, dimension(:), allocatable, intent(out) :: diag_dom_ratio + PetscReal, intent(out) :: max_dd_ratio_achieved + + ! Local + PetscInt :: local_rows, local_cols, global_rows, global_cols, fine_size + PetscInt :: input_row_start, input_row_end_plus_one + PetscInt :: ifree, jfree, local_row, target_col, rows_ao, cols_ao + PetscInt :: n_ad, n_ao + PetscInt, parameter :: one = 1 + PetscErrorCode :: ierr + integer :: errorcode, comm_size + MPIU_Comm :: MPI_COMM_MATRIX + integer(c_long_long) :: A_array, vec_long, Ad_array, Ao_array + type(tMat) :: Ad, Ao + type(tVec) :: cf_markers_vec + PetscInt, dimension(:), pointer :: is_pointer => null(), colmap => null() + PetscInt, dimension(:), pointer :: ad_ia => null(), ad_ja => null(), ao_ia => null(), ao_ja => null() + PetscReal, dimension(:), pointer :: ad_vals => null(), ao_vals => null(), cf_markers_nonlocal => null() + PetscReal, dimension(:), allocatable, target :: cf_markers_local_real + type(c_ptr) :: ad_vals_c_ptr, ao_vals_c_ptr, cf_markers_nonlocal_ptr + PetscInt :: shift = 0 + PetscBool :: symmetric = PETSC_FALSE, inodecompressed = PETSC_FALSE, done + PetscReal :: diag_val, off_diag_sum + PetscReal :: max_dd_ratio_local + logical :: mpi + + ! 
~~~~~~ + + call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) + call MatGetLocalSize(input_mat, local_rows, local_cols, ierr) + call MatGetSize(input_mat, global_rows, global_cols, ierr) + call MatGetOwnershipRange(input_mat, input_row_start, input_row_end_plus_one, ierr) + call ISGetLocalSize(is_fine, fine_size, ierr) + call ISGetIndices(is_fine, is_pointer, ierr) + + allocate(diag_dom_ratio(fine_size)) + diag_dom_ratio = 0d0 + + call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) + mpi = comm_size /= 1 + + if (mpi) then + call MatMPIAIJGetSeqAIJ(input_mat, Ad, Ao, colmap, ierr) + call MatGetSize(Ao, rows_ao, cols_ao, ierr) + else + Ad = input_mat + end if + + ! Get pointers to the local/off-diagonal CSR structures. + ! This mirrors the Kokkos path, which accesses local and nonlocal CSR directly. + call MatGetRowIJ(Ad, shift, symmetric, inodecompressed, n_ad, ad_ia, ad_ja, done, ierr) + if (.NOT. done) then + print *, "Pointers not set in call to MatGetRowIJ" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + + if (mpi) then + call MatGetRowIJ(Ao, shift, symmetric, inodecompressed, n_ao, ao_ia, ao_ja, done, ierr) + if (.NOT. done) then + print *, "Pointers not set in call to MatGetRowIJ" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + end if + + Ad_array = Ad%v + call MatSeqAIJGetArrayF90_mine(Ad_array, ad_vals_c_ptr) + call c_f_pointer(ad_vals_c_ptr, ad_vals, shape=[size(ad_ja)]) + + ! Off-diagonal rows require a halo exchange of cf markers. + ! Start and finish the scatter once, then reuse the received nonlocal markers + ! while looping over all local fine rows. 
+ if (mpi) then + Ao_array = Ao%v + call MatSeqAIJGetArrayF90_mine(Ao_array, ao_vals_c_ptr) + call c_f_pointer(ao_vals_c_ptr, ao_vals, shape=[size(ao_ja)]) + + allocate(cf_markers_local_real(local_rows)) + if (local_rows > 0) cf_markers_local_real = dble(cf_markers_local(1:local_rows)) + + call VecCreateMPIWithArray(MPI_COMM_MATRIX, one, local_rows, global_rows, & + cf_markers_local_real, cf_markers_vec, ierr) + A_array = input_mat%v + vec_long = cf_markers_vec%v + call vecscatter_mat_begin_c(A_array, vec_long, cf_markers_nonlocal_ptr) + call vecscatter_mat_end_c(A_array, vec_long, cf_markers_nonlocal_ptr) + call c_f_pointer(cf_markers_nonlocal_ptr, cf_markers_nonlocal, shape=[cols_ao]) + end if + + ! Compute diagonal-dominance sums over the local fine-row list. + ! For each row: accumulate abs(off-diagonal) over F neighbors only, + ! store abs(diagonal) for an F diagonal entry, then form ratio. + do ifree = 1, fine_size + local_row = is_pointer(ifree) - input_row_start + 1 + diag_val = 0d0 + off_diag_sum = 0d0 + + do jfree = ad_ia(local_row) + 1, ad_ia(local_row + 1) + target_col = ad_ja(jfree) + 1 + + if (cf_markers_local(target_col) /= F_POINT) cycle + + if (target_col == local_row) then + diag_val = abs(ad_vals(jfree)) + else + off_diag_sum = off_diag_sum + abs(ad_vals(jfree)) + end if + end do + + if (mpi) then + do jfree = ao_ia(local_row) + 1, ao_ia(local_row + 1) + target_col = ao_ja(jfree) + 1 + + if (nint(cf_markers_nonlocal(target_col)) /= F_POINT) cycle + + off_diag_sum = off_diag_sum + abs(ao_vals(jfree)) + end do + end if + + ! If no diagonal was found, keep ratio at zero. + ! This matches the Kokkos behavior for rows without a diagonal entry. + if (diag_val /= 0d0) then + diag_dom_ratio(ifree) = off_diag_sum / diag_val + end if + end do + + ! Cleanup for halo scatter resources. + if (mpi) then + call vecscatter_mat_restore_c(A_array, cf_markers_nonlocal_ptr) + call VecDestroy(cf_markers_vec, ierr) + deallocate(cf_markers_local_real) + end if + + ! 
Restore CSR pointers before returning. + call MatRestoreRowIJ(Ad, shift, symmetric, inodecompressed, n_ad, ad_ia, ad_ja, done, ierr) + if (mpi) then + call MatRestoreRowIJ(Ao, shift, symmetric, inodecompressed, n_ao, ao_ia, ao_ja, done, ierr) + end if + + call ISRestoreIndices(is_fine, is_pointer, ierr) + + if (fine_size == 0) then + max_dd_ratio_local = 0d0 + else + max_dd_ratio_local = maxval(diag_dom_ratio) + end if + call MPI_Allreduce(max_dd_ratio_local, max_dd_ratio_achieved, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_MATRIX, errorcode) + + end subroutine MatDiagDomRatio_cpu + +! ------------------------------------------------------------------------------------------------------------------------------- + +end module matdiagdom + diff --git a/src/MatDiagDomSubmatrix.F90 b/src/MatDiagDomSubmatrix.F90 new file mode 100644 index 00000000..826f9da1 --- /dev/null +++ b/src/MatDiagDomSubmatrix.F90 @@ -0,0 +1,81 @@ +module matdiagdomsubmatrix + + use petscmat + use cf_splitting, only: compute_cf_splitting, CF_DIAG_DOM + use petsc_helper, only: MatCreateSubMatrixWrapper + use pflare_parameters, only: + +#include "petsc/finclude/petscmat.h" + + implicit none + public + + contains + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine compute_diag_dom_submatrix(input_mat, max_dd_ratio, output_mat) + + ! Returns a diagonally dominant submatrix taken from input_mat where every row's + ! diagonal dominance ratio is < max_dd_ratio + ! Not guaranteed to be the optimal (ie largest) submatrix but should be close + ! This works for symmetric and asymmetric input_mat + ! Works in serial, parallel and kokkos (and hence gpus) + ! This is just a convenience wrapper around compute_cf_splitting and matcreatesubmatrixwrapper + + ! 
~~~~~~ + type(tMat), target, intent(in) :: input_mat + PetscReal, intent(in) :: max_dd_ratio + type(tMat), intent(inout) :: output_mat + + PetscErrorCode :: ierr + type(tIS) :: is_fine, is_coarse + integer :: ddc_its, max_luby_steps, algorithm, errorcode + PetscReal :: ddc_fraction + logical :: symmetric + + ! ~~~~~~ + + if (max_dd_ratio .le. 0d0 .or. max_dd_ratio .ge. 1d0) then + print *, "max_dd_ratio input to compute_diag_dom_submatrix must be (0.0, 1.0)" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + + ! Ignored for CF_DIAG_DOM + ddc_its = 0 + ! Ignored for CF_DIAG_DOM + ddc_fraction = 0.0 + ! As many steps as needed + max_luby_steps = -1 + ! PMISR DDC where strength of connection is given by + ! |a_ij| .ge. max_dd_ratio |a_ii| + algorithm = CF_DIAG_DOM + ! Assume asymmetric - still works for symmetric + symmetric = .FALSE. + + ! Call the CF splitting + ! We call with max_dd_ratio as the strong_threshold + call compute_cf_splitting(input_mat, & + symmetric, & + max_dd_ratio, max_luby_steps, & + algorithm, & + ddc_its, & + ddc_fraction, & + is_fine, is_coarse) + + call ISDestroy(is_coarse, ierr) + + ! The input_mat(is_fine, is_fine) is the diagonally dominant + ! submatrix - this is the wrapper around the kokkos gpu code + call MatCreateSubMatrixWrapper(input_mat, & + is_fine, is_fine, MAT_INITIAL_MATRIX, & + output_mat) + + call ISDestroy(is_fine, ierr) + + end subroutine compute_diag_dom_submatrix + +! 
------------------------------------------------------------------------------------------------------------------------------- + +end module matdiagdomsubmatrix + diff --git a/src/MatDiagDomk.kokkos.cxx b/src/MatDiagDomk.kokkos.cxx new file mode 100644 index 00000000..0d50c75c --- /dev/null +++ b/src/MatDiagDomk.kokkos.cxx @@ -0,0 +1,260 @@ +// Our petsc kokkos definitions - has to go first +#include "kokkos_helper.hpp" +#include +#include <../src/mat/impls/aij/seq/aij.h> +#include <../src/mat/impls/aij/mpi/mpiaij.h> + +// The definition of the device copy of the cf markers on a given level +// is stored in Device_Datak.kokkos.cxx and imported as extern from +// kokkos_helper.hpp + +//------------------------------------------------------------------------------------------------------------------------ + +// Computes the diagonal dominance ratio of the input matrix over fine points in global variable cf_markers_local_d +// This code is very similar to MatCreateSubMatrix_kokkos +PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscReal *max_dd_ratio_achieved, PetscInt *local_rows_aff) +{ + PetscInt local_rows, local_cols; + + // Are we in parallel? 
+ MatType mat_type; + MPI_Comm MPI_COMM_MATRIX; + PetscCallVoid(MatGetType(*input_mat, &mat_type)); + + const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; + PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); + PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); + + Mat_MPIAIJ *mat_mpi = nullptr; + Mat mat_local = NULL, mat_nonlocal = NULL; + + PetscInt rows_ao, cols_ao; + if (mpi) + { + mat_mpi = (Mat_MPIAIJ *)(*input_mat)->data; + PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local, &mat_nonlocal, NULL)); + PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); + } + else + { + mat_local = *input_mat; + } + + // Can't use the global directly within the parallel + // regions on the device + intKokkosView cf_markers_d = cf_markers_local_d; + intKokkosView sf_int_dummy_d("sf_int_dummy_d", 1); + intKokkosView cf_markers_nonlocal_d; + intKokkosView cf_markers_send_d; + PetscIntKokkosView is_fine_local_d; + auto exec = PetscGetKokkosExecutionSpace(); + + // ~~~~~~~~~~~~ + // Get the F point local indices from cf_markers_local_d + // ~~~~~~~~~~~~ + const int match_cf = -1; // F_POINT == -1 + create_cf_is_device_kokkos(input_mat, match_cf, is_fine_local_d); + PetscInt local_rows_row = is_fine_local_d.extent(0); + *local_rows_aff = local_rows_row; + + // Create device memory for the diag_dom_ratio + diag_dom_ratio_local_d = PetscScalarKokkosView("diag_dom_ratio_local_d", local_rows_row); + PetscScalarKokkosView diag_dom_ratio_d = diag_dom_ratio_local_d; + + // ~~~~~~~~~~~~~~~ + // Can now go and compute the diagonal dominance sums + // ~~~~~~~~~~~~~~~ + // PetscSF comms cannot be started with a pointer derived from a zero-extent Kokkos view - + // doing so causes intermittent failures in parallel on GPUs. Use a size-1 dummy view + // so that every pointer passed to PetscSF is always backed by valid device memory. 
+ int *cf_markers_nonlocal_d_ptr = NULL; + int *cf_markers_send_d_ptr = NULL; + PetscMemType mem_type = PETSC_MEMTYPE_KOKKOS; + PetscMemType mtype; + + // The off-diagonal component requires some comms which we can start now + if (mpi) + { + cf_markers_send_d = intKokkosView("cf_markers_send_d", local_rows); + // Copy cf_markers_d into a temporary buffer + // If we gave the comms routine cf_markers_d we couldn't even read from + // it until comms ended, meaning we couldn't do the work overlapping below + Kokkos::deep_copy(cf_markers_send_d, cf_markers_d); + cf_markers_send_d_ptr = local_rows > 0 ? cf_markers_send_d.data() : sf_int_dummy_d.data(); + exec.fence(); + cf_markers_nonlocal_d = intKokkosView("cf_markers_nonlocal_d", cols_ao); + cf_markers_nonlocal_d_ptr = cols_ao > 0 ? cf_markers_nonlocal_d.data() : sf_int_dummy_d.data(); + + // Start the scatter of the cf splitting - the kokkos memtype is set as PETSC_MEMTYPE_HOST or + // one of the kokkos backends like PETSC_MEMTYPE_HIP + // Be careful these aren't petscints + // PetscSF owns cf_markers_send_d_ptr as the active send buffer until End. + // Do not even read from that send buffer before End is called. + // If you alias it in overlapped GPU work, the failure shows up intermittently + // in parallel runs on GPUs. 
+ PetscCallVoid(PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT, + mem_type, cf_markers_send_d_ptr, + mem_type, cf_markers_nonlocal_d_ptr, + MPI_REPLACE)); + } + + // ~~~~~~~~~~~~~~~ + // Do the local component so work/comms are overlapped + // ~~~~~~~~~~~~~~~ + + // ~~~~~~~~~~~~ + // Get pointers to the local i,j,vals on the device + // ~~~~~~~~~~~~ + const PetscInt *device_local_i = nullptr, *device_local_j = nullptr; + PetscScalar *device_local_vals = nullptr; + PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local, &device_local_i, &device_local_j, &device_local_vals, &mtype)); + + // Have to store the diagonal entry + PetscScalarKokkosView diag_entry_d = PetscScalarKokkosView("diag_entry_d", local_rows_row); + Kokkos::deep_copy(diag_entry_d, 0); + + // Scoping to reduce peak memory + { + // We now go and do a reduce to get the diagonal entry, while also + // summing up the local non-diagonals into diag_dom_ratio_d + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, local_rows_row, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + const PetscInt i_idx_is_row = t.league_rank(); + const PetscInt i = is_fine_local_d(i_idx_is_row); + const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; + + PetscScalar sum_val = 0.0; + + // Reduce over local columns + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(t, ncols_local), + [&](const PetscInt j, PetscScalar& thread_sum) { + + // Get this local column in the input_mat + const PetscInt target_col = device_local_j[device_local_i[i] + j]; + // Is this column fine? 
F_POINT == -1 + if (cf_markers_d(target_col) == -1) + { + // Is this column the diagonal + const bool is_diagonal = i == target_col; + + // Get the abs value of the entry + PetscScalar val = Kokkos::abs(device_local_vals[device_local_i[i] + j]); + + // We have found a diagonal in this row + if (is_diagonal) { + // Will only happen for one thread + diag_entry_d(i_idx_is_row) = val; + } + else + { + thread_sum += val; + } + } + }, + Kokkos::Sum(sum_val) + ); + + // Only want one thread in the team to write the result + Kokkos::single(Kokkos::PerTeam(t), [&]() { + diag_dom_ratio_d(i_idx_is_row) = sum_val; + }); + }); + } + + // ~~~~~~~~~~~~~~~ + // Finish the comms and add the non-local entries to diag_dom_ratio_d + // before we divide by the diagonal entry + // ~~~~~~~~~~~~~~~ + + // The off-diagonal component requires some comms + // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist + if (mpi) + { + // Finish the scatter of the cf splitting + // Be careful these aren't petscints + // End releases the send snapshot for normal access again. + // The scattered cf_markers_nonlocal_d values are now safe to read. 
+ PetscCallVoid(PetscSFBcastEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_send_d_ptr, cf_markers_nonlocal_d_ptr, MPI_REPLACE)); + + // ~~~~~~~~~~~~ + // Get pointers to the nonlocal i,j,vals on the device + // ~~~~~~~~~~~~ + const PetscInt *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; + PetscScalar *device_nonlocal_vals = nullptr; + PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, &device_nonlocal_vals, &mtype)); + + // Sum up the nonlocal matching entries into diag_dom_ratio_d + if (cols_ao > 0) + { + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, local_rows_row, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + const PetscInt i_idx_is_row = t.league_rank(); + const PetscInt i = is_fine_local_d(i_idx_is_row); + const PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; + + PetscScalar sum_val = 0.0; + + // Reduce over local columns + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(t, ncols_nonlocal), + [&](const PetscInt j, PetscScalar& thread_sum) { + + // This is the non-local column we have to check is present + const PetscInt target_col = device_nonlocal_j[device_nonlocal_i[i] + j]; + // Is this column in the input IS? 
F_POINT == -1 + if (cf_markers_nonlocal_d(target_col) == -1) + { + // Get the abs value of the entry + thread_sum += Kokkos::abs(device_nonlocal_vals[device_nonlocal_i[i] + j]); + } + }, + Kokkos::Sum(sum_val) + ); + + // Only want one thread in the team to write the result + Kokkos::single(Kokkos::PerTeam(t), [&]() { + // Add into existing + diag_dom_ratio_d(i_idx_is_row) += sum_val; + }); + }); + } + } + + // ~~~~~~~~~~~~~ + // Compute the diag dominance ratio + // ~~~~~~~~~~~~~ + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows_row), KOKKOS_LAMBDA(PetscInt i) { + + // If diag_val is zero we didn't find a diagonal + if (diag_entry_d(i) != 0.0){ + // Compute the diagonal dominance ratio + diag_dom_ratio_d(i) = diag_dom_ratio_d(i) / diag_entry_d(i); + } + else{ + diag_dom_ratio_d(i) = 0.0; + } + }); + // Ensure we're done before we exit + exec.fence(); + + PetscReal max_dd_ratio_local = 0.0; + Kokkos::parallel_reduce("max_dd_ratio", local_rows_row, + KOKKOS_LAMBDA(const PetscInt i, PetscReal& thread_max) { + PetscReal dd_ratio = diag_dom_ratio_d(i); + thread_max = (dd_ratio > thread_max) ? 
dd_ratio : thread_max; + }, + Kokkos::Max(max_dd_ratio_local) + ); + + PetscCallMPIAbort(MPI_COMM_MATRIX, MPI_Allreduce(&max_dd_ratio_local, max_dd_ratio_achieved, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_MATRIX)); + + return; +} + +//------------------------------------------------------------------------------------------------------------------------ \ No newline at end of file diff --git a/src/PCAIR.c b/src/PCAIR.c index d70f5b9f..ba9bd105 100644 --- a/src/PCAIR.c +++ b/src/PCAIR.c @@ -15,8 +15,9 @@ PETSC_EXTERN void create_pc_air_data_c(void **pc_air_data); PETSC_EXTERN void create_pc_air_shell_c(void **pc_air_data, PC *pc); PETSC_EXTERN void compute_cf_splitting_c(Mat *input_mat, int symmetric_int, double strong_threshold, int max_luby_steps, int cf_splitting_type, - int ddc_its, double fraction_swap, double max_dd_ratio, + int ddc_its, double fraction_swap, IS *is_fine, IS *is_coarse); +PETSC_EXTERN void compute_diag_dom_submatrix_c(Mat *input_mat, double max_dd_ratio, Mat *output_mat); // Defined in PCAIR_C_Fortran_Bindings.F90 // External users should use the get/set routines without _c which have // PetscErrorCode defined as return type, those routines are defined below this @@ -34,7 +35,6 @@ PETSC_EXTERN void PCAIRGetProcessEqLimit_c(PC *pc, PetscInt *input_int); PETSC_EXTERN void PCAIRGetSubcomm_c(PC *pc, PetscBool *input_bool); PETSC_EXTERN void PCAIRGetStrongThreshold_c(PC *pc, PetscReal *input_real); PETSC_EXTERN void PCAIRGetDDCIts_c(PC *pc, PetscInt *input_int); -PETSC_EXTERN void PCAIRGetMaxDDRatio_c(PC *pc, PetscReal *input_real); PETSC_EXTERN void PCAIRGetDDCFraction_c(PC *pc, PetscReal *input_real); PETSC_EXTERN void PCAIRGetCFSplittingType_c(PC *pc, CFSplittingType *input_int); PETSC_EXTERN void PCAIRGetMaxLubySteps_c(PC *pc, PetscInt *input_int); @@ -84,7 +84,6 @@ PETSC_EXTERN void PCAIRSetProcessEqLimit_c(PC *pc, PetscInt input_int); PETSC_EXTERN void PCAIRSetSubcomm_c(PC *pc, PetscBool input_bool); PETSC_EXTERN void PCAIRSetStrongThreshold_c(PC 
*pc, PetscReal input_real); PETSC_EXTERN void PCAIRSetDDCIts_c(PC *pc, PetscInt input_int); -PETSC_EXTERN void PCAIRSetMaxDDRatio_c(PC *pc, PetscReal input_real); PETSC_EXTERN void PCAIRSetDDCFraction_c(PC *pc, PetscReal input_real); PETSC_EXTERN void PCAIRSetCFSplittingType_c(PC *pc, CFSplittingType input_int); PETSC_EXTERN void PCAIRSetMaxLubySteps_c(PC *pc, PetscInt input_int); @@ -196,12 +195,17 @@ PETSC_EXTERN PetscErrorCode c_PCAIRGetPCShell(PC *pc, PC *pc_air_shell) // CF splitting PETSC_EXTERN void compute_cf_splitting(Mat input_mat, int symmetric_int, double strong_threshold, int max_luby_steps, int cf_splitting_type, - int ddc_its, double fraction_swap, double max_dd_ratio, + int ddc_its, double fraction_swap, IS *is_fine, IS *is_coarse) { compute_cf_splitting_c(&input_mat, symmetric_int, strong_threshold, max_luby_steps, cf_splitting_type, ddc_its, fraction_swap, - max_dd_ratio, is_fine, is_coarse); + is_fine, is_coarse); +} + +PETSC_EXTERN void compute_diag_dom_submatrix(Mat input_mat, double max_dd_ratio, Mat *output_mat) +{ + compute_diag_dom_submatrix_c(&input_mat, max_dd_ratio, output_mat); } // Get routines @@ -280,12 +284,6 @@ PETSC_EXTERN PetscErrorCode PCAIRGetStrongThreshold(PC pc, PetscReal *input_real PCAIRGetStrongThreshold_c(&pc, input_real); PetscFunctionReturn(PETSC_SUCCESS); } -PETSC_EXTERN PetscErrorCode PCAIRGetMaxDDRatio(PC pc, PetscReal *input_real) -{ - PetscFunctionBegin; - PCAIRGetMaxDDRatio_c(&pc, input_real); - PetscFunctionReturn(PETSC_SUCCESS); -} PETSC_EXTERN PetscErrorCode PCAIRGetDDCFraction(PC pc, PetscReal *input_real) { PetscFunctionBegin; @@ -632,16 +630,6 @@ PETSC_EXTERN PetscErrorCode PCAIRSetDDCIts(PC pc, PetscInt input_int) PCAIRSetDDCIts_c(&pc, input_int); PetscFunctionReturn(PETSC_SUCCESS); } -// If using CF splitting type pmisr_ddc, do as many DDC iterations as necessary to -// hit this diagonal dominance ratio. 
If 0.0 do the number in -pc_air_ddc_its -// Default: 0.0 -// -pc_air_max_dd_ratio -PETSC_EXTERN PetscErrorCode PCAIRSetMaxDDRatio(PC pc, PetscReal input_real) -{ - PetscFunctionBegin; - PCAIRSetMaxDDRatio_c(&pc, input_real); - PetscFunctionReturn(PETSC_SUCCESS); -} // Second pass in the PMISR DDC CF splitting converts // this fraction of local F points to C based on diagonal dominance // Default: 0.1 @@ -1141,11 +1129,6 @@ static PetscErrorCode PCSetFromOptions_AIR_c(PC pc, PetscOptionItems PetscOption PetscCall(PetscOptionsReal("-pc_air_strong_threshold", "Strong threshold for CF splitting", "PCAIRSetStrongThreshold", old_real, &input_real, NULL)); PetscCall(PCAIRSetStrongThreshold(pc, input_real)); // ~~~~ - PetscCall(PCAIRGetMaxDDRatio(pc, &old_real)); - input_real = old_real; - PetscCall(PetscOptionsReal("-pc_air_max_dd_ratio", "Max DDC ratio for CF splitting", "PCAIRGetMaxDDRatio", old_real, &input_real, NULL)); - PetscCall(PCAIRSetMaxDDRatio(pc, input_real)); - // ~~~~ PetscCall(PCAIRGetDDCFraction(pc, &old_real)); input_real = old_real; PetscCall(PetscOptionsReal("-pc_air_ddc_fraction", "DDC fraction for CF splitting", "PCAIRGetDDCFraction", old_real, &input_real, NULL)); @@ -1166,7 +1149,7 @@ static PetscErrorCode PCSetFromOptions_AIR_c(PC pc, PetscOptionItems PetscOption PetscCall(PetscOptionsReal("-pc_air_a_drop", "Drop tolerance for A", "PCAIRSetADrop", old_real, &input_real, NULL)); PetscCall(PCAIRSetADrop(pc, input_real)); // ~~~~ - const char *const CFSplittingTypes[] = {"PMISR_DDC", "PMIS", "PMIS_DIST2", "AGG", "PMIS_AGG", "CFSplittingType", "CF_", NULL}; + const char *const CFSplittingTypes[] = {"PMISR_DDC", "DIAG_DOM", "PMIS", "PMIS_DIST2", "AGG", "PMIS_AGG", "CFSplittingType", "CF_", NULL}; PetscCall(PCAIRGetCFSplittingType(pc, &old_cf_type)); cf_type = old_cf_type; PetscCall(PetscOptionsEnum("-pc_air_cf_splitting_type", "CF splitting algorithm", "PCAIRSetCFSplittingType", CFSplittingTypes, (PetscEnum)old_cf_type, (PetscEnum *)&cf_type, &flg)); @@ 
-1297,7 +1280,7 @@ static PetscErrorCode PCView_AIR_c(PC pc, PetscViewer viewer) PetscInt input_int, input_int_two, input_int_three, input_int_four; PetscBool flg, flg_f_smooth, flg_c_smooth, flg_diag_scale; - PetscReal input_real, input_real_two, input_real_three; + PetscReal input_real, input_real_two; PCPFLAREINVType input_type; PCAIRZType z_type; CFSplittingType cf_type; @@ -1346,23 +1329,20 @@ static PetscErrorCode PCView_AIR_c(PC pc, PetscViewer viewer) PetscCall(PCAIRGetCFSplittingType(pc, &cf_type)); PetscCall(PCAIRGetStrongThreshold(pc, &input_real)); PetscCall(PCAIRGetDDCIts(pc, &input_int_three)); - PetscCall(PCAIRGetMaxDDRatio(pc, &input_real_three)); PetscCall(PCAIRGetDDCFraction(pc, &input_real_two)); PetscCall(PCAIRGetMaxLubySteps(pc, &input_int_two)); if (cf_type == CF_PMISR_DDC) { PetscCall(PetscViewerASCIIPrintf(viewer, " CF splitting algorithm=PMISR_DDC \n")); - if (input_real_three == 0.0) - { - PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " Luby steps \n Strong threshold=%f, DDC its=%" PetscInt_FMT ", DDC fraction=%f \n", \ - input_int_two, input_real, input_int_three, input_real_two)); - } - else - { - PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " Luby steps \n Strong threshold=%f, Max DD Ratio=%f, DDC fraction=%f \n", \ - input_int_two, input_real, input_real_three, input_real_two)); - } + PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " Luby steps \n Strong threshold=%f, DDC its=%" PetscInt_FMT ", DDC fraction=%f \n", \ + input_int_two, input_real, input_int_three, input_real_two)); } + else if (cf_type == CF_DIAG_DOM) + { + PetscCall(PetscViewerASCIIPrintf(viewer, " CF splitting algorithm=DIAG_DOM \n")); + PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " Luby steps \n Diagonal dominance target (strong threshold)=%f \n", \ + input_int_two, input_real)); + } else if (cf_type == CF_PMIS) { PetscCall(PetscViewerASCIIPrintf(viewer, " CF splitting algorithm=PMIS \n")); diff --git 
a/src/PCAIR_C_Fortran_Bindings.F90 b/src/PCAIR_C_Fortran_Bindings.F90 index 925c347a..76c350ea 100644 --- a/src/PCAIR_C_Fortran_Bindings.F90 +++ b/src/PCAIR_C_Fortran_Bindings.F90 @@ -250,23 +250,6 @@ subroutine PCAIRGetDDCIts_c(pc_ptr, its) bind(C, name='PCAIRGetDDCIts_c') end subroutine PCAIRGetDDCIts_c -! ------------------------------------------------------------------------------------------------------------------------------- - - subroutine PCAIRGetMaxDDRatio_c(pc_ptr, ratio) bind(C, name='PCAIRGetMaxDDRatio_c') - - ! ~~~~~~~~ - integer(c_long_long), intent(inout) :: pc_ptr - real(PFLARE_PETSCREAL_C_KIND), intent(out) :: ratio - - type(tPC) :: pc - PetscErrorCode :: ierr - ! ~~~~~~~~ - - pc%v = pc_ptr - call PCAIRGetMaxDDRatio(pc, ratio, ierr) - - end subroutine PCAIRGetMaxDDRatio_c - ! ------------------------------------------------------------------------------------------------------------------------------- subroutine PCAIRGetDDCFraction_c(pc_ptr, frac) bind(C, name='PCAIRGetDDCFraction_c') @@ -1149,23 +1132,6 @@ subroutine PCAIRSetDDCIts_c(pc_ptr, its) bind(C, name='PCAIRSetDDCIts_c') end subroutine PCAIRSetDDCIts_c -! ------------------------------------------------------------------------------------------------------------------------------- - - subroutine PCAIRSetMaxDDRatio_c(pc_ptr, ratio) bind(C, name='PCAIRSetMaxDDRatio_c') - - ! ~~~~~~~~ - integer(c_long_long), intent(inout) :: pc_ptr - real(PFLARE_PETSCREAL_C_KIND), value, intent(in) :: ratio - - type(tPC) :: pc - PetscErrorCode :: ierr - ! ~~~~~~~~ - - pc%v = pc_ptr - call PCAIRSetMaxDDRatio(pc, ratio, ierr) - - end subroutine PCAIRSetMaxDDRatio_c - ! 
------------------------------------------------------------------------------------------------------------------------------- subroutine PCAIRSetDDCFraction_c(pc_ptr, frac) bind(C, name='PCAIRSetDDCFraction_c') diff --git a/src/PCAIR_Interfaces.F90 b/src/PCAIR_Interfaces.F90 index 3f8c12e5..4c3470c8 100644 --- a/src/PCAIR_Interfaces.F90 +++ b/src/PCAIR_Interfaces.F90 @@ -606,25 +606,6 @@ subroutine PCAIRGetDDCIts(pc, its, ierr) end subroutine PCAIRGetDDCIts -! ------------------------------------------------------------------------------------------------------------------------------- - - subroutine PCAIRGetMaxDDRatio(pc, ratio, ierr) - - ! ~~~~~~~~ - type(tPC), intent(inout) :: pc - PetscReal, intent(out) :: ratio - PetscErrorCode, intent(out) :: ierr - - type(air_options), pointer :: options - ! ~~~~~~~~ - - ! Get the options - call PCAIRGetOptions(pc, options) - ratio = options%max_dd_ratio - ierr = 0 - - end subroutine PCAIRGetMaxDDRatio - ! ------------------------------------------------------------------------------------------------------------------------------- subroutine PCAIRGetDDCFraction(pc, frac, ierr) @@ -1616,37 +1597,6 @@ subroutine PCAIRSetDDCIts(pc, its, ierr) end subroutine PCAIRSetDDCIts -! ------------------------------------------------------------------------------------------------------------------------------- - - subroutine PCAIRSetMaxDDRatio(pc, ratio, ierr) - - ! ~~~~~~~~ - type(tPC), intent(inout) :: pc - PetscReal, intent(in) :: ratio - PetscErrorCode, intent(out) :: ierr - - type(air_options), pointer :: options - type(tPC) :: pc_shell - PetscReal :: old_real - ! ~~~~~~~~ - - call PCAIRGetMaxDDRatio(pc, old_real, ierr) - if (old_real == ratio) then - ierr = 0 - return - end if - - ! 
Set the options - call PCAIRGetOptionsAndShell(pc, options, pc_shell) - call PCReset(pc_shell, ierr) - call PCReset_AIR_Shell(pc_shell, ierr) - call PCMarkNotSetUp_c(pc%v) - - options%max_dd_ratio = ratio - ierr = 0 - - end subroutine PCAIRSetMaxDDRatio - ! ------------------------------------------------------------------------------------------------------------------------------- subroutine PCAIRSetDDCFraction(pc, frac, ierr) diff --git a/src/PETSc_Helper.F90 b/src/PETSc_Helper.F90 index 82e522d5..fd34786f 100644 --- a/src/PETSc_Helper.F90 +++ b/src/PETSc_Helper.F90 @@ -102,7 +102,8 @@ subroutine destroy_matrix_reuse(mat, submatrices) !------------------------------------------------------------------------------------------------------------------------ - subroutine remove_small_from_sparse(input_mat, tol, output_mat, relative_max_row_tol_int, lump, drop_diagonal_int) + subroutine remove_small_from_sparse(input_mat, tol, output_mat, relative_max_row_tol_int, & + lump, drop_diagonal_int, diag_strength_int) ! 
Wrapper around remove_small_from_sparse_cpu and remove_small_from_sparse_kokkos @@ -112,11 +113,11 @@ subroutine remove_small_from_sparse(input_mat, tol, output_mat, relative_max_row type(tMat), intent(inout) :: output_mat PetscReal, intent(in) :: tol logical, intent(in), optional :: lump - integer, intent(in), optional :: relative_max_row_tol_int, drop_diagonal_int + integer, intent(in), optional :: relative_max_row_tol_int, drop_diagonal_int, diag_strength_int #if defined(PETSC_HAVE_KOKKOS) integer(c_long_long) :: A_array, B_array - integer :: lump_int, allow_drop_diagonal_int, rel_max_row_tol_int, errorcode + integer :: lump_int, allow_drop_diagonal_int, allow_diag_strength_int, rel_max_row_tol_int, errorcode PetscErrorCode :: ierr MatType :: mat_type Mat :: temp_mat @@ -147,11 +148,18 @@ subroutine remove_small_from_sparse(input_mat, tol, output_mat, relative_max_row allow_drop_diagonal_int = 0 if (present(drop_diagonal_int)) then allow_drop_diagonal_int = drop_diagonal_int - end if + end if + ! Whether we use a strength of connection defined by |a_ij| .ge. tol * |a_ii| + allow_diag_strength_int = 0 + if (present(diag_strength_int)) then + allow_diag_strength_int = diag_strength_int + end if A_array = input_mat%v call remove_small_from_sparse_kokkos(A_array, tol, & - B_array, rel_max_row_tol_int, lump_int, allow_drop_diagonal_int) + B_array, rel_max_row_tol_int, & + lump_int, allow_drop_diagonal_int, & + allow_diag_strength_int) output_mat%v = B_array ! If debugging do a comparison between CPU and Kokkos results @@ -159,7 +167,7 @@ subroutine remove_small_from_sparse(input_mat, tol, output_mat, relative_max_row ! Debug check if the CPU and Kokkos versions are the same call remove_small_from_sparse_cpu(input_mat, tol, temp_mat, relative_max_row_tol_int, & - lump, drop_diagonal_int) + lump, drop_diagonal_int, diag_strength_int) call MatAXPY(temp_mat, -1d0, output_mat, DIFFERENT_NONZERO_PATTERN, ierr) ! 
Find the biggest entry in the difference @@ -180,12 +188,12 @@ subroutine remove_small_from_sparse(input_mat, tol, output_mat, relative_max_row else call remove_small_from_sparse_cpu(input_mat, tol, output_mat, relative_max_row_tol_int, & - lump, drop_diagonal_int) + lump, drop_diagonal_int, diag_strength_int) end if #else call remove_small_from_sparse_cpu(input_mat, tol, output_mat, relative_max_row_tol_int, & - lump, drop_diagonal_int) + lump, drop_diagonal_int, diag_strength_int) #endif @@ -193,7 +201,8 @@ end subroutine remove_small_from_sparse !------------------------------------------------------------------------------------------------------------------------ - subroutine remove_small_from_sparse_cpu(input_mat, tol, output_mat, relative_max_row_tol_int, lump, drop_diagonal_int) + subroutine remove_small_from_sparse_cpu(input_mat, tol, output_mat, relative_max_row_tol_int, & + lump, drop_diagonal_int, diag_strength_int) ! Returns a copy of a sparse matrix with entries below abs(val) < tol removed ! 
If rel_max_row_tol_int is 1, then the tol is taken to be a relative scaling @@ -206,11 +215,11 @@ subroutine remove_small_from_sparse_cpu(input_mat, tol, output_mat, relative_max type(tMat), intent(inout) :: output_mat PetscReal, intent(in) :: tol logical, intent(in), optional :: lump - integer, intent(in), optional :: drop_diagonal_int, relative_max_row_tol_int + integer, intent(in), optional :: drop_diagonal_int, relative_max_row_tol_int, diag_strength_int PetscInt :: col, ncols, ifree, max_nnzs PetscInt :: local_rows, local_cols, global_rows, global_cols, global_row_start - PetscInt :: global_row_end_plus_one, max_nnzs_total + PetscInt :: global_row_end_plus_one, max_nnzs_total, diag_index PetscCount :: counter PetscErrorCode :: ierr PetscInt, dimension(:), pointer :: cols => null() @@ -219,7 +228,7 @@ subroutine remove_small_from_sparse_cpu(input_mat, tol, output_mat, relative_max PetscReal, allocatable, dimension(:) :: v PetscInt, parameter :: nz_ignore = -1, one=1, zero=0 logical :: lump_entries - integer :: drop_diag_int, errorcode, rel_max_row_tol_int + integer :: drop_diag_int, diag_stren_int, errorcode, rel_max_row_tol_int PetscReal :: rel_row_tol MPIU_Comm :: MPI_COMM_MATRIX MatType:: mat_type @@ -236,8 +245,10 @@ subroutine remove_small_from_sparse_cpu(input_mat, tol, output_mat, relative_max ! -1 - Always drop diagonal ! Never drop the diagonal by default drop_diag_int = 0 + diag_stren_int = 0 if (present(lump)) lump_entries = lump if (present(drop_diagonal_int)) drop_diag_int = drop_diagonal_int + if (present(diag_strength_int)) diag_stren_int = diag_strength_int rel_row_tol = tol ! 1 - Relative row tolerance (including diagonal) ! 0 - Absolute tolerance @@ -303,24 +314,58 @@ subroutine remove_small_from_sparse_cpu(input_mat, tol, output_mat, relative_max ! Copy in all the values v(counter:counter + ncols - 1) = vals(1:ncols) + ! 
Find where the diagonal is + diag_index = -1 + if (diag_stren_int == 1) then + do col = 1, ncols + if (cols(col) == ifree) then + diag_index = col + end if + end do + end if + ! If we want a relative row tolerance if (rel_max_row_tol_int /= 0) then ! Include the diagonal in the relative row tolerance if (rel_max_row_tol_int == 1) then - rel_row_tol = tol * maxval(abs(vals(1:ncols))) + if (diag_stren_int == 1) then + ! If there is a zero diagonal + if (diag_index == -1) then + rel_row_tol = 0d0 + else + ! We are measuring relative to the strength of the diagonal + rel_row_tol = tol * abs(vals(diag_index)) + end if + else + rel_row_tol = tol * maxval(abs(vals(1:ncols))) + end if ! Don't include the diagonal in the relative row tolerance else if (rel_max_row_tol_int == -1) then - ! Be careful here to use huge(0d0) rather than huge(0)! - abs_biggest_entry = -huge(0d0) - ! Find the biggest entry in the row thats not the diagonal - do col = 1, ncols - if (cols(col) /= ifree .AND. abs(vals(col)) > abs_biggest_entry) then - abs_biggest_entry = abs(vals(col)) - end if - end do - rel_row_tol = tol * abs_biggest_entry + ! If the user has specified they don't want to include the diagonal in the + ! relative row tolerance, but also have said they want to measure + ! everything by the strength of the diagonal, it doesn't make + ! sense to exclude the diagonal! + if (diag_stren_int == 1) then + ! If there is a zero diagonal + if (diag_index == -1) then + rel_row_tol = 0d0 + else + ! We are measuring relative to the strength of the diagonal + rel_row_tol = tol * abs(vals(diag_index)) + end if + else + ! Be careful here to use huge(0d0) rather than huge(0)! + abs_biggest_entry = -huge(0d0) + ! Find the biggest entry in the row thats not the diagonal + do col = 1, ncols + if (cols(col) /= ifree .AND. 
abs(vals(col)) > abs_biggest_entry) then + abs_biggest_entry = abs(vals(col)) + end if + end do + rel_row_tol = tol * abs_biggest_entry + end if end if end if diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index d3af6122..5d4ae35b 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -87,7 +87,8 @@ PETSC_INTERN void rewrite_j_global_to_local(PetscInt colmap_max_size, PetscInt & // Drop according to a tolerance but with kokkos - keeping everything on the device PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscReal tol, Mat *output_mat, \ - const int relative_max_row_tolerance_int, const int lump_int, const int allow_drop_diagonal_int) + const int relative_max_row_tolerance_int, const int lump_int, \ + const int allow_drop_diagonal_int, const int allow_diag_strength_int) { MPI_Comm MPI_COMM_MATRIX; PetscInt local_rows, local_cols, global_rows, global_cols; @@ -180,56 +181,102 @@ PETSC_INTERN void remove_small_from_sparse_kokkos(Mat *input_mat, const PetscRea const PetscInt i = t.league_rank(); const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; - PetscScalar max_val = -1.0; const PetscInt row_index_global = i + global_row_start; - // Reduce over local columns - Kokkos::parallel_reduce( - Kokkos::TeamVectorRange(t, ncols_local), - [&](const PetscInt j, PetscScalar& thread_max) { + // If we're measuring relative to the diagonal strength, we need to find the diagonal entry first + if (allow_diag_strength_int) { + PetscScalar diag_val_abs = -1.0; - // Is this column the diagonal - const bool is_diagonal = (device_local_j[device_local_i[i] + j] + global_col_start == row_index_global); + // Find the diagonal magnitude in the local block, if present. 
+ Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(t, ncols_local), + [&](const PetscInt j, PetscScalar &thread_diag_abs) { + const bool is_diagonal = (device_local_j[device_local_i[i] + j] + global_col_start == row_index_global); + if (is_diagonal) { + const PetscScalar val = Kokkos::abs(device_local_vals[device_local_i[i] + j]); + if (val > thread_diag_abs) thread_diag_abs = val; + } + }, + Kokkos::Max(diag_val_abs) + ); - // If our current tolerance is bigger than the max value we've seen so far - PetscScalar val = Kokkos::abs(device_local_vals[device_local_i[i] + j]); - // If we're not comparing against the diagonal when computing relative residual - if (not_include_diag && is_diagonal) val = -1.0; - if (val > thread_max) thread_max = val; + if (mpi) { + const PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; + PetscScalar diag_val_abs_nonlocal = -1.0; + + // Diagonal can be in the off-diagonal block for rectangular distributions. + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(t, ncols_nonlocal), + [&](const PetscInt j, PetscScalar &thread_diag_abs) { + const bool is_diagonal = (colmap_input_d(device_nonlocal_j[device_nonlocal_i[i] + j]) == row_index_global); + if (is_diagonal) { + const PetscScalar val = Kokkos::abs(device_nonlocal_vals[device_nonlocal_i[i] + j]); + if (val > thread_diag_abs) thread_diag_abs = val; + } + }, + Kokkos::Max(diag_val_abs_nonlocal) + ); + + if (diag_val_abs_nonlocal > diag_val_abs) diag_val_abs = diag_val_abs_nonlocal; + } - }, - Kokkos::Max(max_val) - ); + Kokkos::single(Kokkos::PerTeam(t), [&]() { + // If there is no explicit diagonal entry, use a zero threshold to avoid over-dropping. + rel_row_tol_d(i) = (diag_val_abs >= 0.0) ? 
tol * diag_val_abs : 0.0; + }); - if (mpi) { - PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; - PetscScalar max_val_nonlocal = -1.0; - - // Reduce over nonlocal columns + // If instead we're measuring relative to the max value in the row that isn't the diagonal + } else { + PetscScalar max_val = -1.0; + + // Reduce over local columns Kokkos::parallel_reduce( - Kokkos::TeamVectorRange(t, ncols_nonlocal), + Kokkos::TeamVectorRange(t, ncols_local), [&](const PetscInt j, PetscScalar& thread_max) { // Is this column the diagonal - const bool is_diagonal = (colmap_input_d(device_nonlocal_j[device_nonlocal_i[i] + j]) == row_index_global); + const bool is_diagonal = (device_local_j[device_local_i[i] + j] + global_col_start == row_index_global); // If our current tolerance is bigger than the max value we've seen so far - PetscScalar val = Kokkos::abs(device_nonlocal_vals[device_nonlocal_i[i] + j]); + PetscScalar val = Kokkos::abs(device_local_vals[device_local_i[i] + j]); // If we're not comparing against the diagonal when computing relative residual - if (not_include_diag && is_diagonal) val = -1.0; + if (not_include_diag && is_diagonal) val = -1.0; if (val > thread_max) thread_max = val; }, - Kokkos::Max(max_val_nonlocal) + Kokkos::Max(max_val) ); - // Take max of local and nonlocal - if (max_val_nonlocal > max_val) max_val = max_val_nonlocal; - } - // Only want one thread in the team to write the result - Kokkos::single(Kokkos::PerTeam(t), [&]() { - rel_row_tol_d(i) = tol * max_val; - }); + if (mpi) { + PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; + PetscScalar max_val_nonlocal = -1.0; + + // Reduce over nonlocal columns + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(t, ncols_nonlocal), + [&](const PetscInt j, PetscScalar& thread_max) { + + // Is this column the diagonal + const bool is_diagonal = (colmap_input_d(device_nonlocal_j[device_nonlocal_i[i] + j]) == row_index_global); + + // If our current 
tolerance is bigger than the max value we've seen so far + PetscScalar val = Kokkos::abs(device_nonlocal_vals[device_nonlocal_i[i] + j]); + // If we're not comparing against the diagonal when computing relative residual + if (not_include_diag && is_diagonal) val = -1.0; + if (val > thread_max) thread_max = val; + + }, + Kokkos::Max(max_val_nonlocal) + ); + // Take max of local and nonlocal + if (max_val_nonlocal > max_val) max_val = max_val_nonlocal; + } + + // Only want one thread in the team to write the result + Kokkos::single(Kokkos::PerTeam(t), [&]() { + rel_row_tol_d(i) = tol * max_val; + }); + } }); } // If we're using a constant tolerance, we can just copy it in diff --git a/src/PMISR_DDC.F90 b/src/PMISR_DDC.F90 deleted file mode 100644 index 29d4bd17..00000000 --- a/src/PMISR_DDC.F90 +++ /dev/null @@ -1,858 +0,0 @@ -module pmisr_ddc - - use iso_c_binding - use petscmat - use petsc_helper, only: kokkos_debug - use c_petsc_interfaces, only: pmisr_kokkos, copy_cf_markers_d2h, & - vecscatter_mat_begin_c, vecscatter_mat_end_c, vecscatter_mat_restore_c, & - allreducesum_petscint_mine, boolscatter_mat_begin_c, boolscatter_mat_end_c, & - boolscatter_mat_reverse_begin_c, boolscatter_mat_reverse_end_c, ddc_kokkos - use pflare_parameters, only: C_POINT, F_POINT - -#include "petsc/finclude/petscmat.h" -#include "finclude/PETSc_ISO_Types.h" - - implicit none - - public - - contains - - -! ------------------------------------------------------------------------------------------------------------------------------- - - subroutine pmisr(strength_mat, max_luby_steps, pmis, cf_markers_local, zero_measure_c_point) - - ! Wrapper - - ! 
~~~~~~ - - type(tMat), target, intent(in) :: strength_mat - integer, intent(in) :: max_luby_steps - logical, intent(in) :: pmis - integer, dimension(:), allocatable, target, intent(inout) :: cf_markers_local - logical, optional, intent(in) :: zero_measure_c_point - -#if defined(PETSC_HAVE_KOKKOS) - integer(c_long_long) :: A_array - PetscErrorCode :: ierr - MatType :: mat_type - integer :: pmis_int, zero_measure_c_point_int, seed_size, kfree, comm_rank, errorcode - integer, dimension(:), allocatable :: seed - PetscReal, dimension(:), allocatable, target :: measure_local - PetscInt :: local_rows, local_cols - MPIU_Comm :: MPI_COMM_MATRIX - type(c_ptr) :: measure_local_ptr, cf_markers_local_ptr - integer, dimension(:), allocatable :: cf_markers_local_two -#endif - ! ~~~~~~~~~~ - -#if defined(PETSC_HAVE_KOKKOS) - - call MatGetType(strength_mat, mat_type, ierr) - if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & - mat_type == MATAIJKOKKOS) then - - call PetscObjectGetComm(strength_mat, MPI_COMM_MATRIX, ierr) - call MPI_Comm_rank(MPI_COMM_MATRIX, comm_rank, errorcode) - - A_array = strength_mat%v - pmis_int = 0 - if (pmis) pmis_int = 1 - zero_measure_c_point_int = 0 - if (present(zero_measure_c_point)) then - if (zero_measure_c_point) zero_measure_c_point_int = 1 - end if - - ! Let's generate the random values on the host for now so they match - ! for comparisons with pmisr_cpu - call MatGetLocalSize(strength_mat, local_rows, local_cols, ierr) - allocate(measure_local(local_rows)) - call random_seed(size=seed_size) - allocate(seed(seed_size)) - do kfree = 1, seed_size - seed(kfree) = comm_rank + 1 + kfree - end do - call random_seed(put=seed) - ! Fill the measure with random numbers - call random_number(measure_local) - deallocate(seed) - - measure_local_ptr = c_loc(measure_local) - - allocate(cf_markers_local(local_rows)) - cf_markers_local_ptr = c_loc(cf_markers_local) - - ! 
Creates a cf_markers on the device - call pmisr_kokkos(A_array, max_luby_steps, pmis_int, measure_local_ptr, zero_measure_c_point_int) - - ! If debugging do a comparison between CPU and Kokkos results - if (kokkos_debug()) then - - ! Kokkos PMISR by default now doesn't copy back to the host, as any following ddc calls - ! use the device data - call copy_cf_markers_d2h(cf_markers_local_ptr) - call pmisr_cpu(strength_mat, max_luby_steps, pmis, cf_markers_local_two, zero_measure_c_point) - - if (any(cf_markers_local /= cf_markers_local_two)) then - - ! do kfree = 1, local_rows - ! if (cf_markers_local(kfree) /= cf_markers_local_two(kfree)) then - ! print *, kfree, "no match", cf_markers_local(kfree), cf_markers_local_two(kfree) - ! end if - ! end do - print *, "Kokkos and CPU versions of pmisr do not match" - call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) - end if - deallocate(cf_markers_local_two) - end if - - else - call pmisr_cpu(strength_mat, max_luby_steps, pmis, cf_markers_local, zero_measure_c_point) - end if -#else - call pmisr_cpu(strength_mat, max_luby_steps, pmis, cf_markers_local, zero_measure_c_point) -#endif - - ! ~~~~~~ - - end subroutine pmisr - -! ------------------------------------------------------------------------------------------------------------------------------- - - subroutine pmisr_cpu(strength_mat, max_luby_steps, pmis, cf_markers_local, zero_measure_c_point) - - ! Let's do our own independent set with a Luby algorithm - ! If PMIS is true, this is a traditional PMIS algorithm - ! If PMIS is false, this is a PMISR - ! PMISR swaps the C-F definition compared to a PMIS and - ! also checks the measure from smallest, rather than the largest - ! PMISR should give an Aff with no off-diagonal strong connections - ! If you set positive max_luby_steps, it will avoid all parallel reductions - ! by taking a fixed number of times in the Luby top loop - - ! 
~~~~~~ - - type(tMat), target, intent(in) :: strength_mat - integer, intent(in) :: max_luby_steps - logical, intent(in) :: pmis - integer, dimension(:), allocatable, intent(inout) :: cf_markers_local - logical, optional, intent(in) :: zero_measure_c_point - - ! Local - PetscInt :: local_rows, local_cols, global_rows, global_cols - PetscInt :: global_row_start, global_row_end_plus_one, ifree, ncols - PetscInt :: jfree - PetscInt :: rows_ao, cols_ao, n_ad, n_ao - PetscInt :: counter_undecided, counter_in_set_start, counter_parallel - integer :: comm_size, comm_size_world, loops_through, seed_size - integer :: comm_rank, errorcode - integer :: kfree - PetscErrorCode :: ierr - MPIU_Comm :: MPI_COMM_MATRIX - integer, dimension(:), allocatable :: seed - PetscReal, dimension(:), allocatable :: measure_local - PFLARE_PETSCBOOL_C_TYPE, dimension(:), allocatable :: in_set_this_loop - PFLARE_PETSCBOOL_C_TYPE, dimension(:), allocatable, target :: assigned_local, assigned_nonlocal - type(c_ptr) :: measure_nonlocal_ptr=c_null_ptr, assigned_local_ptr=c_null_ptr, assigned_nonlocal_ptr=c_null_ptr - real(c_double), pointer :: measure_nonlocal(:) => null() - type(tMat) :: Ad, Ao - type(tVec) :: measure_vec - PetscInt, dimension(:), pointer :: colmap - integer(c_long_long) :: A_array, vec_long - PetscInt, dimension(:), pointer :: ad_ia, ad_ja, ao_ia, ao_ja - PetscInt :: shift = 0 - PetscBool :: symmetric = PETSC_FALSE, inodecompressed = PETSC_FALSE, done - logical :: zero_measure_c = .FALSE. - PetscInt, parameter :: nz_ignore = -1, one=1, zero=0 - - ! ~~~~~~ - - if (present(zero_measure_c_point)) zero_measure_c = zero_measure_c_point - - ! Get the comm size - call PetscObjectGetComm(strength_mat, MPI_COMM_MATRIX, ierr) - call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) - call MPI_Comm_size(MPI_COMM_WORLD, comm_size_world, errorcode) - ! Get the comm rank - call MPI_Comm_rank(MPI_COMM_MATRIX, comm_rank, errorcode) - - ! 
Get the local sizes - call MatGetLocalSize(strength_mat, local_rows, local_cols, ierr) - call MatGetSize(strength_mat, global_rows, global_cols, ierr) - call MatGetOwnershipRange(strength_mat, global_row_start, global_row_end_plus_one, ierr) - - if (comm_size /= 1) then - call MatMPIAIJGetSeqAIJ(strength_mat, Ad, Ao, colmap, ierr) - ! We know the col size of Ao is the size of colmap, the number of non-zero offprocessor columns - call MatGetSize(Ao, rows_ao, cols_ao, ierr) - else - Ad = strength_mat - end if - - ! ~~~~~~~~ - ! Get pointers to the sequential diagonal and off diagonal aij structures - ! ~~~~~~~~ - call MatGetRowIJ(Ad,shift,symmetric,inodecompressed,n_ad,ad_ia,ad_ja,done,ierr) - if (.NOT. done) then - print *, "Pointers not set in call to MatGetRowIJ" - call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) - end if - if (comm_size /= 1) then - call MatGetRowIJ(Ao,shift,symmetric,inodecompressed,n_ao,ao_ia,ao_ja,done,ierr) - if (.NOT. done) then - print *, "Pointers not set in call to MatGetRowIJ" - call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) - end if - end if - ! ~~~~~~~~~~ - - ! Get the number of connections in S - allocate(measure_local(local_rows)) - allocate(cf_markers_local(local_rows)) - cf_markers_local = C_POINT - allocate(in_set_this_loop(local_rows)) - allocate(assigned_local(local_rows)) - - ! ~~~~~~~~~~~~ - ! Seed the measure_local between 0 and 1 - ! ~~~~~~~~~~~~ - call random_seed(size=seed_size) - allocate(seed(seed_size)) - do kfree = 1, seed_size - seed(kfree) = comm_rank + 1 + kfree - end do - call random_seed(put=seed) - - ! To get the same results regardless of number of processors, you can - ! force the random number on each node to match across all processors - ! This is tricky to do, given the numbering of rows is different in parallel - ! I did code up a version that used the unique spatial node positions to seed the random - ! number generator and test that and it works the same regardless of num of procs - ! 
so I'm fairly confident things are correct - - ! Fill the measure with random numbers - call random_number(measure_local) - deallocate(seed) - - ! ~~~~~~~~~~ - - ! ~~~~~~~~~~~~ - ! Add the number of connections in S to the randomly seeded measure_local - ! The number of connections is just equal to a matvec with a vec of all ones and the strength_mat - ! We don't have to bother with a matvec though as we know the strenth_mat has entries of one - ! ~~~~~~~~~~~~ - do ifree = 1, local_rows - - ! Do local component - ncols = ad_ia(ifree+1) - ad_ia(ifree) - measure_local(ifree) = measure_local(ifree) + ncols - - ! Do non local component - if (comm_size /= 1) then - ncols = ao_ia(ifree+1) - ao_ia(ifree) - measure_local(ifree) = measure_local(ifree) + ncols - end if - end do - - ! If PMIS then we want to search the measure based on the largest entry - ! PMISR searches the measure based on the smallest entry - ! We just let the measure be negative rather than change the .ge. comparison - ! in our Luby below - if (pmis) measure_local = measure_local * (-1) - - ! ~~~~~~~~~~~~ - ! Create parallel vec and scatter the measure - ! ~~~~~~~~~~~~ - if (comm_size/=1) then - - ! This is fine being mpi type specifically as strength_mat is always a mataij - call VecCreateMPIWithArray(MPI_COMM_MATRIX, one, & - local_rows, global_rows, measure_local, measure_vec, ierr) - - A_array = strength_mat%v - vec_long = measure_vec%v - ! We're just going to use the existing lvec to scatter the measure - ! Have to call restore after we're done with lvec (ie measure_nonlocal_ptr) - call vecscatter_mat_begin_c(A_array, vec_long, measure_nonlocal_ptr) - call vecscatter_mat_end_c(A_array, vec_long, measure_nonlocal_ptr) - ! This is the lvec so we have to make sure we don't do a matvec anywhere - ! 
before calling restore - call c_f_pointer(measure_nonlocal_ptr, measure_nonlocal, shape=[cols_ao]) - - allocate(assigned_nonlocal(cols_ao)) - assigned_local_ptr = c_loc(assigned_local) - assigned_nonlocal_ptr = c_loc(assigned_nonlocal) - else - ! Need to avoid uninitialised warning - allocate(assigned_nonlocal(0)) - end if - - ! ~~~~~~~~~~~~ - ! Initialise the set - ! ~~~~~~~~~~~~ - counter_in_set_start = 0 - assigned_local = .FALSE. - assigned_nonlocal = .FALSE. - - do ifree = 1, local_rows - - ! If there are no strong neighbours (not measure_local == 0 as we have added a random number to it) - ! then we treat it special - ! Absolute value here given measure_local could be negative (pmis) or positive (pmisr) - if (abs(measure_local(ifree)) < 1) then - - ! Assign this node - assigned_local(ifree) = .TRUE. - - ! This is typically enabled in a second pass of PMIS just on C points - ! (ie aggressive coarsening based on MIS(MIS(1))), we want to keep - ! C-points with no other strong C connections as C points - if (zero_measure_c) then - if (pmis) then - ! Set as F here but reversed below to become C - cf_markers_local(ifree) = F_POINT - else - ! Becomes C - cf_markers_local(ifree) = C_POINT - end if - else - if (pmis) then - ! Set as C here but reversed below to become F - ! Otherwise dirichlet conditions persist down onto the coarsest grid - cf_markers_local(ifree) = C_POINT - else - ! Becomes F - cf_markers_local(ifree) = F_POINT - end if - end if - counter_in_set_start = counter_in_set_start + 1 - end if - end do - - ! Check the total number of undecided in parallel - if (max_luby_steps < 0) then - counter_undecided = local_rows - counter_in_set_start - ! Parallel reduction! - ! This is just an allreduce sum, but we can't use MPIU_INTEGER, as if we call the pmisr - ! cf splitting from C it is not defined - also have to pass the matrix so we can get the comm - ! 
given they're different in C and fortran - A_array = strength_mat%v - call allreducesum_petscint_mine(A_array, counter_undecided, counter_parallel) - counter_undecided = counter_parallel - - ! If we're doing a fixed number of steps, then we don't care - ! how many undecided nodes we have - have to take care here not to use - ! local_rows for counter_undecided, as we may have zero DOFs on some procs - ! but we have to enter the loop below for the collective scatters - else - counter_undecided = 1 - end if - - ! ~~~~~~~~~~~~ - ! Now go through the outer Luby loop - ! ~~~~~~~~~~~~ - - ! Let's keep track of how many times we go through the loops - loops_through = -1 - - do while (counter_undecided /= 0) - - ! If max_luby_steps is positive, then we only take that many times through this top loop - ! We typically find 2-3 iterations decides >99% of the nodes - ! and a fixed number of outer loops means we don't have to do any parallel reductions - ! We will do redundant nearest neighbour comms in the case we have already - ! finished deciding all the nodes, but who cares - ! Any undecided nodes just get turned into C points - ! We can do this as we know we won't ruin Aff by doing so, unlike in a normal multigrid - if (max_luby_steps > 0 .AND. max_luby_steps+1 == -loops_through) exit - - ! ~~~~~~~~~ - ! Start the async broadcast of assigned_local to assigned_nonlocal - ! ~~~~~~~~~ - if (comm_size /= 1) then - call boolscatter_mat_begin_c(A_array, assigned_local_ptr, assigned_nonlocal_ptr) - end if - - ! Reset in_set_this_loop, which keeps track of which nodes are added to the set this loop - do ifree = 1, local_rows - ! If they're already assigned they can't be added - if (assigned_local(ifree)) then - in_set_this_loop(ifree) = .FALSE. - ! We assume any unassigned are added to the set this loop and then rule them out below - else - in_set_this_loop(ifree) = .TRUE. - end if - end do - - ! ~~~~~~~~ - ! 
The Luby algorithm has measure_local(v) > measure_local(u) for all u in active neighbours - ! and then you have to loop from the nodes with biggest measure_local down - ! That is the definition of PMIS - ! PMISR swaps the CF definitions from a traditional PMIS - ! PMISR starts from the smallest measure_local and ensure - ! measure_local(v) < measure_local(u) for all u in active neighbours - ! measure_local is negative for PMIS and positive for PMISR - ! that way we dont have to change the .ge. in the comparison code below - ! ~~~~~~~~ - - ! ~~~~~~~~ - ! Go and do the local component - ! ~~~~~~~~ - node_loop_local: do ifree = 1, local_rows - - ! Check if this node is already in A - if (assigned_local(ifree)) cycle node_loop_local - - ! Loop over all the active strong neighbours on the local processors - do jfree = ad_ia(ifree)+1, ad_ia(ifree+1) - - ! Have to only check unassigned strong neighbours - if (assigned_local(ad_ja(jfree) + 1)) cycle - - ! Check the measure_local - if (measure_local(ifree) .ge. measure_local(ad_ja(jfree) + 1)) then - in_set_this_loop(ifree) = .FALSE. - cycle node_loop_local - end if - end do - end do node_loop_local - - ! ~~~~~~~~ - ! Finish the async broadcast, assigned_nonlocal is now correct - ! ~~~~~~~~ - if (comm_size /= 1) then - call boolscatter_mat_end_c(A_array, assigned_local_ptr, assigned_nonlocal_ptr) - end if - - ! ~~~~~~~~ - ! Now go through and do the non-local part of the matrix - ! ~~~~~~~~ - if (comm_size /= 1) then - - node_loop: do ifree = 1, local_rows - - ! Check if already ruled out by local loop or already assigned - if (assigned_local(ifree) .OR. .NOT. in_set_this_loop(ifree)) cycle node_loop - - ! Loop over all the active strong neighbours on the non-local processors - do jfree = ao_ia(ifree)+1, ao_ia(ifree+1) - - ! Have to only check unassigned strong neighbours - if (assigned_nonlocal(ao_ja(jfree) + 1)) cycle - - ! Check the measure_local - if (measure_local(ifree) .ge. 
measure_nonlocal(ao_ja(jfree) + 1)) then - in_set_this_loop(ifree) = .FALSE. - cycle node_loop - end if - end do - - end do node_loop - end if - - ! We now know all nodes which were added to the set this loop, so let's record them - do ifree = 1, local_rows - if (in_set_this_loop(ifree)) then - assigned_local(ifree) = .TRUE. - cf_markers_local(ifree) = F_POINT - end if - end do - - ! ~~~~~~~~~~~~~~ - ! All the work below here is now to ensure assigned_local is correct for the next iteration - ! Update the nonlocal values first then comm them - ! ~~~~~~~~~~~~~~ - if (comm_size /= 1) then - - ! We're going to do an LOR reduce so start all as false - assigned_nonlocal = .FALSE. - - do ifree = 1, local_rows - - ! Only need to update neighbours of nodes assigned this top loop - if (.NOT. in_set_this_loop(ifree)) cycle - - ! We know all neighbours of points assigned this loop are C points - ! We don't actually need to record that they're C points, just that they're assigned - do jfree = ao_ia(ifree)+1, ao_ia(ifree+1) - assigned_nonlocal(ao_ja(jfree) + 1) = .TRUE. - end do - end do - - ! ~~~~~~~~~~~ - ! We need to start the async reduce LOR of the assigned_nonlocal into assigned_local - ! After this comms finishes any local node in another processors halo - ! that has been assigned on another process will be correctly marked in assigned_local - ! ~~~~~~~~~~~ - call boolscatter_mat_reverse_begin_c(A_array, assigned_local_ptr, assigned_nonlocal_ptr) - - end if - - ! ~~~~~~~~~~~~~~ - ! Now go and update the local values - ! ~~~~~~~~~~~~~~ - - do ifree = 1, local_rows - - ! Only need to update neighbours of nodes assigned this top loop - if (.NOT. in_set_this_loop(ifree)) cycle - - ! Don't need a guard here to check if they're already assigned, as we - ! can guarantee they won't be - do jfree = ad_ia(ifree)+1, ad_ia(ifree+1) - assigned_local(ad_ja(jfree) + 1) = .TRUE. - end do - end do - - ! ~~~~~~~~~ - ! In parallel we have to finish our asyn comms - ! 
~~~~~~~~~ - if (comm_size /= 1) then - ! Finishes the reduce LOR, assigned_local will now be correct - call boolscatter_mat_reverse_end_c(A_array, assigned_local_ptr, assigned_nonlocal_ptr) - end if - - ! ~~~~~~~~~~~~ - ! We've now done another top level loop - ! ~~~~~~~~~~~~ - loops_through = loops_through - 1 - - ! ~~~~~~~~~~~~ - ! Check the total number of undecided in parallel before we loop again - ! ~~~~~~~~~~~~ - if (max_luby_steps < 0) then - ! Count how many are undecided - counter_undecided = local_rows - count(assigned_local) - ! Parallel reduction! - A_array = strength_mat%v - call allreducesum_petscint_mine(A_array, counter_undecided, counter_parallel) - counter_undecided = counter_parallel - end if - end do - - ! ~~~~~~~~~~~~ - ! We're finished our IS now - ! ~~~~~~~~~~~~ - - ! Restore the sequantial pointers once we're done - call MatRestoreRowIJ(Ad,shift,symmetric,inodecompressed,n_ad,ad_ia,ad_ja,done,ierr) - if (comm_size /= 1) then - call MatRestoreRowIJ(Ao,shift,symmetric,inodecompressed,n_ao,ao_ia,ao_ja,done,ierr) - end if - - ! If PMIS then we swap the CF markers from PMISR - if (pmis) then - cf_markers_local = cf_markers_local * (-1) - end if - - ! ~~~~~~~~~ - ! Cleanup - ! ~~~~~~~~~ - deallocate(measure_local, in_set_this_loop, assigned_local) - if (comm_size/=1) then - call VecDestroy(measure_vec, ierr) - ! Don't forget to restore on lvec from our matrix - call vecscatter_mat_restore_c(A_array, measure_nonlocal_ptr) - end if - deallocate(assigned_nonlocal) - - end subroutine pmisr_cpu - -! ------------------------------------------------------------------------------------------------------------------------------- - - subroutine ddc(input_mat, is_fine, fraction_swap, max_dd_ratio, cf_markers_local) - - ! Second pass diagonal dominance cleanup - ! Flips the F definitions to C based on least diagonally dominant local rows - ! If fraction_swap = 0 this does nothing - ! If fraction_swap < 0 it uses abs(fraction_swap) to be a threshold - ! 
for swapping C to F based on row-wise diagonal dominance (ie alpha_diag) - ! If fraction_swap > 0 it uses fraction_swap as the local fraction of worst C points to swap to F - ! though it won't hit that fraction exactly as we bin the diag dom ratios for speed, it will be close to the fraction - - ! ~~~~~~ - type(tMat), target, intent(in) :: input_mat - type(tIS), intent(in) :: is_fine - PetscReal, intent(in) :: fraction_swap - PetscReal, intent(inout) :: max_dd_ratio - integer, dimension(:), allocatable, target, intent(inout) :: cf_markers_local - -#if defined(PETSC_HAVE_KOKKOS) - integer(c_long_long) :: A_array - PetscErrorCode :: ierr - MatType :: mat_type - type(c_ptr) :: cf_markers_local_ptr - integer :: errorcode - !integer :: kfree - integer, dimension(:), allocatable :: cf_markers_local_two - PetscReal :: max_dd_ratio_cpu, max_dd_ratio_kokkos -#endif - ! ~~~~~~ - - ! If we don't need to swap anything, return - if (fraction_swap == 0d0) then - return - end if - -#if defined(PETSC_HAVE_KOKKOS) - - call MatGetType(input_mat, mat_type, ierr) - if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & - mat_type == MATAIJKOKKOS) then - - A_array = input_mat%v - cf_markers_local_ptr = c_loc(cf_markers_local) - - ! If debugging do a comparison between CPU and Kokkos results - if (kokkos_debug()) then - allocate(cf_markers_local_two(size(cf_markers_local))) - cf_markers_local_two = cf_markers_local - end if - - ! Modifies the existing device cf_markers created by the pmisr - max_dd_ratio_kokkos = max_dd_ratio - call ddc_kokkos(A_array, fraction_swap, max_dd_ratio_kokkos) - - ! If debugging do a comparison between CPU and Kokkos results - if (kokkos_debug()) then - - ! Kokkos DDC by default now doesn't copy back to the host, as any subsequent ddc calls - ! 
use the existing device data - call copy_cf_markers_d2h(cf_markers_local_ptr) - max_dd_ratio_cpu = max_dd_ratio - call ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio_cpu, cf_markers_local_two) - - if (any(cf_markers_local /= cf_markers_local_two)) then - - ! do kfree = 1, size(cf_markers_local) - ! if (cf_markers_local(kfree) /= cf_markers_local_two(kfree)) then - ! print *, kfree-1, "no match", cf_markers_local(kfree), cf_markers_local_two(kfree) - ! end if - ! end do - print *, "Kokkos and CPU versions of ddc do not match" - call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) - end if - deallocate(cf_markers_local_two) - end if - max_dd_ratio = max_dd_ratio_kokkos - - else - call ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio, cf_markers_local) - end if -#else - call ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio, cf_markers_local) -#endif - - end subroutine ddc - -! ------------------------------------------------------------------------------------------------------------------------------- - - subroutine ddc_cpu(input_mat, is_fine, fraction_swap, max_dd_ratio, cf_markers_local) - - ! Second pass diagonal dominance cleanup - ! Flips the F definitions to C based on least diagonally dominant local rows - ! If fraction_swap = 0 this does nothing - ! If fraction_swap < 0 it uses abs(fraction_swap) to be a threshold - ! for swapping C to F based on row-wise diagonal dominance (ie alpha_diag) - ! If fraction_swap > 0 it uses fraction_swap as the local fraction of worst C points to swap to F - ! though it won't hit that fraction exactly as we bin the diag dom ratios for speed, it will be close to the fraction - - ! ~~~~~~ - type(tMat), target, intent(in) :: input_mat - type(tIS), intent(in) :: is_fine - PetscReal, intent(in) :: fraction_swap - PetscReal, intent(inout) :: max_dd_ratio - integer, dimension(:), allocatable, intent(inout) :: cf_markers_local - - ! 
Local - PetscInt :: local_rows, local_cols, one=1 - PetscInt :: a_global_row_start, a_global_row_end_plus_one, ifree, ncols - PetscInt :: input_row_start, input_row_end_plus_one - PetscInt :: jfree, idx, search_size, diag_index, fine_size, frac_size - integer :: bin_sum, bin_boundary, bin, errorcode - PetscErrorCode :: ierr - PetscInt, dimension(:), pointer :: cols => null() - PetscReal, dimension(:), pointer :: vals => null() - PetscReal, dimension(:), allocatable :: diag_dom_ratio - PetscInt, dimension(:), pointer :: is_pointer - type(tMat) :: Aff - PetscReal :: diag_val, max_dd_ratio_local, max_dd_ratio_achieved - real(c_double) :: swap_dom_val - integer, dimension(1000) :: dom_bins - MPIU_Comm :: MPI_COMM_MATRIX - logical :: trigger_dd_ratio_compute - - ! ~~~~~~ - - ! The indices are the numbering in Aff matrix - call ISGetIndices(is_fine, is_pointer, ierr) - call ISGetLocalSize(is_fine, fine_size, ierr) - - trigger_dd_ratio_compute = max_dd_ratio > 0 - - ! Do a fixed alpha_diag - if (fraction_swap < 0) then - ! We have to look through all the local rows - search_size = fine_size - - ! Or pick alpha_diag based on the worst % of rows - else - ! Only need to go through the biggest % of indices - frac_size = int(dble(fine_size) * fraction_swap) - - ! If we are trying to hit a given max_dd_ratio, then we need to continue coarsening, even - ! if we only change one dof at a time, otherwise we could get stuck - if (trigger_dd_ratio_compute) then - search_size = max(one, frac_size) - ! If we're not trying to hit a given max_dd_ratio, then if fraction_swap is small - ! we allow it to just not swap anything if the number of local rows is small - ! This stops many lower levels in parallel where we are only changing one dof at a time - else - search_size = frac_size - end if - end if - - ! ~~~~~~~~~~~~~ - - call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) - - ! 
Pull out Aff for ease of use - call MatCreateSubMatrix(input_mat, & - is_fine, is_fine, MAT_INITIAL_MATRIX, & - Aff, ierr) - - ! Get the local sizes - call MatGetLocalSize(Aff, local_rows, local_cols, ierr) - call MatGetOwnershipRange(Aff, a_global_row_start, a_global_row_end_plus_one, ierr) - call MatGetOwnershipRange(input_mat, input_row_start, input_row_end_plus_one, ierr) - - ! ~~~~~~~~~~~~~ - ! Compute diagonal dominance ratio - ! ~~~~~~~~~~~~~ - allocate(diag_dom_ratio(local_rows)) - diag_dom_ratio = 0 - dom_bins = 0 - - ! Sum the rows and find the diagonal entry in each local row - do ifree = a_global_row_start, a_global_row_end_plus_one-1 - call MatGetRow(Aff, ifree, ncols, cols, vals, ierr) - - ! Index of the diagonal - diag_index = -1 - diag_val = 1.0d0 - - do jfree = 1, ncols - ! Store the diagonal - if (cols(jfree) == ifree) then - diag_val = abs(vals(jfree)) - diag_index = jfree - else - ! Row sum of off-diagonals - diag_dom_ratio(ifree - a_global_row_start + 1) = diag_dom_ratio(ifree - a_global_row_start + 1) + abs(vals(jfree)) - end if - end do - - ! If we don't have a diagonal entry in this row there is no point trying to - ! compute a diagonal dominance ratio - ! We set diag_dom_ratio to zero and that means this row will stay as an F point - if (diag_index == -1) then - diag_dom_ratio(ifree - a_global_row_start + 1) = 0.0 - call MatRestoreRow(Aff, ifree, ncols, cols, vals, ierr) - cycle - end if - - ! If we have non-diagonal entries - if (diag_dom_ratio(ifree - a_global_row_start + 1) /= 0d0) then - ! Compute the diagonal dominance ratio - diag_dom_ratio(ifree - a_global_row_start + 1) = diag_dom_ratio(ifree - a_global_row_start + 1) / diag_val - end if - - call MatRestoreRow(Aff, ifree, ncols, cols, vals, ierr) - end do - - ! 
Get the maximum diagonal dominance ratio - if (trigger_dd_ratio_compute) then - max_dd_ratio_local = maxval(diag_dom_ratio) - call MPI_Allreduce(max_dd_ratio_local, max_dd_ratio_achieved, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_MATRIX, errorcode) - ! print *, "computed diag dom ratio", max_dd_ratio_achieved - ! If we have hit the required diagonal dominance ratio, then return without swapping any F points - if (max_dd_ratio_achieved < max_dd_ratio) then - max_dd_ratio = max_dd_ratio_achieved - call ISRestoreIndices(is_fine, is_pointer, ierr) - call MatDestroy(Aff, ierr) - return - end if - end if - - ! ~~~~~~~~~~~~~ - - ! Can't put this above because of collective operations in parallel (namely the getsubmatrix) - ! If we have local points to swap - if (search_size > 0) then - - ! If we reach here then we want to swap some local F points to C points - - do ifree = a_global_row_start, a_global_row_end_plus_one-1 - - ! Bin the entries between 0 and 1 - ! The top bin has entries greater than 0.9 (including greater than 1) - bin = min(floor(diag_dom_ratio(ifree - a_global_row_start + 1) * size(dom_bins)) + 1, size(dom_bins)) - ! If the diagonal dominance ratio is really large the expression above will overflow - ! the int to negative, so we just stick that in the top bin - if (bin < 0) then - bin = size(dom_bins) - end if - dom_bins(bin) = dom_bins(bin) + 1 - - end do - - ! Do a fixed alpha_diag - if (fraction_swap< 0) then - swap_dom_val = -fraction_swap - - ! Otherwise swap everything bigger than a fixed fraction - else - - ! In order to reduce the size of the sort required, we have binned the entries into 1000 bins - ! Let's count backwards from the biggest entries to find which bin we know the nth_element is in - ! and then we only include those bins and higher into the sort - bin_sum = 0 - do bin_boundary = size(dom_bins), 1, -1 - bin_sum = bin_sum + dom_bins(bin_boundary) - if (bin_sum .ge. search_size) exit - end do - ! 
Now bin_boundary holds the bin whose lower boundary is guaranteed to be <= the n_th element - - ! Rather than do any type of sort, just swap everything above that bin boundary - ! This will give a fraction_swap that is very close to that passed in as long as the - ! size of the bins is small - swap_dom_val = dble(bin_boundary-1)/dble(size(dom_bins)) - - end if - - ! Let's go and swap F points to C points - do ifree = 1, local_rows - - ! If this row only has a single diagonal entry, or is below the threshold we swap, skip - if (diag_dom_ratio(ifree) == 0 .OR. diag_dom_ratio(ifree) < swap_dom_val) cycle - - ! This is the actual numbering in A, rather than Aff - ! Careful here to minus away the row_start of A, not Aff, as cf_markers_local is as big as A - idx = is_pointer(ifree) - input_row_start + 1 - - ! Swap by multiplying by -1 - cf_markers_local(idx) = cf_markers_local(idx) * (-1) - end do - end if - - deallocate(diag_dom_ratio) - call ISRestoreIndices(is_fine, is_pointer, ierr) - call MatDestroy(Aff, ierr) - - end subroutine ddc_cpu - -! 
------------------------------------------------------------------------------------------------------------------------------- - -end module pmisr_ddc - diff --git a/src/PMISR_DDCk.kokkos.cxx b/src/PMISR_DDCk.kokkos.cxx deleted file mode 100644 index 472ad9af..00000000 --- a/src/PMISR_DDCk.kokkos.cxx +++ /dev/null @@ -1,1021 +0,0 @@ -// Our petsc kokkos definitions - has to go first -#include "kokkos_helper.hpp" -#include -#include <../src/mat/impls/aij/seq/aij.h> -#include <../src/mat/impls/aij/mpi/mpiaij.h> - -// This is a device copy of the cf markers on a given level -// to save having to copy it to/from the host between pmisr and ddc calls -intKokkosView cf_markers_local_d; - -//------------------------------------------------------------------------------------------------------------------------ - -// Copy the global cf_markers_local_d back to the host -PETSC_INTERN void copy_cf_markers_d2h(int *cf_markers_local) -{ - // Host wrapper for cf_markers_local - intKokkosViewHost cf_markers_local_h(cf_markers_local, cf_markers_local_d.extent(0)); - - // Now copy device cf_markers_local_d back to host - Kokkos::deep_copy(cf_markers_local_h, cf_markers_local_d); - // Log copy with petsc - size_t bytes = cf_markers_local_d.extent(0) * sizeof(int); - PetscCallVoid(PetscLogGpuToCpu(bytes)); - - return; -} - -//------------------------------------------------------------------------------------------------------------------------ - -// Delete the global cf_markers_local_d -PETSC_INTERN void delete_device_cf_markers() -{ - // Delete the device view - this assigns an empty view - // and hence the old view has its ref counter decremented - cf_markers_local_d = intKokkosView(); - - return; -} - -//------------------------------------------------------------------------------------------------------------------------ - -// PMISR cf splitting but on the device -// This no longer copies back to the host pointer cf_markers_local at the end -// You have to explicitly call 
copy_cf_markers_d2h(cf_markers_local) to do this -PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscReal *measure_local, const int zero_measure_c_point_int) -{ - - MPI_Comm MPI_COMM_MATRIX; - PetscInt local_rows, local_cols, global_rows, global_cols; - PetscInt global_row_start, global_row_end_plus_one; - PetscInt rows_ao, cols_ao; - MatType mat_type; - - PetscCallVoid(MatGetType(*strength_mat, &mat_type)); - // Are we in parallel? - const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; - - Mat_MPIAIJ *mat_mpi = nullptr; - Mat mat_local = NULL, mat_nonlocal = NULL; - - if (mpi) - { - mat_mpi = (Mat_MPIAIJ *)(*strength_mat)->data; - PetscCallVoid(MatMPIAIJGetSeqAIJ(*strength_mat, &mat_local, &mat_nonlocal, NULL)); - PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); - } - else - { - mat_local = *strength_mat; - } - - // Get the comm - PetscCallVoid(PetscObjectGetComm((PetscObject)*strength_mat, &MPI_COMM_MATRIX)); - PetscCallVoid(MatGetLocalSize(*strength_mat, &local_rows, &local_cols)); - PetscCallVoid(MatGetSize(*strength_mat, &global_rows, &global_cols)); - // This returns the global index of the local portion of the matrix - PetscCallVoid(MatGetOwnershipRange(*strength_mat, &global_row_start, &global_row_end_plus_one)); - - // ~~~~~~~~~~~~ - // Get pointers to the i,j,vals on the device - // ~~~~~~~~~~~~ - const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; - PetscMemType mtype; - PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; - PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local, &device_local_i, &device_local_j, &device_local_vals, &mtype)); - if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, &device_nonlocal_vals, &mtype)); - - // Device memory for the global variable cf_markers_local_d - be careful these aren't petsc ints - cf_markers_local_d 
= intKokkosView("cf_markers_local_d", local_rows); - // Can't use the global directly within the parallel - // regions on the device so just take a shallow copy - intKokkosView cf_markers_d = cf_markers_local_d; - - // PetscSF comms cannot be started with a pointer derived from a zero-extent Kokkos view - - // doing so causes intermittent failures in parallel on GPUs. Use a size-1 dummy view - // so that every pointer passed to PetscSF is always backed by valid device memory. - intKokkosView sf_int_dummy_d("sf_int_dummy_d", 1); - PetscScalarKokkosView sf_scalar_dummy_d("sf_scalar_dummy_d", 1); - - intKokkosView cf_markers_nonlocal_d; - int *cf_markers_d_ptr = NULL, *cf_markers_nonlocal_d_ptr = NULL; - cf_markers_d_ptr = local_rows > 0 ? cf_markers_d.data() : sf_int_dummy_d.data(); - - intKokkosView cf_markers_send_d; - int *cf_markers_send_d_ptr = NULL; - - // Host and device memory for the measure - PetscScalarKokkosViewHost measure_local_h(measure_local, local_rows); - PetscScalarKokkosView measure_local_d("measure_local_d", local_rows); - PetscScalar *measure_local_d_ptr = NULL, *measure_nonlocal_d_ptr = NULL; - measure_local_d_ptr = local_rows > 0 ? measure_local_d.data() : sf_scalar_dummy_d.data(); - PetscScalarKokkosView measure_nonlocal_d; - - if (mpi) { - measure_nonlocal_d = PetscScalarKokkosView("measure_nonlocal_d", cols_ao); - measure_nonlocal_d_ptr = cols_ao > 0 ? measure_nonlocal_d.data() : sf_scalar_dummy_d.data(); - cf_markers_nonlocal_d = intKokkosView("cf_markers_nonlocal_d", cols_ao); - cf_markers_nonlocal_d_ptr = cols_ao > 0 ? cf_markers_nonlocal_d.data() : sf_int_dummy_d.data(); - cf_markers_send_d = intKokkosView("cf_markers_send_d", local_rows); - cf_markers_send_d_ptr = local_rows > 0 ? 
cf_markers_send_d.data() : sf_int_dummy_d.data(); - } - - // Device memory for the mark - boolKokkosView mark_d("mark_d", local_rows); - auto exec = PetscGetKokkosExecutionSpace(); - - // If you want to generate the randoms on the device - //Kokkos::Random_XorShift64_Pool<> random_pool(/*seed=*/12345); - // Copy the input measure from host to device - Kokkos::deep_copy(measure_local_d, measure_local_h); - // Log copy with petsc - size_t bytes = measure_local_h.extent(0) * sizeof(PetscReal); - PetscCallVoid(PetscLogCpuToGpu(bytes)); - - // Compute the measure - Kokkos::parallel_for( - Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { - - // Randoms on the device - // auto generator = random_pool.get_state(); - // measure_local_d(i) = generator.drand(0., 1.); - // random_pool.free_state(generator); - - const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; - measure_local_d(i) += ncols_local; - - if (mpi) - { - PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; - measure_local_d(i) += ncols_nonlocal; - } - // Flip the sign if pmis - if (pmis_int == 1) measure_local_d(i) *= -1; - }); - // Have to ensure the parallel for above finishes before comms - exec.fence(); - - // Start the scatter of the measure - the kokkos memtype is set as PETSC_MEMTYPE_HOST or - // one of the kokkos backends like PETSC_MEMTYPE_HIP - PetscMemType mem_type = PETSC_MEMTYPE_KOKKOS; - if (mpi) - { - // PetscSF owns measure_local_d_ptr as the active send buffer until End. - // Do not even read from that send buffer before End is called. - // If you alias it in overlapped GPU work, the failure shows up intermittently - // in parallel runs on GPUs. 
- PetscCallVoid(PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPIU_SCALAR, - mem_type, measure_local_d_ptr, - mem_type, measure_nonlocal_d_ptr, - MPI_REPLACE)); - } - - // Initialise the set - PetscInt counter_in_set_start = 0; - // Count how many in the set to begin with and set their CF markers - Kokkos::parallel_reduce ("Reduction", local_rows, KOKKOS_LAMBDA (const PetscInt i, PetscInt& update) { - if (Kokkos::abs(measure_local_d[i]) < 1) - { - if (zero_measure_c_point_int == 1) { - if (pmis_int == 1) { - // Set as F here but reversed below to become C - cf_markers_d(i) = -1; - } - else { - // Becomes C - cf_markers_d(i) = 1; - } - } - else { - if (pmis_int == 1) { - // Set as C here but reversed below to become F - // Otherwise dirichlet conditions persist down onto the coarsest grid - cf_markers_d(i) = 1; - } - else { - // Becomes F - cf_markers_d(i) = -1; - } - } - // Count - update++; - } - else - { - cf_markers_d(i) = 0; - } - }, counter_in_set_start); - - // Check the total number of undecided in parallel - PetscInt counter_undecided, counter_parallel; - if (max_luby_steps < 0) { - counter_undecided = local_rows - counter_in_set_start; - // Parallel reduction! - PetscCallMPIAbort(MPI_COMM_MATRIX, MPI_Allreduce(&counter_undecided, &counter_parallel, 1, MPIU_INT, MPI_SUM, MPI_COMM_MATRIX)); - counter_undecided = counter_parallel; - - // If we're doing a fixed number of steps, then we don't care - // how many undecided nodes we have - have to take care here not to use - // local_rows for counter_undecided, as we may have zero DOFs on some procs - // but we have to enter the loop below for the collective scatters - } - else { - counter_undecided = 1; - } - - // Finish the broadcast for the nonlocal measure - if (mpi) - { - // End releases the active send buffer for normal access again. - // The scattered values in measure_nonlocal_d are now safe to consume. 
- PetscCallVoid(PetscSFBcastEnd(mat_mpi->Mvctx, MPIU_SCALAR, measure_local_d_ptr, measure_nonlocal_d_ptr, MPI_REPLACE)); - } - - // ~~~~~~~~~~~~ - // Now go through the outer Luby loop - // ~~~~~~~~~~~~ - - // Let's keep track of how many times we go through the loops - int loops_through = -1; - do - { - // Match the fortran version and include a pre-test on the do-while - if (counter_undecided == 0) break; - - // If max_luby_steps is positive, then we only take that many times through this top loop - // We typically find 2-3 iterations decides >99% of the nodes - // and a fixed number of outer loops means we don't have to do any parallel reductions - // We will do redundant nearest neighbour comms in the case we have already - // finished deciding all the nodes, but who cares - // Any undecided nodes just get turned into C points - // We can do this as we know we won't ruin Aff by doing so, unlike in a normal multigrid - if (max_luby_steps > 0 && max_luby_steps+1 == -loops_through) break; - - // ~~~~~~~~~ - // Start the async scatter of the nonlocal cf_markers - // ~~~~~~~~~ - if (mpi) { - // Copy cf_markers_d into a temporary buffer - // If we gave the comms routine cf_markers_d we couldn't even read from - // it until comms ended, meaning we couldn't do the work overlapping below - Kokkos::deep_copy(cf_markers_send_d, cf_markers_d); - // Be careful these aren't petscints - // PetscSF owns cf_markers_send_d_ptr as the active send buffer until End. - // Do not even read from that send buffer before End is called. - // If you alias it in overlapped GPU work, the failure shows up intermittently - // in parallel runs on GPUs. 
- PetscCallVoid(PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT, - mem_type, cf_markers_send_d_ptr, - mem_type, cf_markers_nonlocal_d_ptr, - MPI_REPLACE)); - } - - - // mark_d keeps track of which of the candidate nodes can become in the set - // Only need this because we want to do async comms so we need a way to trigger - // a node not being in the set due to either strong local neighbours *or* strong offproc neighbours - - // ~~~~~~~~ - // Go and do the local component - // ~~~~~~~~ - Kokkos::parallel_for( - Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), - KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { - - // Row - const PetscInt i = t.league_rank(); - PetscInt strong_neighbours = 0; - - // Check this row isn't already marked - if (cf_markers_d(i) == 0) - { - const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; - - // Reduce over local columns to get the number of strong neighbours - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(t, ncols_local), - [&](const PetscInt j, PetscInt& strong_count) { - - // Have to only check active strong neighbours - if (measure_local_d(i) >= measure_local_d(device_local_j[device_local_i[i] + j]) && \ - cf_markers_d(device_local_j[device_local_i[i] + j]) == 0) - { - strong_count++; - } - - }, strong_neighbours - ); - - // Only want one thread in the team to write the result - Kokkos::single(Kokkos::PerTeam(t), [&]() { - // If we have any strong neighbours - if (strong_neighbours > 0) - { - mark_d(i) = false; - } - else - { - mark_d(i) = true; - } - }); - } - // Any that aren't zero cf marker are already assigned so set to to false - else - { - // Only want one thread in the team to write the result - Kokkos::single(Kokkos::PerTeam(t), [&]() { - mark_d(i) = false; - }); - } - }); - - // ~~~~~~~~ - // Now go through and do the non-local part of the matrix - // ~~~~~~~~ - if (mpi) { - - // Finish the async scatter - // Be careful these aren't petscints - // End releases the send snapshot for normal 
access again. - // The scattered cf_markers_nonlocal_d values are now safe to read. - PetscCallVoid(PetscSFBcastEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_send_d_ptr, cf_markers_nonlocal_d_ptr, MPI_REPLACE)); - - Kokkos::parallel_for( - Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), - KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { - - // Row - const PetscInt i = t.league_rank(); - PetscInt strong_neighbours = 0; - - // Check this row isn't already marked - if (mark_d(i)) - { - PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; - - // Reduce over nonlocal columns to get the number of strong neighbours - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(t, ncols_nonlocal), - [&](const PetscInt j, PetscInt& strong_count) { - - if (measure_local_d(i) >= measure_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]) && \ - cf_markers_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]) == 0) - { - strong_count++; - } - - }, strong_neighbours - ); - - // Only want one thread in the team to write the result - Kokkos::single(Kokkos::PerTeam(t), [&]() { - // If we don't have any strong neighbours - if (strong_neighbours == 0) cf_markers_d(i) = loops_through; - }); - } - }); - } - // This cf_markers_d(i) = loops_through happens above in the case of mpi, saves a kernel launch - else - { - // The nodes that have mark equal to true have no strong active neighbours in the IS - // hence they can be in the IS - Kokkos::parallel_for( - Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { - - if (mark_d(i)) cf_markers_d(i) = loops_through; - }); - } - - if (mpi) - { - // We're going to do an add reverse scatter, so set them to zero - Kokkos::deep_copy(cf_markers_nonlocal_d, 0); - - Kokkos::parallel_for( - Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), - KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { - - // Row - const PetscInt i = t.league_rank(); - - // Check if this node has been assigned during this top loop - if 
(cf_markers_d(i) == loops_through) - { - PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; - - // For over nonlocal columns - Kokkos::parallel_for( - Kokkos::TeamThreadRange(t, ncols_nonlocal), [&](const PetscInt j) { - - // Needs to be atomic as may being set by many threads - Kokkos::atomic_store(&cf_markers_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]), 1); - }); - } - }); - - // Ensure everything is done before we comm - exec.fence(); - - // We've updated the values in cf_markers_nonlocal - // Calling a reverse scatter add will then update the values of cf_markers_local - // Reduce with a sum, equivalent to VecScatterBegin with ADD_VALUES, SCATTER_REVERSE - // Be careful these aren't petscints - // PetscSF now owns cf_markers_nonlocal_d_ptr as the active send buffer. - // The local kernel below only touches cf_markers_d, and that is fine here - // because we only care about zero versus nonzero after ReduceEnd. - PetscCallVoid(PetscSFReduceWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT, - mem_type, cf_markers_nonlocal_d_ptr, - mem_type, cf_markers_d_ptr, - MPIU_SUM)); - } - - // Go and do local - Kokkos::parallel_for( - Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), - KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { - - // Row - const PetscInt i = t.league_rank(); - - // Check if this node has been assigned during this top loop - if (cf_markers_d(i) == loops_through) - { - const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; - - // For over nonlocal columns - Kokkos::parallel_for( - Kokkos::TeamThreadRange(t, ncols_local), [&](const PetscInt j) { - - // Needs to be atomic as may being set by many threads - // Tried a version where instead of a "push" approach I tried a pull approach - // that doesn't need an atomic, but it was slower - Kokkos::atomic_store(&cf_markers_d(device_local_j[device_local_i[i] + j]), 1); - }); - } - }); - - if (mpi) - { - // Finish the scatter - // Be careful these aren't petscints 
- // After End the accumulated cf_markers_d values are complete. - // This is the first point where later logic should consume the reduced - // result rather than the in-flight root buffer. - PetscCallVoid(PetscSFReduceEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_nonlocal_d_ptr, cf_markers_d_ptr, MPIU_SUM)); - } - - // We've done another top level loop - loops_through = loops_through - 1; - - // ~~~~~~~~~~~~ - // Check the total number of undecided in parallel before we loop again - // ~~~~~~~~~~~~ - if (max_luby_steps < 0) { - - counter_undecided = 0; - Kokkos::parallel_reduce ("ReductionCounter_undecided", local_rows, KOKKOS_LAMBDA (const PetscInt i, PetscInt& update) { - if (cf_markers_d(i) == 0) update++; - }, counter_undecided); - - // Parallel reduction! - PetscCallMPIAbort(MPI_COMM_MATRIX, MPI_Allreduce(&counter_undecided, &counter_parallel, 1, MPIU_INT, MPI_SUM, MPI_COMM_MATRIX)); - counter_undecided = counter_parallel; - } else { - // If we're doing a fixed number of steps, then we need an extra fence - // as we don't hit the parallel reduce above (which implicitly fences) - exec.fence(); - } - - } - while (counter_undecided != 0); - - // ~~~~~~~~~ - // Now assign our final cf markers - // ~~~~~~~~~ - - Kokkos::parallel_for( - Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { - - if (cf_markers_d(i) == 0) - { - cf_markers_d(i) = 1; - } - else if (cf_markers_d(i) < 0) - { - cf_markers_d(i) = -1; - } - else - { - cf_markers_d(i) = 1; - } - if (pmis_int) cf_markers_d(i) *= -1; - }); - // Ensure we're done before we exit - exec.fence(); - - return; -} - -//------------------------------------------------------------------------------------------------------------------------ - -// Creates the device local indices for F or C points based on the global cf_markers_local_d -PETSC_INTERN void create_cf_is_device_kokkos(Mat *input_mat, const int match_cf, PetscIntKokkosView &is_local_d) -{ - PetscInt local_rows, local_cols; - 
PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); - auto exec = PetscGetKokkosExecutionSpace(); - - // Can't use the global directly within the parallel - // regions on the device - intKokkosView cf_markers_d = cf_markers_local_d; - - // ~~~~~~~~~~~~ - // Get the F point local indices from cf_markers_local_d - // ~~~~~~~~~~~~ - PetscIntKokkosView point_offsets_d("point_offsets_d", local_rows+1); - - // Doing an exclusive scan to get the offsets for our local indices - // Doing one larger so we can get the total number of points - Kokkos::parallel_scan("point_offsets_d_scan", - Kokkos::RangePolicy<>(0, local_rows+1), - KOKKOS_LAMBDA(const PetscInt i, PetscInt& update, const bool final_pass) { - bool is_f_point = false; - if (i < local_rows) { // Predicate is based on original data up to local_rows-1 - is_f_point = (cf_markers_d(i) == match_cf); // is this point match_cf - } - if (final_pass) { - point_offsets_d(i) = update; - } - if (is_f_point) { - update++; - } - } - ); - - // The last entry in point_offsets_d is the total number of points that match match_cf - PetscInt local_rows_row = 0; - Kokkos::deep_copy(local_rows_row, Kokkos::subview(point_offsets_d, local_rows)); - - // This will be equivalent to is_fine - global_row_start, ie the local indices - is_local_d = PetscIntKokkosView("is_local_d", local_rows_row); - - // ~~~~~~~~~~~~ - // Write the local indices - // ~~~~~~~~~~~~ - Kokkos::parallel_for( - Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { - // Is this point match_cf - if (cf_markers_d(i) == match_cf) { - // point_offsets_d(i) gives the correct local index - is_local_d(point_offsets_d(i)) = i; - } - }); - // Ensure we're done before we exit - exec.fence(); -} - -//------------------------------------------------------------------------------------------------------------------------ - -// Creates the host IS is_fine and is_coarse based on the global cf_markers_local_d -PETSC_INTERN void create_cf_is_kokkos(Mat 
*input_mat, IS *is_fine, IS *is_coarse) -{ - PetscIntKokkosView is_fine_local_d, is_coarse_local_d; - MPI_Comm MPI_COMM_MATRIX; - PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); - - // Create the local f point indices - const int match_fine = -1; // F_POINT == -1 - create_cf_is_device_kokkos(input_mat, match_fine, is_fine_local_d); - - // Create the local C point indices - const int match_coarse = 1; // C_POINT == 1 - create_cf_is_device_kokkos(input_mat, match_coarse, is_coarse_local_d); - - // Now convert them back to global indices - PetscInt global_row_start, global_row_end_plus_one; - PetscCallVoid(MatGetOwnershipRange(*input_mat, &global_row_start, &global_row_end_plus_one)); - - // Convert F points - Kokkos::parallel_for( - Kokkos::RangePolicy<>(0, is_fine_local_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { - - is_fine_local_d(i) += global_row_start; - }); - // Convert C points - Kokkos::parallel_for( - Kokkos::RangePolicy<>(0, is_coarse_local_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { - - is_coarse_local_d(i) += global_row_start; - }); - - // Create some host space for the indices - PetscInt *is_fine_array = nullptr, *is_coarse_array = nullptr; - PetscInt n_fine = is_fine_local_d.extent(0); - PetscCallVoid(PetscMalloc1(n_fine, &is_fine_array)); - PetscIntKokkosViewHost is_fine_h = PetscIntKokkosViewHost(is_fine_array, is_fine_local_d.extent(0)); - PetscInt n_coarse = is_coarse_local_d.extent(0); - PetscCallVoid(PetscMalloc1(n_coarse, &is_coarse_array)); - PetscIntKokkosViewHost is_coarse_h = PetscIntKokkosViewHost(is_coarse_array, n_coarse); - - // Copy over the indices to the host - Kokkos::deep_copy(is_fine_h, is_fine_local_d); - Kokkos::deep_copy(is_coarse_h, is_coarse_local_d); - // Log copy with petsc - size_t bytes_fine = is_fine_local_d.extent(0) * sizeof(PetscInt); - size_t bytes_coarse = is_coarse_local_d.extent(0) * sizeof(PetscInt); - PetscCallVoid(PetscLogGpuToCpu(bytes_fine + bytes_coarse)); - - // Now we can create the 
IS objects - PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, is_fine_local_d.extent(0), is_fine_array, PETSC_OWN_POINTER, is_fine)); - PetscCallVoid(ISCreateGeneral(MPI_COMM_MATRIX, is_coarse_local_d.extent(0), is_coarse_array, PETSC_OWN_POINTER, is_coarse)); -} - -//------------------------------------------------------------------------------------------------------------------------ - -// Computes the diagonal dominance ratio of the input matrix over fine points in global variable cf_markers_local_d -// This code is very similar to MatCreateSubMatrix_kokkos -PETSC_INTERN void MatDiagDomRatio_kokkos(Mat *input_mat, PetscIntKokkosView &is_fine_local_d, PetscScalarKokkosView &diag_dom_ratio_d) -{ - PetscInt local_rows, local_cols; - - // Are we in parallel? - MatType mat_type; - MPI_Comm MPI_COMM_MATRIX; - PetscCallVoid(MatGetType(*input_mat, &mat_type)); - - const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; - PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); - PetscCallVoid(MatGetLocalSize(*input_mat, &local_rows, &local_cols)); - - Mat_MPIAIJ *mat_mpi = nullptr; - Mat mat_local = NULL, mat_nonlocal = NULL; - - PetscInt rows_ao, cols_ao; - if (mpi) - { - mat_mpi = (Mat_MPIAIJ *)(*input_mat)->data; - PetscCallVoid(MatMPIAIJGetSeqAIJ(*input_mat, &mat_local, &mat_nonlocal, NULL)); - PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); - } - else - { - mat_local = *input_mat; - } - - // Can't use the global directly within the parallel - // regions on the device - intKokkosView cf_markers_d = cf_markers_local_d; - intKokkosView sf_int_dummy_d("sf_int_dummy_d", 1); - intKokkosView cf_markers_nonlocal_d; - intKokkosView cf_markers_send_d; - auto exec = PetscGetKokkosExecutionSpace(); - - // ~~~~~~~~~~~~ - // Get the F point local indices from cf_markers_local_d - // ~~~~~~~~~~~~ - const int match_cf = -1; // F_POINT == -1 - create_cf_is_device_kokkos(input_mat, match_cf, is_fine_local_d); - PetscInt local_rows_row = 
is_fine_local_d.extent(0); - - // Create device memory for the diag_dom_ratio - diag_dom_ratio_d = PetscScalarKokkosView("diag_dom_ratio_d", local_rows_row); - - // ~~~~~~~~~~~~~~~ - // Can now go and compute the diagonal dominance sums - // ~~~~~~~~~~~~~~~ - // PetscSF comms cannot be started with a pointer derived from a zero-extent Kokkos view - - // doing so causes intermittent failures in parallel on GPUs. Use a size-1 dummy view - // so that every pointer passed to PetscSF is always backed by valid device memory. - int *cf_markers_nonlocal_d_ptr = NULL; - int *cf_markers_send_d_ptr = NULL; - PetscMemType mem_type = PETSC_MEMTYPE_KOKKOS; - PetscMemType mtype; - - // The off-diagonal component requires some comms which we can start now - if (mpi) - { - cf_markers_send_d = intKokkosView("cf_markers_send_d", local_rows); - // Copy cf_markers_d into a temporary buffer - // If we gave the comms routine cf_markers_d we couldn't even read from - // it until comms ended, meaning we couldn't do the work overlapping below - Kokkos::deep_copy(cf_markers_send_d, cf_markers_d); - cf_markers_send_d_ptr = local_rows > 0 ? cf_markers_send_d.data() : sf_int_dummy_d.data(); - exec.fence(); - cf_markers_nonlocal_d = intKokkosView("cf_markers_nonlocal_d", cols_ao); - cf_markers_nonlocal_d_ptr = cols_ao > 0 ? cf_markers_nonlocal_d.data() : sf_int_dummy_d.data(); - - // Start the scatter of the cf splitting - the kokkos memtype is set as PETSC_MEMTYPE_HOST or - // one of the kokkos backends like PETSC_MEMTYPE_HIP - // Be careful these aren't petscints - // PetscSF owns cf_markers_send_d_ptr as the active send buffer until End. - // Do not even read from that send buffer before End is called. - // If you alias it in overlapped GPU work, the failure shows up intermittently - // in parallel runs on GPUs. 
- PetscCallVoid(PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT, - mem_type, cf_markers_send_d_ptr, - mem_type, cf_markers_nonlocal_d_ptr, - MPI_REPLACE)); - } - - // ~~~~~~~~~~~~~~~ - // Do the local component so work/comms are overlapped - // ~~~~~~~~~~~~~~~ - - // ~~~~~~~~~~~~ - // Get pointers to the local i,j,vals on the device - // ~~~~~~~~~~~~ - const PetscInt *device_local_i = nullptr, *device_local_j = nullptr; - PetscScalar *device_local_vals = nullptr; - PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local, &device_local_i, &device_local_j, &device_local_vals, &mtype)); - - // Have to store the diagonal entry - PetscScalarKokkosView diag_entry_d = PetscScalarKokkosView("diag_entry_d", local_rows_row); - Kokkos::deep_copy(diag_entry_d, 0); - - // Scoping to reduce peak memory - { - // We now go and do a reduce to get the diagonal entry, while also - // summing up the local non-diagonals into diag_dom_ratio_d - Kokkos::parallel_for( - Kokkos::TeamPolicy<>(exec, local_rows_row, Kokkos::AUTO()), - KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { - - const PetscInt i_idx_is_row = t.league_rank(); - const PetscInt i = is_fine_local_d(i_idx_is_row); - const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; - - PetscScalar sum_val = 0.0; - - // Reduce over local columns - Kokkos::parallel_reduce( - Kokkos::TeamVectorRange(t, ncols_local), - [&](const PetscInt j, PetscScalar& thread_sum) { - - // Get this local column in the input_mat - const PetscInt target_col = device_local_j[device_local_i[i] + j]; - // Is this column fine? 
F_POINT == -1 - if (cf_markers_d(target_col) == -1) - { - // Is this column the diagonal - const bool is_diagonal = i == target_col; - - // Get the abs value of the entry - PetscScalar val = Kokkos::abs(device_local_vals[device_local_i[i] + j]); - - // We have found a diagonal in this row - if (is_diagonal) { - // Will only happen for one thread - diag_entry_d(i_idx_is_row) = val; - } - else - { - thread_sum += val; - } - } - }, - Kokkos::Sum(sum_val) - ); - - // Only want one thread in the team to write the result - Kokkos::single(Kokkos::PerTeam(t), [&]() { - diag_dom_ratio_d(i_idx_is_row) = sum_val; - }); - }); - } - - // ~~~~~~~~~~~~~~~ - // Finish the comms and add the non-local entries to diag_dom_ratio_d - // before we divide by the diagonal entry - // ~~~~~~~~~~~~~~~ - - // The off-diagonal component requires some comms - // Basically a copy of MatCreateSubMatrix_MPIAIJ_SameRowColDist - if (mpi) - { - // Finish the scatter of the cf splitting - // Be careful these aren't petscints - // End releases the send snapshot for normal access again. - // The scattered cf_markers_nonlocal_d values are now safe to read. 
- PetscCallVoid(PetscSFBcastEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_send_d_ptr, cf_markers_nonlocal_d_ptr, MPI_REPLACE)); - - // ~~~~~~~~~~~~ - // Get pointers to the nonlocal i,j,vals on the device - // ~~~~~~~~~~~~ - const PetscInt *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; - PetscScalar *device_nonlocal_vals = nullptr; - PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, &device_nonlocal_vals, &mtype)); - - // Sum up the nonlocal matching entries into diag_dom_ratio_d - if (cols_ao > 0) - { - Kokkos::parallel_for( - Kokkos::TeamPolicy<>(exec, local_rows_row, Kokkos::AUTO()), - KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { - - const PetscInt i_idx_is_row = t.league_rank(); - const PetscInt i = is_fine_local_d(i_idx_is_row); - const PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; - - PetscScalar sum_val = 0.0; - - // Reduce over local columns - Kokkos::parallel_reduce( - Kokkos::TeamVectorRange(t, ncols_nonlocal), - [&](const PetscInt j, PetscScalar& thread_sum) { - - // This is the non-local column we have to check is present - const PetscInt target_col = device_nonlocal_j[device_nonlocal_i[i] + j]; - // Is this column in the input IS? 
F_POINT == -1 - if (cf_markers_nonlocal_d(target_col) == -1) - { - // Get the abs value of the entry - thread_sum += Kokkos::abs(device_nonlocal_vals[device_nonlocal_i[i] + j]); - } - }, - Kokkos::Sum(sum_val) - ); - - // Only want one thread in the team to write the result - Kokkos::single(Kokkos::PerTeam(t), [&]() { - // Add into existing - diag_dom_ratio_d(i_idx_is_row) += sum_val; - }); - }); - } - } - - // ~~~~~~~~~~~~~ - // Compute the diag dominance ratio - // ~~~~~~~~~~~~~ - Kokkos::parallel_for( - Kokkos::RangePolicy<>(0, local_rows_row), KOKKOS_LAMBDA(PetscInt i) { - - // If diag_val is zero we didn't find a diagonal - if (diag_entry_d(i) != 0.0){ - // Compute the diagonal dominance ratio - diag_dom_ratio_d(i) = diag_dom_ratio_d(i) / diag_entry_d(i); - } - else{ - diag_dom_ratio_d(i) = 0.0; - } - }); - // Ensure we're done before we exit - exec.fence(); - - return; -} - -//------------------------------------------------------------------------------------------------------------------------ - -// ddc cleanup but on the device - uses the global variable cf_markers_local_d -// This no longer copies back to the host pointer cf_markers_local at the end -// You have to explicitly call copy_cf_markers_d2h(cf_markers_local) to do this -PETSC_INTERN void ddc_kokkos(Mat *input_mat, const PetscReal fraction_swap, PetscReal *max_dd_ratio) -{ - // Can't use the global directly within the parallel - // regions on the device - intKokkosView cf_markers_d = cf_markers_local_d; - PetscScalarKokkosView diag_dom_ratio_d; - PetscIntKokkosView is_fine_local_d; - - // Compute the diagonal dominance ratio over the fine points in cf_markers_local_d - // ie the diag domminance ratio of Aff - MatDiagDomRatio_kokkos(input_mat, is_fine_local_d, diag_dom_ratio_d); - PetscInt local_rows_aff = is_fine_local_d.extent(0); - MPI_Comm MPI_COMM_MATRIX; - PetscCallVoid(PetscObjectGetComm((PetscObject)*input_mat, &MPI_COMM_MATRIX)); - - bool trigger_dd_ratio_compute = *max_dd_ratio > 0; - 
auto exec = PetscGetKokkosExecutionSpace(); - - // Do a fixed alpha_diag - PetscInt search_size; - if (fraction_swap < 0) { - // We have to look through all the local rows - search_size = local_rows_aff; - } - // Or pick alpha_diag based on the worst % of rows - else { - // Only need to go through the biggest % of indices - PetscInt one = 1; - - // If we are trying to hit a given max_dd_ratio, then we need to continue coarsening, even - // if we only change one dof at a time - if (trigger_dd_ratio_compute) - { - search_size = std::max(one, static_cast(double(local_rows_aff) * fraction_swap)); - } - // If we're not trying to hit a given max_dd_ratio, then if fraction_swap is small - // we allow it to just not swap anything if the number of local rows is small - // This stops many lower levels in parallel where we are only changing one dof at a time - else - { - search_size = static_cast(double(local_rows_aff) * fraction_swap); - } - } - - PetscReal max_dd_ratio_achieved = 0.0; - // Compute the maximum diagonal dominance ratio - if (trigger_dd_ratio_compute) - { - PetscReal max_dd_ratio_local = 0.0; - // Do a reduction to get the local max - Kokkos::parallel_reduce("max_dd_ratio", local_rows_aff, - KOKKOS_LAMBDA(const PetscInt i, PetscReal& thread_max) { - PetscReal dd_ratio = diag_dom_ratio_d(i); - thread_max = (dd_ratio > thread_max) ? 
dd_ratio : thread_max; - }, - Kokkos::Max(max_dd_ratio_local) - ); - // Comms to get the global max - PetscCallMPIAbort(MPI_COMM_MATRIX, MPI_Allreduce(&max_dd_ratio_local, &max_dd_ratio_achieved, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_MATRIX)); - - //printf("computed diag dom ratio %f \n", max_dd_ratio_achieved); - // If we have hit the required diagonal dominance ratio, then return without swapping any F points - if (max_dd_ratio_achieved < *max_dd_ratio) - { - *max_dd_ratio = max_dd_ratio_achieved; - return; - } - } - - // Can't put this above because of collective operations in parallel (namely the MatDiagDomRatio_kokkos) - // If we have local points to swap - if (search_size > 0) - { - // If we reach here then we want to swap some local F points to C points - - // Create device memory for bins - auto dom_bins_d = PetscIntKokkosView("dom_bins_d", 1000); - Kokkos::deep_copy(dom_bins_d, 0); - - // Bin the diagonal dominance ratio - if (fraction_swap > 0) - { - Kokkos::parallel_for( - Kokkos::RangePolicy<>(0, local_rows_aff), KOKKOS_LAMBDA(PetscInt i) { - - // Let's bin the entry - int bin; - int test_bin = floor(diag_dom_ratio_d(i) * double(dom_bins_d.extent(0))) + 1; - if (test_bin < int(dom_bins_d.extent(0)) && test_bin >= 0) { - bin = test_bin; - } - else { - bin = dom_bins_d.extent(0); - } - // Has to be atomic as many threads from different rows - // may be writing to the same bin - Kokkos::atomic_add(&dom_bins_d(bin - 1), 1); - }); - } - - PetscReal swap_dom_val; - // Do a fixed alpha_diag - if (fraction_swap < 0){ - swap_dom_val = -fraction_swap; - } - // Otherwise swap everything bigger than a fixed fraction - else{ - - // Parallel scan to inclusive sum the number of entries we have in - // the bins - Kokkos::parallel_scan(dom_bins_d.extent(0), KOKKOS_LAMBDA (const PetscInt i, PetscInt& update, const bool final) { - // Inclusive scan - update += dom_bins_d(i); - if (final) { - dom_bins_d(i) = update; // only update array on final pass - } - }); - - // Now if we 
reduce how many are > the search_size, we know the bin boundary we want - int bin_boundary = 0; - Kokkos::parallel_reduce ("ReductionBin", dom_bins_d.extent(0), KOKKOS_LAMBDA (const int i, int& update) { - if (dom_bins_d(i) > dom_bins_d(dom_bins_d.extent(0)-1) - search_size) update++; - }, bin_boundary); - - bin_boundary = dom_bins_d.extent(0) - bin_boundary; - swap_dom_val = double(bin_boundary) / double(dom_bins_d.extent(0)); - - } - - // Go and swap F points to C points - Kokkos::parallel_for( - Kokkos::RangePolicy<>(0, local_rows_aff), KOKKOS_LAMBDA(PetscInt i) { - - if (diag_dom_ratio_d(i) != 0.0 && diag_dom_ratio_d(i) >= swap_dom_val) - { - // This is the actual numbering in A, rather than Aff - PetscInt idx = is_fine_local_d(i); - cf_markers_d(idx) *= -1; - } - }); - // Ensure we're done before we exit - exec.fence(); - } - - return; -} - -//------------------------------------------------------------------------------------------------------------------------ \ No newline at end of file diff --git a/src/PMISR_Module.F90 b/src/PMISR_Module.F90 new file mode 100644 index 00000000..e5916d28 --- /dev/null +++ b/src/PMISR_Module.F90 @@ -0,0 +1,1236 @@ +module pmisr_module + + use iso_c_binding + use petscmat + use petsc_helper, only: kokkos_debug + use c_petsc_interfaces, only: pmisr_kokkos, copy_cf_markers_d2h, & + vecscatter_mat_begin_c, vecscatter_mat_end_c, vecscatter_mat_restore_c, & + allreducesum_petscint_mine, boolscatter_mat_begin_c, boolscatter_mat_end_c, & + boolscatter_mat_reverse_begin_c, boolscatter_mat_reverse_end_c + use pflare_parameters, only: C_POINT, F_POINT + +#include "petsc/finclude/petscmat.h" +#include "finclude/PETSc_ISO_Types.h" + + implicit none + + public + + contains + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine pmisr(strength_mat, max_luby_steps, pmis, cf_markers_local, zero_measure_c_point) + + ! Wrapper + + ! 
~~~~~~ + + type(tMat), target, intent(in) :: strength_mat + integer, intent(in) :: max_luby_steps + logical, intent(in) :: pmis + integer, dimension(:), allocatable, target, intent(inout) :: cf_markers_local + logical, optional, intent(in) :: zero_measure_c_point + +#if defined(PETSC_HAVE_KOKKOS) + integer(c_long_long) :: A_array + PetscErrorCode :: ierr + MatType :: mat_type + integer :: pmis_int, zero_measure_c_point_int, seed_size, kfree, comm_rank, errorcode + integer, dimension(:), allocatable :: seed + PetscReal, dimension(:), allocatable, target :: measure_local + PetscInt :: local_rows, local_cols + MPIU_Comm :: MPI_COMM_MATRIX + type(c_ptr) :: measure_local_ptr, cf_markers_local_ptr + integer, dimension(:), allocatable :: cf_markers_local_two +#endif + ! ~~~~~~~~~~ + +#if defined(PETSC_HAVE_KOKKOS) + + call MatGetType(strength_mat, mat_type, ierr) + if (mat_type == MATMPIAIJKOKKOS .OR. mat_type == MATSEQAIJKOKKOS .OR. & + mat_type == MATAIJKOKKOS) then + + call PetscObjectGetComm(strength_mat, MPI_COMM_MATRIX, ierr) + call MPI_Comm_rank(MPI_COMM_MATRIX, comm_rank, errorcode) + + A_array = strength_mat%v + pmis_int = 0 + if (pmis) pmis_int = 1 + zero_measure_c_point_int = 0 + if (present(zero_measure_c_point)) then + if (zero_measure_c_point) zero_measure_c_point_int = 1 + end if + + ! Let's generate the random values on the host for now so they match + ! for comparisons with pmisr_cpu + call MatGetLocalSize(strength_mat, local_rows, local_cols, ierr) + allocate(measure_local(local_rows)) + call random_seed(size=seed_size) + allocate(seed(seed_size)) + do kfree = 1, seed_size + seed(kfree) = comm_rank + 1 + kfree + end do + call random_seed(put=seed) + ! Fill the measure with random numbers + call random_number(measure_local) + deallocate(seed) + + measure_local_ptr = c_loc(measure_local) + + allocate(cf_markers_local(local_rows)) + cf_markers_local_ptr = c_loc(cf_markers_local) + + ! 
Creates a cf_markers on the device + call pmisr_kokkos(A_array, max_luby_steps, pmis_int, measure_local_ptr, zero_measure_c_point_int) + + ! If debugging do a comparison between CPU and Kokkos results + if (kokkos_debug()) then + + ! Kokkos PMISR by default now doesn't copy back to the host, as any following ddc calls + ! use the device data + call copy_cf_markers_d2h(cf_markers_local_ptr) + call pmisr_cpu(strength_mat, max_luby_steps, pmis, cf_markers_local_two, zero_measure_c_point) + + if (any(cf_markers_local /= cf_markers_local_two)) then + + ! do kfree = 1, local_rows + ! if (cf_markers_local(kfree) /= cf_markers_local_two(kfree)) then + ! print *, kfree, "no match", cf_markers_local(kfree), cf_markers_local_two(kfree) + ! end if + ! end do + print *, "Kokkos and CPU versions of pmisr do not match" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + deallocate(cf_markers_local_two) + end if + + else + call pmisr_cpu(strength_mat, max_luby_steps, pmis, cf_markers_local, zero_measure_c_point) + end if +#else + call pmisr_cpu(strength_mat, max_luby_steps, pmis, cf_markers_local, zero_measure_c_point) +#endif + + ! ~~~~~~ + + end subroutine pmisr + +! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine pmisr_cpu(strength_mat, max_luby_steps, pmis, cf_markers_local, zero_measure_c_point) + + ! Let's do our own independent set with a Luby algorithm + ! If PMIS is true, this is a traditional PMIS algorithm + ! If PMIS is false, this is a PMISR + ! PMISR swaps the C-F definition compared to a PMIS and + ! also checks the measure from smallest, rather than the largest + ! PMISR should give an Aff with no off-diagonal strong connections + ! If you set positive max_luby_steps, it will avoid all parallel reductions + ! by taking a fixed number of times in the Luby top loop + + ! 
~~~~~~ + + type(tMat), target, intent(in) :: strength_mat + integer, intent(in) :: max_luby_steps + logical, intent(in) :: pmis + integer, dimension(:), allocatable, intent(inout) :: cf_markers_local + logical, optional, intent(in) :: zero_measure_c_point + + ! Local + PetscInt :: local_rows, local_cols, global_rows, global_cols + PetscInt :: global_row_start, global_row_end_plus_one, ifree, ncols + PetscInt :: rows_ao, cols_ao, n_ad, n_ao + integer :: comm_size, seed_size + integer :: comm_rank, errorcode + integer :: kfree + PetscErrorCode :: ierr + MPIU_Comm :: MPI_COMM_MATRIX + integer, dimension(:), allocatable :: seed + PetscReal, dimension(:), allocatable :: measure_local + type(tMat) :: Ad, Ao + PetscInt, dimension(:), pointer :: colmap + PetscInt, dimension(:), pointer :: ad_ia, ad_ja, ao_ia, ao_ja + PetscInt :: shift = 0 + PetscBool :: symmetric = PETSC_FALSE, inodecompressed = PETSC_FALSE, done + logical :: zero_measure_c = .FALSE. + + ! ~~~~~~ + + if (present(zero_measure_c_point)) zero_measure_c = zero_measure_c_point + + ! Get the comm size + call PetscObjectGetComm(strength_mat, MPI_COMM_MATRIX, ierr) + call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) + ! Get the comm rank + call MPI_Comm_rank(MPI_COMM_MATRIX, comm_rank, errorcode) + + ! Get the local sizes + call MatGetLocalSize(strength_mat, local_rows, local_cols, ierr) + call MatGetSize(strength_mat, global_rows, global_cols, ierr) + call MatGetOwnershipRange(strength_mat, global_row_start, global_row_end_plus_one, ierr) + + if (comm_size /= 1) then + call MatMPIAIJGetSeqAIJ(strength_mat, Ad, Ao, colmap, ierr) + ! We know the col size of Ao is the size of colmap, the number of non-zero offprocessor columns + call MatGetSize(Ao, rows_ao, cols_ao, ierr) + else + Ad = strength_mat + end if + + ! ~~~~~~~~ + ! Get pointers to the sequential diagonal and off diagonal aij structures + ! ~~~~~~~~ + call MatGetRowIJ(Ad,shift,symmetric,inodecompressed,n_ad,ad_ia,ad_ja,done,ierr) + if (.NOT. 
done) then + print *, "Pointers not set in call to MatGetRowIJ" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + if (comm_size /= 1) then + call MatGetRowIJ(Ao,shift,symmetric,inodecompressed,n_ao,ao_ia,ao_ja,done,ierr) + if (.NOT. done) then + print *, "Pointers not set in call to MatGetRowIJ" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + end if + ! ~~~~~~~~~~ + + ! Get the number of connections in S + allocate(measure_local(local_rows)) + allocate(cf_markers_local(local_rows)) + cf_markers_local = 0 + + ! ~~~~~~~~~~~~ + ! Seed the measure_local between 0 and 1 + ! ~~~~~~~~~~~~ + call random_seed(size=seed_size) + allocate(seed(seed_size)) + do kfree = 1, seed_size + seed(kfree) = comm_rank + 1 + kfree + end do + call random_seed(put=seed) + + ! To get the same results regardless of number of processors, you can + ! force the random number on each node to match across all processors + ! This is tricky to do, given the numbering of rows is different in parallel + ! I did code up a version that used the unique spatial node positions to seed the random + ! number generator and test that and it works the same regardless of num of procs + ! so I'm fairly confident things are correct + + ! Fill the measure with random numbers + call random_number(measure_local) + deallocate(seed) + + ! ~~~~~~~~~~ + + ! ~~~~~~~~~~~~ + ! Add the number of connections in S to the randomly seeded measure_local + ! The number of connections is just equal to a matvec with a vec of all ones and the strength_mat + ! We don't have to bother with a matvec though as we know the strenth_mat has entries of one + ! ~~~~~~~~~~~~ + do ifree = 1, local_rows + + ! Do local component + ncols = ad_ia(ifree+1) - ad_ia(ifree) + measure_local(ifree) = measure_local(ifree) + ncols + + ! Do non local component + if (comm_size /= 1) then + ncols = ao_ia(ifree+1) - ao_ia(ifree) + measure_local(ifree) = measure_local(ifree) + ncols + end if + end do + + ! 
Restore the sequantial pointers once we're done + call MatRestoreRowIJ(Ad,shift,symmetric,inodecompressed,n_ad,ad_ia,ad_ja,done,ierr) + if (comm_size /= 1) then + call MatRestoreRowIJ(Ao,shift,symmetric,inodecompressed,n_ao,ao_ia,ao_ja,done,ierr) + end if + + ! If PMIS then we want to search the measure based on the largest entry + ! PMISR searches the measure based on the smallest entry + ! We just let the measure be negative rather than change the .ge. comparison + ! in our Luby below + if (pmis) measure_local = measure_local * (-1) + + call pmisr_existing_measure_cf_markers(strength_mat, max_luby_steps, pmis, & + measure_local, cf_markers_local, zero_measure_c_point) + + deallocate(measure_local) + ! If PMIS then we swap the CF markers from PMISR + if (pmis) then + cf_markers_local = cf_markers_local * (-1) + end if + + end subroutine pmisr_cpu + + ! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine pmisr_existing_measure_cf_markers(strength_mat, max_luby_steps, pmis, & + measure_local, cf_markers_local, zero_measure_c_point) + + ! PMISR implementation that takes an existing measure_local and cf_markers_local + ! and then does the Luby algorithm to assign the rest of the CF markers + + ! ~~~~~~ + + type(tMat), target, intent(in) :: strength_mat + integer, intent(in) :: max_luby_steps + logical, intent(in) :: pmis + PetscReal, dimension(:), allocatable :: measure_local + integer, dimension(:), intent(inout) :: cf_markers_local + logical, optional, intent(in) :: zero_measure_c_point + + ! 
Local + PetscInt :: local_rows, local_cols, global_rows, global_cols + PetscInt :: global_row_start, global_row_end_plus_one, ifree + PetscInt :: jfree + PetscInt :: rows_ao, cols_ao, n_ad, n_ao + PetscInt :: counter_undecided, counter_in_set_start, counter_parallel + integer :: comm_size, loops_through + integer :: comm_rank, errorcode + PetscErrorCode :: ierr + MPIU_Comm :: MPI_COMM_MATRIX + PFLARE_PETSCBOOL_C_TYPE, dimension(:), allocatable :: in_set_this_loop + PFLARE_PETSCBOOL_C_TYPE, dimension(:), allocatable, target :: assigned_local, assigned_nonlocal + type(c_ptr) :: measure_nonlocal_ptr=c_null_ptr, assigned_local_ptr=c_null_ptr, assigned_nonlocal_ptr=c_null_ptr + real(c_double), pointer :: measure_nonlocal(:) => null() + type(tMat) :: Ad, Ao + type(tVec) :: measure_vec + PetscInt, dimension(:), pointer :: colmap + integer(c_long_long) :: A_array, vec_long + PetscInt, dimension(:), pointer :: ad_ia, ad_ja, ao_ia, ao_ja + PetscInt :: shift = 0 + PetscBool :: symmetric = PETSC_FALSE, inodecompressed = PETSC_FALSE, done + logical :: zero_measure_c = .FALSE. + PetscInt, parameter :: nz_ignore = -1, one=1, zero=0 + + ! ~~~~~~ + + if (present(zero_measure_c_point)) zero_measure_c = zero_measure_c_point + + ! Get the comm size + call PetscObjectGetComm(strength_mat, MPI_COMM_MATRIX, ierr) + call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) + ! Get the comm rank + call MPI_Comm_rank(MPI_COMM_MATRIX, comm_rank, errorcode) + + ! Get the local sizes + call MatGetLocalSize(strength_mat, local_rows, local_cols, ierr) + call MatGetSize(strength_mat, global_rows, global_cols, ierr) + call MatGetOwnershipRange(strength_mat, global_row_start, global_row_end_plus_one, ierr) + + if (comm_size /= 1) then + call MatMPIAIJGetSeqAIJ(strength_mat, Ad, Ao, colmap, ierr) + ! We know the col size of Ao is the size of colmap, the number of non-zero offprocessor columns + call MatGetSize(Ao, rows_ao, cols_ao, ierr) + else + Ad = strength_mat + end if + + ! ~~~~~~~~ + ! 
Get pointers to the sequential diagonal and off diagonal aij structures + ! ~~~~~~~~ + call MatGetRowIJ(Ad,shift,symmetric,inodecompressed,n_ad,ad_ia,ad_ja,done,ierr) + if (.NOT. done) then + print *, "Pointers not set in call to MatGetRowIJ" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + if (comm_size /= 1) then + call MatGetRowIJ(Ao,shift,symmetric,inodecompressed,n_ao,ao_ia,ao_ja,done,ierr) + if (.NOT. done) then + print *, "Pointers not set in call to MatGetRowIJ" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + end if + ! ~~~~~~~~~~ + + ! Get the number of connections in S + allocate(in_set_this_loop(local_rows)) + allocate(assigned_local(local_rows)) + + ! ~~~~~~~~~~~~ + ! Create parallel vec and scatter the measure + ! ~~~~~~~~~~~~ + if (comm_size/=1) then + + ! This is fine being mpi type specifically as strength_mat is always a mataij + call VecCreateMPIWithArray(MPI_COMM_MATRIX, one, & + local_rows, global_rows, measure_local, measure_vec, ierr) + + A_array = strength_mat%v + vec_long = measure_vec%v + ! We're just going to use the existing lvec to scatter the measure + ! Have to call restore after we're done with lvec (ie measure_nonlocal_ptr) + call vecscatter_mat_begin_c(A_array, vec_long, measure_nonlocal_ptr) + call vecscatter_mat_end_c(A_array, vec_long, measure_nonlocal_ptr) + ! This is the lvec so we have to make sure we don't do a matvec anywhere + ! before calling restore + call c_f_pointer(measure_nonlocal_ptr, measure_nonlocal, shape=[cols_ao]) + + allocate(assigned_nonlocal(cols_ao)) + assigned_local_ptr = c_loc(assigned_local) + assigned_nonlocal_ptr = c_loc(assigned_nonlocal) + else + ! Need to avoid uninitialised warning + allocate(assigned_nonlocal(0)) + end if + + ! ~~~~~~~~~~~~ + ! Initialise the set + ! ~~~~~~~~~~~~ + counter_in_set_start = 0 + assigned_local = .FALSE. + assigned_nonlocal = .FALSE. + + ! 
If already assigned by the input + do ifree = 1, local_rows + if (cf_markers_local(ifree) /= 0) assigned_local(ifree) = .TRUE. + end do + + do ifree = 1, local_rows + + ! Skip if already assigned + if (assigned_local(ifree)) then + counter_in_set_start = counter_in_set_start + 1 + cycle + end if + + ! If there are no strong neighbours (not measure_local == 0 as we have added a random number to it) + ! then we treat it special + ! Absolute value here given measure_local could be negative (pmis) or positive (pmisr) + if (abs(measure_local(ifree)) < 1) then + + ! Assign this node + assigned_local(ifree) = .TRUE. + + ! This is typically enabled in a second pass of PMIS just on C points + ! (ie aggressive coarsening based on MIS(MIS(1))), we want to keep + ! C-points with no other strong C connections as C points + if (zero_measure_c) then + if (pmis) then + ! Set as F here but reversed below to become C + cf_markers_local(ifree) = F_POINT + else + ! Becomes C + cf_markers_local(ifree) = C_POINT + end if + else + if (pmis) then + ! Set as C here but reversed below to become F + ! Otherwise dirichlet conditions persist down onto the coarsest grid + cf_markers_local(ifree) = C_POINT + else + ! Becomes F + cf_markers_local(ifree) = F_POINT + end if + end if + counter_in_set_start = counter_in_set_start + 1 + end if + end do + + ! Check the total number of undecided in parallel + if (max_luby_steps < 0) then + counter_undecided = local_rows - counter_in_set_start + ! Parallel reduction! + ! This is just an allreduce sum, but we can't use MPIU_INTEGER, as if we call the pmisr + ! cf splitting from C it is not defined - also have to pass the matrix so we can get the comm + ! given they're different in C and fortran + A_array = strength_mat%v + call allreducesum_petscint_mine(A_array, counter_undecided, counter_parallel) + counter_undecided = counter_parallel + + ! If we're doing a fixed number of steps, then we don't care + ! 
how many undecided nodes we have - have to take care here not to use + ! local_rows for counter_undecided, as we may have zero DOFs on some procs + ! but we have to enter the loop below for the collective scatters + else + counter_undecided = 1 + end if + + ! ~~~~~~~~~~~~ + ! Now go through the outer Luby loop + ! ~~~~~~~~~~~~ + + ! Let's keep track of how many times we go through the loops + loops_through = -1 + + do while (counter_undecided /= 0) + + ! If max_luby_steps is positive, then we only take that many times through this top loop + ! We typically find 2-3 iterations decides >99% of the nodes + ! and a fixed number of outer loops means we don't have to do any parallel reductions + ! We will do redundant nearest neighbour comms in the case we have already + ! finished deciding all the nodes, but who cares + ! Any undecided nodes just get turned into C points + ! We can do this as we know we won't ruin Aff by doing so, unlike in a normal multigrid + if (max_luby_steps > 0 .AND. max_luby_steps+1 == -loops_through) exit + + ! ~~~~~~~~~ + ! Start the async broadcast of assigned_local to assigned_nonlocal + ! ~~~~~~~~~ + if (comm_size /= 1) then + call boolscatter_mat_begin_c(A_array, assigned_local_ptr, assigned_nonlocal_ptr) + end if + + ! Reset in_set_this_loop, which keeps track of which nodes are added to the set this loop + do ifree = 1, local_rows + ! If they're already assigned they can't be added + if (assigned_local(ifree)) then + in_set_this_loop(ifree) = .FALSE. + ! We assume any unassigned are added to the set this loop and then rule them out below + else + in_set_this_loop(ifree) = .TRUE. + end if + end do + + ! ~~~~~~~~ + ! The Luby algorithm has measure_local(v) > measure_local(u) for all u in active neighbours + ! and then you have to loop from the nodes with biggest measure_local down + ! That is the definition of PMIS + ! PMISR swaps the CF definitions from a traditional PMIS + ! PMISR starts from the smallest measure_local and ensure + ! 
measure_local(v) < measure_local(u) for all u in active neighbours + ! measure_local is negative for PMIS and positive for PMISR + ! that way we dont have to change the .ge. in the comparison code below + ! ~~~~~~~~ + + ! ~~~~~~~~ + ! Go and do the local component + ! ~~~~~~~~ + node_loop_local: do ifree = 1, local_rows + + ! Check if this node is already in A + if (assigned_local(ifree)) cycle node_loop_local + ! Loop over all the active strong neighbours on the local processors + do jfree = ad_ia(ifree)+1, ad_ia(ifree+1) + + ! Have to only check unassigned strong neighbours + if (assigned_local(ad_ja(jfree) + 1)) cycle + + ! Check the measure_local + if (measure_local(ifree) .ge. measure_local(ad_ja(jfree) + 1)) then + in_set_this_loop(ifree) = .FALSE. + cycle node_loop_local + end if + end do + end do node_loop_local + + ! ~~~~~~~~ + ! Finish the async broadcast, assigned_nonlocal is now correct + ! ~~~~~~~~ + if (comm_size /= 1) then + call boolscatter_mat_end_c(A_array, assigned_local_ptr, assigned_nonlocal_ptr) + end if + + ! ~~~~~~~~ + ! Now go through and do the non-local part of the matrix + ! ~~~~~~~~ + if (comm_size /= 1) then + + node_loop: do ifree = 1, local_rows + + ! Check if already ruled out by local loop or already assigned + if (assigned_local(ifree) .OR. .NOT. in_set_this_loop(ifree)) cycle node_loop + + ! Loop over all the active strong neighbours on the non-local processors + do jfree = ao_ia(ifree)+1, ao_ia(ifree+1) + + ! Have to only check unassigned strong neighbours + if (assigned_nonlocal(ao_ja(jfree) + 1)) cycle + + ! Check the measure_local + if (measure_local(ifree) .ge. measure_nonlocal(ao_ja(jfree) + 1)) then + in_set_this_loop(ifree) = .FALSE. + cycle node_loop + end if + end do + + end do node_loop + end if + + ! We now know all nodes which were added to the set this loop, so let's record them + do ifree = 1, local_rows + if (in_set_this_loop(ifree)) then + assigned_local(ifree) = .TRUE. 
+ cf_markers_local(ifree) = F_POINT + end if + end do + + ! ~~~~~~~~~~~~~~ + ! All the work below here is now to ensure assigned_local is correct for the next iteration + ! Update the nonlocal values first then comm them + ! ~~~~~~~~~~~~~~ + if (comm_size /= 1) then + + ! We're going to do an LOR reduce so start all as false + assigned_nonlocal = .FALSE. + + do ifree = 1, local_rows + + ! Only need to update neighbours of nodes assigned this top loop + if (.NOT. in_set_this_loop(ifree)) cycle + + ! We know all neighbours of points assigned this loop are C points + ! We don't actually need to record that they're C points, just that they're assigned + do jfree = ao_ia(ifree)+1, ao_ia(ifree+1) + assigned_nonlocal(ao_ja(jfree) + 1) = .TRUE. + end do + end do + + ! ~~~~~~~~~~~ + ! We need to start the async reduce LOR of the assigned_nonlocal into assigned_local + ! After this comms finishes any local node in another processors halo + ! that has been assigned on another process will be correctly marked in assigned_local + ! ~~~~~~~~~~~ + call boolscatter_mat_reverse_begin_c(A_array, assigned_local_ptr, assigned_nonlocal_ptr) + + end if + + ! ~~~~~~~~~~~~~~ + ! Now go and update the local values + ! ~~~~~~~~~~~~~~ + + do ifree = 1, local_rows + + ! Only need to update neighbours of nodes assigned this top loop + if (.NOT. in_set_this_loop(ifree)) cycle + + ! Don't need a guard here to check if they're already assigned, as we + ! can guarantee they won't be + do jfree = ad_ia(ifree)+1, ad_ia(ifree+1) + assigned_local(ad_ja(jfree) + 1) = .TRUE. + end do + end do + + ! ~~~~~~~~~ + ! In parallel we have to finish our asyn comms + ! ~~~~~~~~~ + if (comm_size /= 1) then + ! Finishes the reduce LOR, assigned_local will now be correct + call boolscatter_mat_reverse_end_c(A_array, assigned_local_ptr, assigned_nonlocal_ptr) + end if + + ! ~~~~~~~~~~~~ + ! We've now done another top level loop + ! ~~~~~~~~~~~~ + loops_through = loops_through - 1 + + ! ~~~~~~~~~~~~ + ! 
Check the total number of undecided in parallel before we loop again + ! ~~~~~~~~~~~~ + if (max_luby_steps < 0) then + ! Count how many are undecided + counter_undecided = local_rows - count(assigned_local) + ! Parallel reduction! + A_array = strength_mat%v + call allreducesum_petscint_mine(A_array, counter_undecided, counter_parallel) + counter_undecided = counter_parallel + end if + end do + + ! Any unassigned become C points + do ifree = 1, local_rows + if (cf_markers_local(ifree) == 0) cf_markers_local(ifree) = C_POINT + end do + + ! ~~~~~~~~~~~~ + ! We're finished our IS now + ! ~~~~~~~~~~~~ + + ! Restore the sequantial pointers once we're done + call MatRestoreRowIJ(Ad,shift,symmetric,inodecompressed,n_ad,ad_ia,ad_ja,done,ierr) + if (comm_size /= 1) then + call MatRestoreRowIJ(Ao,shift,symmetric,inodecompressed,n_ao,ao_ia,ao_ja,done,ierr) + end if + + ! ~~~~~~~~~ + ! Cleanup + ! ~~~~~~~~~ + deallocate(in_set_this_loop, assigned_local) + if (comm_size/=1) then + call VecDestroy(measure_vec, ierr) + ! Don't forget to restore on lvec from our matrix + call vecscatter_mat_restore_c(A_array, measure_nonlocal_ptr) + end if + deallocate(assigned_nonlocal) + + end subroutine pmisr_existing_measure_cf_markers + + ! ------------------------------------------------------------------------------------------------------------------------------- + + subroutine pmisr_existing_measure_implicit_transpose(strength_mat, max_luby_steps, pmis, & + measure_local, cf_markers_local, zero_measure_c_point) + + ! ~~~~~~~~~~~~~~~~~~~~~ + ! PMISR implementation that takes an existing measure_local and cf_markers_local + ! and then does the Luby algorithm to assign the rest of the CF markers + ! + ! Unlike pmisr_existing_measure_cf_markers, this routine takes the strength matrix S + ! (not S+S^T) and handles the transpose implicitly. This avoids the expensive explicit + ! formation of S+S^T, particularly in parallel. + ! This only works because you the measure is generated outside + ! 
of this routine and so can be based on S+S^T, but the Luby loop only needs to know + ! the strong dependencies (from S) and strong influences (from S^T) of each node, + ! not the full S+S^T + ! Unlike pmisr_existing_measure_cf_markers you don't have to have removed the diagonal + ! of S + ! + ! PMISR needs to work with S+S^T to keep out large entries from Aff + ! So instead we do several comms steps in our Luby loop to get/send the data we need + ! We do compute local copies of the transpose of S (which is cheap and local) + ! but we never have the full parallel S+S^T + ! On this rank we have the number of: + ! local strong dependencies (from the local S) + ! local strong influences (from the local S^T) + ! non-local strong dependencies (from the non-local part of S) + ! But we don't have the number of non-local strong influences (from the non-local part of S^T) + ! Now we have to be careful as the local part of S and S^T may have entries in the same + ! row/column position, so we have to be sure not to count them twice (the same can't happen + ! for the non-local components) + ! ~~~~~~~~~~~~~~~~~~~~~ + + ! ~~~~~~ + + type(tMat), target, intent(in) :: strength_mat + integer, intent(in) :: max_luby_steps + logical, intent(in) :: pmis + PetscReal, dimension(:), allocatable :: measure_local + integer, dimension(:), intent(inout) :: cf_markers_local + logical, optional, intent(in) :: zero_measure_c_point + + ! 
Local + PetscInt :: local_rows, local_cols, global_rows, global_cols + PetscInt :: global_row_start, global_row_end_plus_one, ifree + PetscInt :: jfree, kfree + PetscInt :: rows_ao, cols_ao, n_ad, n_ao, n_spst, n_aot + PetscInt :: counter_undecided, counter_in_set_start, counter_parallel + integer :: comm_size, loops_through + integer :: comm_rank, errorcode + PetscErrorCode :: ierr + MPIU_Comm :: MPI_COMM_MATRIX + PFLARE_PETSCBOOL_C_TYPE, dimension(:), allocatable, target :: in_set_this_loop + PFLARE_PETSCBOOL_C_TYPE, dimension(:), allocatable, target :: assigned_local, assigned_nonlocal + PFLARE_PETSCBOOL_C_TYPE, dimension(:), allocatable, target :: veto_local, veto_nonlocal + type(c_ptr) :: measure_nonlocal_ptr=c_null_ptr, assigned_local_ptr=c_null_ptr, assigned_nonlocal_ptr=c_null_ptr + type(c_ptr) :: veto_local_ptr=c_null_ptr, veto_nonlocal_ptr=c_null_ptr, in_set_ptr=c_null_ptr + real(c_double), pointer :: measure_nonlocal(:) => null() + type(tMat) :: Ad, Ao, Ad_spst, Ao_transpose + type(tVec) :: measure_vec + PetscInt, dimension(:), pointer :: colmap + integer(c_long_long) :: A_array, vec_long + PetscInt, dimension(:), pointer :: ad_ia, ad_ja, ao_ia, ao_ja + PetscInt, dimension(:), pointer :: spst_ia, spst_ja, aot_ia, aot_ja + PetscInt :: shift = 0 + PetscBool :: symmetric = PETSC_FALSE, inodecompressed = PETSC_FALSE, done + logical :: zero_measure_c = .FALSE. + logical :: destroy_spst, destroy_aot + PetscInt, parameter :: nz_ignore = -1, one=1, zero=0 + PetscReal :: petsc_one = 1d0 + + ! ~~~~~~ + + if (present(zero_measure_c_point)) zero_measure_c = zero_measure_c_point + + ! Get the comm size + call PetscObjectGetComm(strength_mat, MPI_COMM_MATRIX, ierr) + call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) + ! Get the comm rank + call MPI_Comm_rank(MPI_COMM_MATRIX, comm_rank, errorcode) + + ! 
Get the local sizes + call MatGetLocalSize(strength_mat, local_rows, local_cols, ierr) + call MatGetSize(strength_mat, global_rows, global_cols, ierr) + call MatGetOwnershipRange(strength_mat, global_row_start, global_row_end_plus_one, ierr) + + if (comm_size /= 1) then + call MatMPIAIJGetSeqAIJ(strength_mat, Ad, Ao, colmap, ierr) + ! We know the col size of Ao is the size of colmap, the number of non-zero offprocessor columns + call MatGetSize(Ao, rows_ao, cols_ao, ierr) + else + Ad = strength_mat + end if + + ! ~~~~~~~~ + ! Get pointers to the sequential diagonal and off diagonal aij structures + ! ~~~~~~~~ + call MatGetRowIJ(Ad,shift,symmetric,inodecompressed,n_ad,ad_ia,ad_ja,done,ierr) + if (.NOT. done) then + print *, "Pointers not set in call to MatGetRowIJ" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + if (comm_size /= 1) then + call MatGetRowIJ(Ao,shift,symmetric,inodecompressed,n_ao,ao_ia,ao_ja,done,ierr) + if (.NOT. done) then + print *, "Pointers not set in call to MatGetRowIJ" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + end if + ! ~~~~~~~~~~ + + ! ~~~~~~~~ + ! Compute local Ad + Ad^T + ! We explicitly compute the local part of S+S^T so we don't have to + ! match the row/column indices in the Luby loop + ! This is cheap as it is purely local (no communication) + ! ~~~~~~~~ + destroy_spst = .FALSE. + if (ad_ia(n_ad+1) > 0) then + call MatTranspose(Ad, MAT_INITIAL_MATRIX, Ad_spst, ierr) + call MatAXPY(Ad_spst, petsc_one, Ad, DIFFERENT_NONZERO_PATTERN, ierr) + destroy_spst = .TRUE. + else + Ad_spst = Ad + end if + + ! Get CSR pointers for Ad+Ad^T + call MatGetRowIJ(Ad_spst,shift,symmetric,inodecompressed,n_spst,spst_ia,spst_ja,done,ierr) + if (.NOT. done) then + print *, "Pointers not set in call to MatGetRowIJ for Ad_spst" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + + ! ~~~~~~~~ + ! Compute local Ao^T (MPI only) + ! Ao has dimensions [local_rows x cols_ao] + ! 
Ao^T has dimensions [cols_ao x local_rows] + ! Row k of Ao^T tells us which local rows have connections to nonlocal column k + ! This lets us handle the non-local strong influences without forming the + ! full parallel S+S^T + ! ~~~~~~~~ + destroy_aot = .FALSE. + if (comm_size /= 1) then + if (ao_ia(n_ao+1) > 0) then + call MatTranspose(Ao, MAT_INITIAL_MATRIX, Ao_transpose, ierr) + destroy_aot = .TRUE. + call MatGetRowIJ(Ao_transpose,shift,symmetric,inodecompressed,n_aot,aot_ia,aot_ja,done,ierr) + if (.NOT. done) then + print *, "Pointers not set in call to MatGetRowIJ for Ao_transpose" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if + end if + end if + + ! ~~~~~~~~~~ + + allocate(in_set_this_loop(local_rows)) + allocate(assigned_local(local_rows)) + allocate(veto_local(local_rows)) + + ! ~~~~~~~~~~~~ + ! Create parallel vec and scatter the measure + ! ~~~~~~~~~~~~ + if (comm_size/=1) then + + ! This is fine being mpi type specifically as strength_mat is always a mataij + call VecCreateMPIWithArray(MPI_COMM_MATRIX, one, & + local_rows, global_rows, measure_local, measure_vec, ierr) + + A_array = strength_mat%v + vec_long = measure_vec%v + ! We're just going to use the existing lvec to scatter the measure + ! Have to call restore after we're done with lvec (ie measure_nonlocal_ptr) + call vecscatter_mat_begin_c(A_array, vec_long, measure_nonlocal_ptr) + call vecscatter_mat_end_c(A_array, vec_long, measure_nonlocal_ptr) + ! This is the lvec so we have to make sure we don't do a matvec anywhere + ! before calling restore + call c_f_pointer(measure_nonlocal_ptr, measure_nonlocal, shape=[cols_ao]) + + allocate(assigned_nonlocal(cols_ao)) + allocate(veto_nonlocal(cols_ao)) + assigned_local_ptr = c_loc(assigned_local) + assigned_nonlocal_ptr = c_loc(assigned_nonlocal) + veto_local_ptr = c_loc(veto_local) + veto_nonlocal_ptr = c_loc(veto_nonlocal) + in_set_ptr = c_loc(in_set_this_loop) + else + ! 
Need to avoid uninitialised warning + allocate(assigned_nonlocal(0)) + allocate(veto_nonlocal(0)) + end if + + ! ~~~~~~~~~~~~ + ! Initialise the set + ! ~~~~~~~~~~~~ + counter_in_set_start = 0 + assigned_local = .FALSE. + assigned_nonlocal = .FALSE. + + ! If already assigned by the input + do ifree = 1, local_rows + if (cf_markers_local(ifree) /= 0) assigned_local(ifree) = .TRUE. + end do + + do ifree = 1, local_rows + + ! Skip if already assigned + if (assigned_local(ifree)) then + counter_in_set_start = counter_in_set_start + 1 + cycle + end if + + ! If there are no strong neighbours (not measure_local == 0 as we have added a random number to it) + ! then we treat it special + ! Absolute value here given measure_local could be negative (pmis) or positive (pmisr) + if (abs(measure_local(ifree)) < 1) then + + ! Assign this node + assigned_local(ifree) = .TRUE. + + ! This is typically enabled in a second pass of PMIS just on C points + ! (ie aggressive coarsening based on MIS(MIS(1))), we want to keep + ! C-points with no other strong C connections as C points + if (zero_measure_c) then + if (pmis) then + ! Set as F here but reversed below to become C + cf_markers_local(ifree) = F_POINT + else + ! Becomes C + cf_markers_local(ifree) = C_POINT + end if + else + if (pmis) then + ! Set as C here but reversed below to become F + ! Otherwise dirichlet conditions persist down onto the coarsest grid + cf_markers_local(ifree) = C_POINT + else + ! Becomes F + cf_markers_local(ifree) = F_POINT + end if + end if + counter_in_set_start = counter_in_set_start + 1 + end if + end do + + ! Check the total number of undecided in parallel + if (max_luby_steps < 0) then + counter_undecided = local_rows - counter_in_set_start + ! Parallel reduction! + ! This is just an allreduce sum, but we can't use MPIU_INTEGER, as if we call the pmisr + ! cf splitting from C it is not defined - also have to pass the matrix so we can get the comm + ! 
given they're different in C and fortran + A_array = strength_mat%v + call allreducesum_petscint_mine(A_array, counter_undecided, counter_parallel) + counter_undecided = counter_parallel + + ! If we're doing a fixed number of steps, then we don't care + ! how many undecided nodes we have - have to take care here not to use + ! local_rows for counter_undecided, as we may have zero DOFs on some procs + ! but we have to enter the loop below for the collective scatters + else + counter_undecided = 1 + end if + + ! ~~~~~~~~~~~~ + ! Now go through the outer Luby loop + ! The key difference from pmisr_existing_measure_cf_markers is that we use a + ! veto pattern to handle the implicit transpose. Instead of iterating over + ! neighbours in S+S^T directly, we: + ! 1. Check local dependencies and influences via Ad+Ad^T (computed locally) + ! 2. Check non-local influences via Ao^T + reverse scatter + ! 3. Check non-local dependencies via Ao (same as before) + ! This avoids forming the full parallel S+S^T + ! ~~~~~~~~~~~~ + + ! Let's keep track of how many times we go through the loops + loops_through = -1 + + do while (counter_undecided /= 0) + + ! If max_luby_steps is positive, then we only take that many times through this top loop + ! We typically find 2-3 iterations decides >99% of the nodes + ! and a fixed number of outer loops means we don't have to do any parallel reductions + ! We will do redundant nearest neighbour comms in the case we have already + ! finished deciding all the nodes, but who cares + ! Any undecided nodes just get turned into C points + ! We can do this as we know we won't ruin Aff by doing so, unlike in a normal multigrid + if (max_luby_steps > 0 .AND. max_luby_steps+1 == -loops_through) exit + + ! ~~~~~~~~~ + ! Start the async broadcast of assigned_local to assigned_nonlocal + ! We need assigned_nonlocal for both the non-local dependency check + ! and the non-local influence veto + ! 
~~~~~~~~~ + if (comm_size /= 1) then + call boolscatter_mat_begin_c(A_array, assigned_local_ptr, assigned_nonlocal_ptr) + end if + + ! ~~~~~~~~ + ! Now we use veto to keep track of which candidates can be in the set + ! Locally we know which ones cannot be in the set due to local strong + ! dependencies (from Ad) and strong influences (from Ad^T), combined in Ad+Ad^T + ! but not the non-local influences as they are stored on many other ranks (ie in S^T) + ! ~~~~~~~~ + + ! ~~~~~~~~ + ! Local veto: check neighbours in Ad+Ad^T (covers both local dependencies and influences) + ! ~~~~~~~~ + node_loop_local: do ifree = 1, local_rows + + ! Already assigned nodes are always vetoed + if (assigned_local(ifree)) then + veto_local(ifree) = .TRUE. + cycle node_loop_local + end if + + ! Assume not vetoed, then check local Ad+Ad^T neighbours + veto_local(ifree) = .FALSE. + + do jfree = spst_ia(ifree)+1, spst_ia(ifree+1) + + ! Skip the diagonal - Ad+Ad^T includes diagonal entries from the original + ! matrix but we only care about off-diagonal strong connections + if (spst_ja(jfree) + 1 == ifree) cycle + + ! Have to only check unassigned strong neighbours + if (assigned_local(spst_ja(jfree) + 1)) cycle + + ! Check the measure_local + if (measure_local(ifree) .ge. measure_local(spst_ja(jfree) + 1)) then + veto_local(ifree) = .TRUE. + cycle node_loop_local + end if + end do + end do node_loop_local + + ! ~~~~~~~~ + ! Finish the async broadcast, assigned_nonlocal is now correct + ! ~~~~~~~~ + if (comm_size /= 1) then + call boolscatter_mat_end_c(A_array, assigned_local_ptr, assigned_nonlocal_ptr) + end if + + ! ~~~~~~~~ + ! Non-local influence veto using Ao^T + ! For each nonlocal column k, Ao^T row k tells us which local rows + ! have connections TO nonlocal node k. If nonlocal node k is unassigned + ! and has measure >= any of those local rows' measures, then nonlocal node k + ! has a local influence that vetoes it. + ! 
We set veto_nonlocal(k) = TRUE which will be reverse scattered back to + ! the owning processor to veto their local node. + ! ~~~~~~~~ + if (comm_size /= 1) then + + veto_nonlocal = .FALSE. + + if (destroy_aot) then + do kfree = 1, cols_ao + + ! Only check unassigned nonlocal nodes + if (assigned_nonlocal(kfree)) cycle + + do jfree = aot_ia(kfree)+1, aot_ia(kfree+1) + + ! The column index in Ao^T is a local row index + ! Have to only check unassigned local rows + if (assigned_local(aot_ja(jfree) + 1)) cycle + + ! If the nonlocal node's measure >= local row's measure, + ! the nonlocal node is vetoed by this local influence + if (measure_nonlocal(kfree) .ge. measure_local(aot_ja(jfree) + 1)) then + veto_nonlocal(kfree) = .TRUE. + exit + end if + end do + end do + end if + + ! ~~~~~~~~ + ! Reverse scatter the veto: veto_nonlocal → veto_local with LOR + ! After this, veto_local(i) is TRUE if any non-local transpose neighbour + ! vetoes local node i + ! ~~~~~~~~ + call boolscatter_mat_reverse_begin_c(A_array, veto_local_ptr, veto_nonlocal_ptr) + ! Not sure we have any chance to overlap this with anything else + call boolscatter_mat_reverse_end_c(A_array, veto_local_ptr, veto_nonlocal_ptr) + + ! ~~~~~~~~ + ! Now the comms have finished, we know exactly which local nodes on this rank have no + ! local strong dependencies, influences, non-local influences but not yet non-local + ! dependencies + ! Let's do the non-local dependencies and then now that the comms are done on veto_local + ! the combination of both of those gives us all our vetos, so we can assign anything + ! without a veto into the set + ! ~~~~~~~~ + node_loop: do ifree = 1, local_rows + + ! Check if already vetoed (by local check or reverse scatter) or already assigned + if (veto_local(ifree)) cycle node_loop + + ! Loop over all the active strong neighbours on the non-local processors + do jfree = ao_ia(ifree)+1, ao_ia(ifree+1) + + ! 
Have to only check unassigned strong neighbours + if (assigned_nonlocal(ao_ja(jfree) + 1)) cycle + + ! Check the measure_local + if (measure_local(ifree) .ge. measure_nonlocal(ao_ja(jfree) + 1)) then + veto_local(ifree) = .TRUE. + cycle node_loop + end if + end do + + end do node_loop + end if + + ! ~~~~~~~~ + ! We now know all nodes which were added to the set this loop + ! Nodes with veto_local = FALSE are in the set + ! Record them in cf_markers and assigned_local, and track which were + ! just assigned for the neighbour marking phase + ! ~~~~~~~~ + do ifree = 1, local_rows + if (.NOT. veto_local(ifree)) then + in_set_this_loop(ifree) = .TRUE. + assigned_local(ifree) = .TRUE. + cf_markers_local(ifree) = F_POINT + else + in_set_this_loop(ifree) = .FALSE. + end if + end do + + ! ~~~~~~~~~~~~~~ + ! All the work below here is now to ensure assigned_local is correct for the next iteration + ! We need to mark all neighbours of just-assigned nodes as assigned (C points) + ! This has four components: + ! 1. Local dependencies and influences via Ad+Ad^T + ! 2. Non-local dependencies via Ao + reverse scatter (existing pattern) + ! 3. Non-local influences via forward scatter of in_set + Ao^T + ! ~~~~~~~~~~~~~~ + + ! ~~~~~~~~~~~~~~ + ! 1. Local: mark Ad+Ad^T neighbours of just-assigned nodes + ! This covers both local strong dependencies and local strong influences + ! ~~~~~~~~~~~~~~ + do ifree = 1, local_rows + + ! Only need to update neighbours of nodes assigned this top loop + if (.NOT. in_set_this_loop(ifree)) cycle + + ! Don't need a guard here to check if they're already assigned, as we + ! can guarantee they won't be + do jfree = spst_ia(ifree)+1, spst_ia(ifree+1) + ! Skip the diagonal + if (spst_ja(jfree) + 1 == ifree) cycle + assigned_local(spst_ja(jfree) + 1) = .TRUE. + end do + end do + + if (comm_size /= 1) then + + ! ~~~~~~~~~~~~~~ + ! 2. Non-local dependencies: for each just-assigned local node, mark its + ! 
Ao neighbours as assigned on the owning processor via reverse scatter + ! This tells remote processors: "your local row is now assigned because + ! it is a forward neighbour of one of my just-assigned nodes" + ! ~~~~~~~~~~~~~~ + + ! We reuse veto_nonlocal for the reverse scatter + veto_nonlocal = .FALSE. + + do ifree = 1, local_rows + + ! Only need to update neighbours of nodes assigned this top loop + if (.NOT. in_set_this_loop(ifree)) cycle + + ! We know all neighbours of points assigned this loop are C points + ! We don't actually need to record that they're C points, just that they're assigned + do jfree = ao_ia(ifree)+1, ao_ia(ifree+1) + veto_nonlocal(ao_ja(jfree) + 1) = .TRUE. + end do + end do + + ! ~~~~~~~~~~~ + ! Reduce LOR of veto_nonlocal into assigned_local + ! After this comms finishes any local node in another processors halo + ! that has been assigned on another process will be correctly marked in assigned_local + ! ~~~~~~~~~~~ + call boolscatter_mat_reverse_begin_c(A_array, assigned_local_ptr, veto_nonlocal_ptr) + call boolscatter_mat_reverse_end_c(A_array, assigned_local_ptr, veto_nonlocal_ptr) + + ! ~~~~~~~~~~~~~~ + ! 3. Non-local influences: we need to know which nonlocal nodes were just + ! assigned this loop so we can mark their local transpose neighbours + ! Forward scatter in_set_this_loop to learn which nonlocal columns were just assigned + ! Then iterate over Ao^T to mark local rows that are influenced by those nodes + ! ~~~~~~~~~~~~~~ + + ! Reuse veto_nonlocal to receive the forward scatter result + veto_nonlocal = .FALSE. + call boolscatter_mat_begin_c(A_array, in_set_ptr, veto_nonlocal_ptr) + call boolscatter_mat_end_c(A_array, in_set_ptr, veto_nonlocal_ptr) + + ! Now veto_nonlocal(k) = TRUE means the remote node at nonlocal column k + ! was just assigned this loop on its owning processor + ! All local rows that connect to that nonlocal node via Ao (i.e. Ao^T row k) + ! 
should be marked as assigned + if (destroy_aot) then + do kfree = 1, cols_ao + if (.NOT. veto_nonlocal(kfree)) cycle + + do jfree = aot_ia(kfree)+1, aot_ia(kfree+1) + assigned_local(aot_ja(jfree) + 1) = .TRUE. + end do + end do + end if + + end if + + ! ~~~~~~~~~~~~ + ! We've now done another top level loop + ! ~~~~~~~~~~~~ + loops_through = loops_through - 1 + + ! ~~~~~~~~~~~~ + ! Check the total number of undecided in parallel before we loop again + ! ~~~~~~~~~~~~ + if (max_luby_steps < 0) then + ! Count how many are undecided + counter_undecided = local_rows - count(assigned_local) + ! Parallel reduction! + A_array = strength_mat%v + call allreducesum_petscint_mine(A_array, counter_undecided, counter_parallel) + counter_undecided = counter_parallel + end if + end do + + ! Any unassigned become C points + do ifree = 1, local_rows + if (cf_markers_local(ifree) == 0) cf_markers_local(ifree) = C_POINT + end do + + ! ~~~~~~~~~~~~ + ! We're finished our IS now + ! ~~~~~~~~~~~~ + + ! Restore the sequential pointers once we're done + call MatRestoreRowIJ(Ad_spst,shift,symmetric,inodecompressed,n_spst,spst_ia,spst_ja,done,ierr) + call MatRestoreRowIJ(Ad,shift,symmetric,inodecompressed,n_ad,ad_ia,ad_ja,done,ierr) + if (comm_size /= 1) then + call MatRestoreRowIJ(Ao,shift,symmetric,inodecompressed,n_ao,ao_ia,ao_ja,done,ierr) + if (destroy_aot) then + call MatRestoreRowIJ(Ao_transpose,shift,symmetric,inodecompressed,n_aot,aot_ia,aot_ja,done,ierr) + end if + end if + + ! ~~~~~~~~~ + ! Cleanup + ! ~~~~~~~~~ + if (destroy_spst) call MatDestroy(Ad_spst, ierr) + if (destroy_aot) call MatDestroy(Ao_transpose, ierr) + + deallocate(in_set_this_loop, assigned_local, veto_local) + if (comm_size/=1) then + call VecDestroy(measure_vec, ierr) + ! Don't forget to restore on lvec from our matrix + call vecscatter_mat_restore_c(A_array, measure_nonlocal_ptr) + end if + deallocate(assigned_nonlocal, veto_nonlocal) + + end subroutine pmisr_existing_measure_implicit_transpose + + ! 
------------------------------------------------------------------------------------------------------------------------------- + +end module pmisr_module + diff --git a/src/PMISR_Modulek.kokkos.cxx b/src/PMISR_Modulek.kokkos.cxx new file mode 100644 index 00000000..70311cd0 --- /dev/null +++ b/src/PMISR_Modulek.kokkos.cxx @@ -0,0 +1,1189 @@ +// Our petsc kokkos definitions - has to go first +#include "kokkos_helper.hpp" +#include +#include <../src/mat/impls/aij/seq/aij.h> +#include <../src/mat/impls/aij/mpi/mpiaij.h> + +// The definition of the device copy of the cf markers on a given level +// is stored in Device_Datak.kokkos.cxx and imported as extern from +// kokkos_helper.hpp + +//------------------------------------------------------------------------------------------------------------------------ + +// PMISR implementation that takes an existing measure and cf_markers on the device +// and then does the Luby algorithm to assign the rest of the CF markers +// This mirrors the CPU version pmisr_existing_measure_cf_markers in PMISR_Module.F90 +PETSC_INTERN void pmisr_existing_measure_cf_markers_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int) +{ + + MPI_Comm MPI_COMM_MATRIX; + PetscInt local_rows, local_cols, global_rows, global_cols; + PetscInt global_row_start, global_row_end_plus_one; + PetscInt rows_ao, cols_ao; + MatType mat_type; + + PetscCallVoid(MatGetType(*strength_mat, &mat_type)); + // Are we in parallel? 
+ const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; + + Mat_MPIAIJ *mat_mpi = nullptr; + Mat mat_local = NULL, mat_nonlocal = NULL; + + if (mpi) + { + mat_mpi = (Mat_MPIAIJ *)(*strength_mat)->data; + PetscCallVoid(MatMPIAIJGetSeqAIJ(*strength_mat, &mat_local, &mat_nonlocal, NULL)); + PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); + } + else + { + mat_local = *strength_mat; + } + + // Get the comm + PetscCallVoid(PetscObjectGetComm((PetscObject)*strength_mat, &MPI_COMM_MATRIX)); + PetscCallVoid(MatGetLocalSize(*strength_mat, &local_rows, &local_cols)); + PetscCallVoid(MatGetSize(*strength_mat, &global_rows, &global_cols)); + // This returns the global index of the local portion of the matrix + PetscCallVoid(MatGetOwnershipRange(*strength_mat, &global_row_start, &global_row_end_plus_one)); + + // ~~~~~~~~~~~~ + // Get pointers to the i,j,vals on the device + // ~~~~~~~~~~~~ + const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; + PetscMemType mtype; + PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; + PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local, &device_local_i, &device_local_j, &device_local_vals, &mtype)); + if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, &device_nonlocal_vals, &mtype)); + + // PetscSF comms cannot be started with a pointer derived from a zero-extent Kokkos view - + // doing so causes intermittent failures in parallel on GPUs. Use a size-1 dummy view + // so that every pointer passed to PetscSF is always backed by valid device memory. + intKokkosView sf_int_dummy_d("sf_int_dummy_d", 1); + PetscScalarKokkosView sf_scalar_dummy_d("sf_scalar_dummy_d", 1); + + intKokkosView cf_markers_nonlocal_d; + int *cf_markers_d_ptr = NULL, *cf_markers_nonlocal_d_ptr = NULL; + cf_markers_d_ptr = local_rows > 0 ? 
cf_markers_d.data() : sf_int_dummy_d.data(); + + intKokkosView cf_markers_send_d; + int *cf_markers_send_d_ptr = NULL; + + PetscScalar *measure_local_d_ptr = NULL, *measure_nonlocal_d_ptr = NULL; + measure_local_d_ptr = local_rows > 0 ? measure_local_d.data() : sf_scalar_dummy_d.data(); + PetscScalarKokkosView measure_nonlocal_d; + + if (mpi) { + measure_nonlocal_d = PetscScalarKokkosView("measure_nonlocal_d", cols_ao); + measure_nonlocal_d_ptr = cols_ao > 0 ? measure_nonlocal_d.data() : sf_scalar_dummy_d.data(); + cf_markers_nonlocal_d = intKokkosView("cf_markers_nonlocal_d", cols_ao); + cf_markers_nonlocal_d_ptr = cols_ao > 0 ? cf_markers_nonlocal_d.data() : sf_int_dummy_d.data(); + cf_markers_send_d = intKokkosView("cf_markers_send_d", local_rows); + cf_markers_send_d_ptr = local_rows > 0 ? cf_markers_send_d.data() : sf_int_dummy_d.data(); + } + + // Device memory for the mark + boolKokkosView mark_d("mark_d", local_rows); + auto exec = PetscGetKokkosExecutionSpace(); + + // Start the scatter of the measure - the kokkos memtype is set as PETSC_MEMTYPE_HOST or + // one of the kokkos backends like PETSC_MEMTYPE_HIP + PetscMemType mem_type = PETSC_MEMTYPE_KOKKOS; + if (mpi) + { + // PetscSF owns measure_local_d_ptr as the active send buffer until End. + // Do not even read from that send buffer before End is called. + // If you alias it in overlapped GPU work, the failure shows up intermittently + // in parallel runs on GPUs. 
+ PetscCallVoid(PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPIU_SCALAR, + mem_type, measure_local_d_ptr, + mem_type, measure_nonlocal_d_ptr, + MPI_REPLACE)); + } + + // Initialise the set + PetscInt counter_in_set_start = 0; + // Count how many in the set to begin with and set their CF markers + Kokkos::parallel_reduce ("Reduction", local_rows, KOKKOS_LAMBDA (const PetscInt i, PetscInt& update) { + // If already assigned by the input + if (cf_markers_d(i) != 0) + { + update++; + } + else if (Kokkos::abs(measure_local_d[i]) < 1) + { + if (zero_measure_c_point_int == 1) { + if (pmis_int == 1) { + // Set as F here but reversed below to become C + cf_markers_d(i) = -1; + } + else { + // Becomes C + cf_markers_d(i) = 1; + } + } + else { + if (pmis_int == 1) { + // Set as C here but reversed below to become F + // Otherwise dirichlet conditions persist down onto the coarsest grid + cf_markers_d(i) = 1; + } + else { + // Becomes F + cf_markers_d(i) = -1; + } + } + // Count + update++; + } + }, counter_in_set_start); + + // Check the total number of undecided in parallel + PetscInt counter_undecided, counter_parallel; + if (max_luby_steps < 0) { + counter_undecided = local_rows - counter_in_set_start; + // Parallel reduction! + PetscCallMPIAbort(MPI_COMM_MATRIX, MPI_Allreduce(&counter_undecided, &counter_parallel, 1, MPIU_INT, MPI_SUM, MPI_COMM_MATRIX)); + counter_undecided = counter_parallel; + + // If we're doing a fixed number of steps, then we don't care + // how many undecided nodes we have - have to take care here not to use + // local_rows for counter_undecided, as we may have zero DOFs on some procs + // but we have to enter the loop below for the collective scatters + } + else { + counter_undecided = 1; + } + + // Finish the broadcast for the nonlocal measure + if (mpi) + { + // End releases the active send buffer for normal access again. + // The scattered values in measure_nonlocal_d are now safe to consume. 
+ PetscCallVoid(PetscSFBcastEnd(mat_mpi->Mvctx, MPIU_SCALAR, measure_local_d_ptr, measure_nonlocal_d_ptr, MPI_REPLACE)); + } + + // ~~~~~~~~~~~~ + // Now go through the outer Luby loop + // ~~~~~~~~~~~~ + + // Let's keep track of how many times we go through the loops + int loops_through = -1; + do + { + // Match the fortran version and include a pre-test on the do-while + if (counter_undecided == 0) break; + + // If max_luby_steps is positive, then we only take that many times through this top loop + // We typically find 2-3 iterations decides >99% of the nodes + // and a fixed number of outer loops means we don't have to do any parallel reductions + // We will do redundant nearest neighbour comms in the case we have already + // finished deciding all the nodes, but who cares + // Any undecided nodes just get turned into C points + // We can do this as we know we won't ruin Aff by doing so, unlike in a normal multigrid + if (max_luby_steps > 0 && max_luby_steps+1 == -loops_through) break; + + // ~~~~~~~~~ + // Start the async scatter of the nonlocal cf_markers + // ~~~~~~~~~ + if (mpi) { + // Copy cf_markers_d into a temporary buffer + // If we gave the comms routine cf_markers_d we couldn't even read from + // it until comms ended, meaning we couldn't do the work overlapping below + Kokkos::deep_copy(cf_markers_send_d, cf_markers_d); + // Be careful these aren't petscints + // PetscSF owns cf_markers_send_d_ptr as the active send buffer until End. + // Do not even read from that send buffer before End is called. + // If you alias it in overlapped GPU work, the failure shows up intermittently + // in parallel runs on GPUs. 
+ PetscCallVoid(PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT, + mem_type, cf_markers_send_d_ptr, + mem_type, cf_markers_nonlocal_d_ptr, + MPI_REPLACE)); + } + + + // mark_d keeps track of which of the candidate nodes can become in the set + // Only need this because we want to do async comms so we need a way to trigger + // a node not being in the set due to either strong local neighbours *or* strong offproc neighbours + + // ~~~~~~~~ + // Go and do the local component + // ~~~~~~~~ + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + // Row + const PetscInt i = t.league_rank(); + PetscInt strong_neighbours = 0; + + // Check this row isn't already marked + if (cf_markers_d(i) == 0) + { + const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; + + // Reduce over local columns to get the number of strong neighbours + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(t, ncols_local), + [&](const PetscInt j, PetscInt& strong_count) { + + // Have to only check active strong neighbours + if (measure_local_d(i) >= measure_local_d(device_local_j[device_local_i[i] + j]) && \ + cf_markers_d(device_local_j[device_local_i[i] + j]) == 0) + { + strong_count++; + } + + }, strong_neighbours + ); + + // Only want one thread in the team to write the result + Kokkos::single(Kokkos::PerTeam(t), [&]() { + // If we have any strong neighbours + if (strong_neighbours > 0) + { + mark_d(i) = false; + } + else + { + mark_d(i) = true; + } + }); + } + // Any that aren't zero cf marker are already assigned so set to to false + else + { + // Only want one thread in the team to write the result + Kokkos::single(Kokkos::PerTeam(t), [&]() { + mark_d(i) = false; + }); + } + }); + + // ~~~~~~~~ + // Now go through and do the non-local part of the matrix + // ~~~~~~~~ + if (mpi) { + + // Finish the async scatter + // Be careful these aren't petscints + // End releases the send snapshot for normal 
access again. + // The scattered cf_markers_nonlocal_d values are now safe to read. + PetscCallVoid(PetscSFBcastEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_send_d_ptr, cf_markers_nonlocal_d_ptr, MPI_REPLACE)); + + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + // Row + const PetscInt i = t.league_rank(); + PetscInt strong_neighbours = 0; + + // Check this row isn't already marked + if (mark_d(i)) + { + PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; + + // Reduce over nonlocal columns to get the number of strong neighbours + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(t, ncols_nonlocal), + [&](const PetscInt j, PetscInt& strong_count) { + + if (measure_local_d(i) >= measure_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]) && \ + cf_markers_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]) == 0) + { + strong_count++; + } + + }, strong_neighbours + ); + + // Only want one thread in the team to write the result + Kokkos::single(Kokkos::PerTeam(t), [&]() { + // If we don't have any strong neighbours + if (strong_neighbours == 0) cf_markers_d(i) = loops_through; + }); + } + }); + } + // This cf_markers_d(i) = loops_through happens above in the case of mpi, saves a kernel launch + else + { + // The nodes that have mark equal to true have no strong active neighbours in the IS + // hence they can be in the IS + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { + + if (mark_d(i)) cf_markers_d(i) = loops_through; + }); + } + + if (mpi) + { + // We're going to do an add reverse scatter, so set them to zero + Kokkos::deep_copy(cf_markers_nonlocal_d, 0); + + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + // Row + const PetscInt i = t.league_rank(); + + // Check if this node has been assigned during this top loop + if 
(cf_markers_d(i) == loops_through) + { + PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; + + // For over nonlocal columns + Kokkos::parallel_for( + Kokkos::TeamThreadRange(t, ncols_nonlocal), [&](const PetscInt j) { + + // Needs to be atomic as may being set by many threads + Kokkos::atomic_store(&cf_markers_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]), 1); + }); + } + }); + + // Ensure everything is done before we comm + exec.fence(); + + // We've updated the values in cf_markers_nonlocal + // Calling a reverse scatter add will then update the values of cf_markers_local + // Reduce with a sum, equivalent to VecScatterBegin with ADD_VALUES, SCATTER_REVERSE + // Be careful these aren't petscints + // PetscSF now owns cf_markers_nonlocal_d_ptr as the active send buffer. + // The local kernel below only touches cf_markers_d, and that is fine here + // because we only care about zero versus nonzero after ReduceEnd. + PetscCallVoid(PetscSFReduceWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT, + mem_type, cf_markers_nonlocal_d_ptr, + mem_type, cf_markers_d_ptr, + MPIU_SUM)); + } + + // Go and do local + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + // Row + const PetscInt i = t.league_rank(); + + // Check if this node has been assigned during this top loop + if (cf_markers_d(i) == loops_through) + { + const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; + + // For over nonlocal columns + Kokkos::parallel_for( + Kokkos::TeamThreadRange(t, ncols_local), [&](const PetscInt j) { + + // Needs to be atomic as may being set by many threads + // Tried a version where instead of a "push" approach I tried a pull approach + // that doesn't need an atomic, but it was slower + Kokkos::atomic_store(&cf_markers_d(device_local_j[device_local_i[i] + j]), 1); + }); + } + }); + + if (mpi) + { + // Finish the scatter + // Be careful these aren't petscints 
+ // After End the accumulated cf_markers_d values are complete. + // This is the first point where later logic should consume the reduced + // result rather than the in-flight root buffer. + PetscCallVoid(PetscSFReduceEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_nonlocal_d_ptr, cf_markers_d_ptr, MPIU_SUM)); + } + + // We've done another top level loop + loops_through = loops_through - 1; + + // ~~~~~~~~~~~~ + // Check the total number of undecided in parallel before we loop again + // ~~~~~~~~~~~~ + if (max_luby_steps < 0) { + + counter_undecided = 0; + Kokkos::parallel_reduce ("ReductionCounter_undecided", local_rows, KOKKOS_LAMBDA (const PetscInt i, PetscInt& update) { + if (cf_markers_d(i) == 0) update++; + }, counter_undecided); + + // Parallel reduction! + PetscCallMPIAbort(MPI_COMM_MATRIX, MPI_Allreduce(&counter_undecided, &counter_parallel, 1, MPIU_INT, MPI_SUM, MPI_COMM_MATRIX)); + counter_undecided = counter_parallel; + } else { + // If we're doing a fixed number of steps, then we need an extra fence + // as we don't hit the parallel reduce above (which implicitly fences) + exec.fence(); + } + + } + while (counter_undecided != 0); + + // ~~~~~~~~~ + // Now assign our final cf markers + // ~~~~~~~~~ + + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { + + if (cf_markers_d(i) == 0) + { + cf_markers_d(i) = 1; + } + else if (cf_markers_d(i) < 0) + { + cf_markers_d(i) = -1; + } + else + { + cf_markers_d(i) = 1; + } + }); + // Ensure we're done before we exit + exec.fence(); + + return; +} + +//------------------------------------------------------------------------------------------------------------------------ + +// PMISR implementation that takes an existing measure and cf_markers on the device +// and then does the Luby algorithm to assign the rest of the CF markers +// This version takes S (not S+S^T) as the strength matrix and handles the transpose +// implicitly - it never forms the full parallel S+S^T +// See the 
full comments in the CPU version pmisr_existing_measure_implicit_transpose +PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscScalarKokkosView &measure_local_d, intKokkosView &cf_markers_d, const int zero_measure_c_point_int) +{ + + MPI_Comm MPI_COMM_MATRIX; + PetscInt local_rows, local_cols, global_rows, global_cols; + PetscInt global_row_start, global_row_end_plus_one; + PetscInt rows_ao, cols_ao; + MatType mat_type; + + PetscCallVoid(MatGetType(*strength_mat, &mat_type)); + // Are we in parallel? + const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; + + Mat_MPIAIJ *mat_mpi = nullptr; + Mat mat_local = NULL, mat_nonlocal = NULL, mat_local_spst = NULL, mat_nonlocal_transpose = NULL; + + // ~~~~~~~~~~~~~~~~~~~~~ + // PMISR needs to work with S+S^T to keep out large entries from Aff + // but we never want to form S+S^T explicitly as it is expensive + // So instead we do several comms steps in our Luby loop to get/send the data we need + // We do compute local copies of the transpose of S (which happen on the device) + // but we never have the full parallel S+S^T + // On this rank we have the number of: + // local strong dependencies (from the local S) + // local strong influences (from the local S^T) + // non-local strong dependencies (from the non-local part of S) + // But we don't have the number of non-local strong influences (from the non-local part of S^T) + // Now we have to be careful as the local part of S and S^T may have entries in the same + // row/column position, so we have to be sure not to count them twice (the same can't happen + // for the non-local components) + // ~~~~~~~~~~~~~~~~~~~~~ + + Mat_SeqAIJKokkos *mat_nonlocal_kok, *mat_local_kok; + PetscInt zero = 0; + bool destroy_nonlocal_transpose = false; + bool destroy_spst = false; + + if (mpi) + { + mat_mpi = (Mat_MPIAIJ *)(*strength_mat)->data; + PetscCallVoid(MatMPIAIJGetSeqAIJ(*strength_mat, &mat_local, 
&mat_nonlocal, NULL)); + PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); + mat_nonlocal_kok = static_cast<Mat_SeqAIJKokkos *>(mat_nonlocal->spptr); + // The transpose can crash if mat_nonlocal is empty + if (mat_nonlocal_kok->csrmat.nnz() > zero) + { + PetscCallVoid(MatTranspose(mat_nonlocal, MAT_INITIAL_MATRIX, &mat_nonlocal_transpose)); + destroy_nonlocal_transpose = true; + } + } + else + { + mat_local = *strength_mat; + } + mat_local_kok = static_cast<Mat_SeqAIJKokkos *>(mat_local->spptr); + + // Get the comm + PetscCallVoid(PetscObjectGetComm((PetscObject)*strength_mat, &MPI_COMM_MATRIX)); + PetscCallVoid(MatGetLocalSize(*strength_mat, &local_rows, &local_cols)); + PetscCallVoid(MatGetSize(*strength_mat, &global_rows, &global_cols)); + // This returns the global index of the local portion of the matrix + PetscCallVoid(MatGetOwnershipRange(*strength_mat, &global_row_start, &global_row_end_plus_one)); + + // ~~~~~~~~~~~~ + // Form the local S+S^T and get CSR pointers + // We explicitly compute the local part of S+S^T so we don't have to + // match the row/column indices - could do this as a symbolic as we don't need the values + // ~~~~~~~~~~~~ + PetscScalar one = 1.0; + if (mat_local_kok->csrmat.nnz() > zero) + { + PetscCallVoid(MatTranspose(mat_local, MAT_INITIAL_MATRIX, &mat_local_spst)); + PetscCallVoid(MatAXPY(mat_local_spst, one, mat_local, DIFFERENT_NONZERO_PATTERN)); + destroy_spst = true; + } + else + { + mat_local_spst = mat_local; + } + + // ~~~~~~~~~~~~ + // Get pointers to the i,j on the device for all the matrices we need + // ~~~~~~~~~~~~ + const PetscInt *device_local_i_spst = nullptr, *device_local_j_spst = nullptr; + const PetscInt *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; + const PetscInt *device_nonlocal_i_transpose = nullptr, *device_nonlocal_j_transpose = nullptr; + PetscMemType mtype; + PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local_spst, &device_local_i_spst, &device_local_j_spst, NULL, &mtype)); + if (mpi) 
PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, NULL, &mtype)); + if (mpi && mat_nonlocal_transpose != NULL) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal_transpose, &device_nonlocal_i_transpose, &device_nonlocal_j_transpose, NULL, &mtype)); + + // PetscSF comms cannot be started with a pointer derived from a zero-extent Kokkos view - + // doing so causes intermittent failures in parallel on GPUs. Use a size-1 dummy view + // so that every pointer passed to PetscSF is always backed by valid device memory. + intKokkosView sf_int_dummy_d("sf_int_dummy_d", 1); + PetscScalarKokkosView sf_scalar_dummy_d("sf_scalar_dummy_d", 1); + boolKokkosView sf_bool_dummy_d("sf_bool_dummy_d", 1); + + intKokkosView cf_markers_nonlocal_d; + int *cf_markers_nonlocal_d_ptr = NULL; + + intKokkosView cf_markers_send_d; + int *cf_markers_send_d_ptr = NULL; + + PetscScalar *measure_local_d_ptr = NULL, *measure_nonlocal_d_ptr = NULL; + measure_local_d_ptr = local_rows > 0 ? measure_local_d.data() : sf_scalar_dummy_d.data(); + PetscScalarKokkosView measure_nonlocal_d; + + // ~~~~~~~~~~~~~~~ + // veto stores whether a node has been veto'd as a candidate + // .NOT. veto(i) means the node can be in the set + // veto(i) means the node cannot be in the set + // ~~~~~~~~~~~~~~~ + boolKokkosView veto_local_d("veto_local_d", local_rows); + boolKokkosView veto_nonlocal_d; + bool *veto_local_d_ptr = nullptr, *veto_nonlocal_d_ptr = nullptr; + veto_local_d_ptr = local_rows > 0 ? veto_local_d.data() : sf_bool_dummy_d.data(); + + if (mpi) { + measure_nonlocal_d = PetscScalarKokkosView("measure_nonlocal_d", cols_ao); + measure_nonlocal_d_ptr = cols_ao > 0 ? measure_nonlocal_d.data() : sf_scalar_dummy_d.data(); + cf_markers_nonlocal_d = intKokkosView("cf_markers_nonlocal_d", cols_ao); + cf_markers_nonlocal_d_ptr = cols_ao > 0 ? 
cf_markers_nonlocal_d.data() : sf_int_dummy_d.data(); + cf_markers_send_d = intKokkosView("cf_markers_send_d", local_rows); + cf_markers_send_d_ptr = local_rows > 0 ? cf_markers_send_d.data() : sf_int_dummy_d.data(); + veto_nonlocal_d = boolKokkosView("veto_nonlocal_d", cols_ao); + veto_nonlocal_d_ptr = cols_ao > 0 ? veto_nonlocal_d.data() : sf_bool_dummy_d.data(); + } + + auto exec = PetscGetKokkosExecutionSpace(); + + // The PETSC_MEMTYPE_KOKKOS is either as PETSC_MEMTYPE_HOST or + // one of the backends like PETSC_MEMTYPE_HIP + PetscMemType mem_type = PETSC_MEMTYPE_KOKKOS; + + // Start the scatter of the measure - the kokkos memtype is set as PETSC_MEMTYPE_HOST or + // one of the kokkos backends like PETSC_MEMTYPE_HIP + if (mpi) + { + // PetscSF owns measure_local_d_ptr as the active send buffer until End. + // Do not even read from that send buffer before End is called. + // If you alias it in overlapped GPU work, the failure shows up intermittently + // in parallel runs on GPUs. + PetscCallVoid(PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPIU_SCALAR, + mem_type, measure_local_d_ptr, + mem_type, measure_nonlocal_d_ptr, + MPI_REPLACE)); + } + + // ~~~~~~~~~~~~ + // Initialise the set + // ~~~~~~~~~~~~ + PetscInt counter_in_set_start = 0; + // Count how many in the set to begin with and set their CF markers + Kokkos::parallel_reduce ("Reduction", local_rows, KOKKOS_LAMBDA (const PetscInt i, PetscInt& update) { + // If already assigned by the input + if (cf_markers_d(i) != 0) + { + update++; + } + else if (Kokkos::abs(measure_local_d[i]) < 1) + { + if (zero_measure_c_point_int == 1) { + if (pmis_int == 1) { + // Set as F here but reversed below to become C + cf_markers_d(i) = -1; + } + else { + // Becomes C + cf_markers_d(i) = 1; + } + } + else { + if (pmis_int == 1) { + // Set as C here but reversed below to become F + // Otherwise dirichlet conditions persist down onto the coarsest grid + cf_markers_d(i) = 1; + } + else { + // Becomes F + cf_markers_d(i) = -1; 
+ } + } + // Count + update++; + } + }, counter_in_set_start); + + // Check the total number of undecided in parallel + PetscInt counter_undecided, counter_parallel; + if (max_luby_steps < 0) { + counter_undecided = local_rows - counter_in_set_start; + // Parallel reduction! + PetscCallMPIAbort(MPI_COMM_MATRIX, MPI_Allreduce(&counter_undecided, &counter_parallel, 1, MPIU_INT, MPI_SUM, MPI_COMM_MATRIX)); + counter_undecided = counter_parallel; + + // If we're doing a fixed number of steps, then we don't care + // how many undecided nodes we have - have to take care here not to use + // local_rows for counter_undecided, as we may have zero DOFs on some procs + // but we have to enter the loop below for the collective scatters + } + else { + counter_undecided = 1; + } + + // Finish the broadcast for the nonlocal measure + if (mpi) + { + // End releases the active send buffer for normal access again. + // The scattered values in measure_nonlocal_d are now safe to consume. + PetscCallVoid(PetscSFBcastEnd(mat_mpi->Mvctx, MPIU_SCALAR, measure_local_d_ptr, measure_nonlocal_d_ptr, MPI_REPLACE)); + } + + // ~~~~~~~~~~~~ + // Now go through the outer Luby loop + // ~~~~~~~~~~~~ + + // Let's keep track of how many times we go through the loops + int loops_through = -1; + do + { + // Match the fortran version and include a pre-test on the do-while + if (counter_undecided == 0) break; + + // If max_luby_steps is positive, then we only take that many times through this top loop + // We typically find 2-3 iterations decides >99% of the nodes + // and a fixed number of outer loops means we don't have to do any parallel reductions + // We will do redundant nearest neighbour comms in the case we have already + // finished deciding all the nodes, but who cares + // Any undecided nodes just get turned into C points + // We can do this as we know we won't ruin Aff by doing so, unlike in a normal multigrid + if (max_luby_steps > 0 && max_luby_steps+1 == -loops_through) break; + + // 
~~~~~~~~~ + // Start the async scatter of the nonlocal cf_markers + // ~~~~~~~~~ + if (mpi) { + // Copy cf_markers_d into a temporary buffer + // If we gave the comms routine cf_markers_d we couldn't even read from + // it until comms ended, meaning we couldn't do the work overlapping below + Kokkos::deep_copy(cf_markers_send_d, cf_markers_d); + exec.fence(); + // Be careful these aren't petscints + // PetscSF owns cf_markers_send_d_ptr as the active send buffer until End. + // Do not even read from that send buffer before End is called. + // If you alias it in overlapped GPU work, the failure shows up intermittently + // in parallel runs on GPUs. + PetscCallVoid(PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT, + mem_type, cf_markers_send_d_ptr, + mem_type, cf_markers_nonlocal_d_ptr, + MPI_REPLACE)); + } + + // ~~~~~~~~ + // Now we use veto to keep track of which candidates can be in the set + // Locally we know which ones cannot be in the set due to local strong dependencies (mat_local), + // strong influences (mat_local_transpose), and non-local dependencies (mat_nonlocal) + // but not the non-local influences as they are stored on many other ranks (ie in S^T) + // ~~~~~~~~ + + // Let's start by veto'ing any candidates that have strong local dependencies or influences + // using mat_local_spst which is the local S+S^T + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + // Row + const PetscInt i = t.league_rank(); + PetscInt strong_neighbours = 0; + + // Check this row is unassigned + if (cf_markers_d(i) == 0) + { + PetscInt ncols_local = device_local_i_spst[i + 1] - device_local_i_spst[i]; + + // Reduce over local columns in S+S^T to get the number of strong unassigned influences + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(t, ncols_local), + [&](const PetscInt j, PetscInt& strong_count) { + + const PetscInt col = device_local_j_spst[device_local_i_spst[i] + j]; + + // 
Skip the diagonal + // Have to only check active strong influences + if (measure_local_d(i) >= measure_local_d(col) && cf_markers_d(col) == 0 && col != i) + { + strong_count++; + } + + }, strong_neighbours + ); + + // Only want one thread in the team to write the result + Kokkos::single(Kokkos::PerTeam(t), [&]() { + // If we have any strong neighbours + if (strong_neighbours > 0) + { + veto_local_d(i) = true; + } + else + { + veto_local_d(i) = false; + } + }); + } + // Any that aren't zero cf marker are already assigned so set to true + else + { + // Only want one thread in the team to write the result + Kokkos::single(Kokkos::PerTeam(t), [&]() { + veto_local_d(i) = true; + }); + } + }); + + // ~~~~~~~~ + // Now let's go through and veto candidates which have strong influences on this rank + // ie non-local nodes that influence local nodes through S^T + // ~~~~~~~~ + if (mpi) { + + // Initialise to false + Kokkos::deep_copy(veto_nonlocal_d, false); + + // Finish the async scatter + // Be careful these aren't petscints + // End releases the send snapshot for normal access again. + // The scattered cf_markers_nonlocal_d values are now safe to read. 
+ PetscCallVoid(PetscSFBcastEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_send_d_ptr, cf_markers_nonlocal_d_ptr, MPI_REPLACE)); + + // Let's go and mark any non-local entries that have strong influences and comm to other ranks + // We iterate over the transpose of the non-local part of S + // Row k of Ao^T tells us which local rows connect to nonlocal column k + if (mat_nonlocal_transpose != NULL) + { + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, cols_ao, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + // Row + const PetscInt i = t.league_rank(); + PetscInt strong_influences = 0; + + // Check this row is unassigned + if (cf_markers_nonlocal_d(i) == 0) + { + PetscInt ncols_nonlocal = device_nonlocal_i_transpose[i + 1] - device_nonlocal_i_transpose[i]; + + // Reduce over nonlocal columns in the transpose to get the number of strong unassigned influences + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(t, ncols_nonlocal), + [&](const PetscInt j, PetscInt& strong_count) { + + const PetscInt col = device_nonlocal_j_transpose[device_nonlocal_i_transpose[i] + j]; + + // Have to only check active strong influences + if (measure_nonlocal_d(i) >= measure_local_d(col) && cf_markers_d(col) == 0) + { + strong_count++; + } + + }, strong_influences + ); + } + + // Only want one thread in the team to write the result + Kokkos::single(Kokkos::PerTeam(t), [&]() { + // If this non-local node has strong influences on this rank it may veto it + if (strong_influences > 0) veto_nonlocal_d(i) = true; + }); + }); + } + + // Ensure everything is done before we comm + exec.fence(); + + // Now we reduce the vetos with a lor + // This tells each rank whether any of its local nodes have been vetoed by non-local influences + PetscCallVoid(PetscSFReduceWithMemTypeBegin(mat_mpi->Mvctx, MPI_C_BOOL, + mem_type, veto_nonlocal_d_ptr, + mem_type, veto_local_d_ptr, + MPI_LOR)); + // Not sure we have any chance to overlap this with anything else + 
PetscCallVoid(PetscSFReduceEnd(mat_mpi->Mvctx, MPI_C_BOOL, veto_nonlocal_d_ptr, veto_local_d_ptr, MPI_LOR)); + + // Now the comms have finished, we know exactly which local nodes on this rank have no + // local strong dependencies, influences, non-local influences but not yet non-local dependencies + // Let's do the non-local dependencies and then now that the comms are done on veto_local_d + // the combination of both of those gives us all our vetos, so we can assign anything without + // a veto into the set + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + // Row + const PetscInt i = t.league_rank(); + PetscInt strong_neighbours = 0; + + // Check this row isn't already marked + if (!veto_local_d(i)) + { + PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; + + // Reduce over nonlocal columns to get the number of non-local strong unassigned dependencies + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(t, ncols_nonlocal), + [&](const PetscInt j, PetscInt& strong_count) { + + if (measure_local_d(i) >= measure_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]) && \ + cf_markers_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]) == 0) + { + strong_count++; + } + + }, strong_neighbours + ); + + // Only want one thread in the team to write the result + Kokkos::single(Kokkos::PerTeam(t), [&]() { + // If we don't have any non-local strong dependencies and the rest of our vetos are false + // we know we are in the set + if (strong_neighbours == 0 && !veto_local_d(i)) cf_markers_d(i) = loops_through; + }); + } + }); + } + // This cf_markers_d(i) = loops_through happens above in the case of mpi, saves a kernel launch + else + { + // The nodes that have .NOT. 
veto(i) have no strong active neighbours in the IS + // hence they can be in the IS + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { + + if (!veto_local_d(i)) cf_markers_d(i) = loops_through; + }); + } + + // ~~~~~~~~~~~~~ + // At this point all the local cf_markers that have been included in the set in this loop are correct + // We need to set all the strong neighbours of these as not in the set + // We can do all the local strong dependencies and influences without comms, but we need to do + // comms to set the non-local strong dependencies and influences + // ~~~~~~~~~~~~~ + + // Go and do local strong dependencies and influences via S+S^T + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + // Row + const PetscInt i = t.league_rank(); + + // Check if this node has been assigned during this top loop + if (cf_markers_d(i) == loops_through) + { + // Do the strong dependencies and influences + PetscInt ncols_local = device_local_i_spst[i + 1] - device_local_i_spst[i]; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(t, ncols_local), [&](const PetscInt j) { + + const PetscInt col = device_local_j_spst[device_local_i_spst[i] + j]; + + // Skip the diagonal - we don't want to mark ourselves as a neighbor + // Needs to be atomic as may being set by many threads + if (cf_markers_d(col) != 1 && col != i) + { + Kokkos::atomic_store(&cf_markers_d(col), 1); + } + }); + } + }); + + // Now we need to set any non-local dependencies or influences of local nodes added to the set in this loop + // to be not in the set + if (mpi) + { + // Now for the influences, we need to broadcast the cf_markers so that + // on other ranks we know which nodes have cf_markers_nonlocal_d(i) == loops_through + // Copy cf_markers_d into a temporary buffer for the forward scatter + Kokkos::deep_copy(cf_markers_send_d, cf_markers_d); + exec.fence(); + // Be careful these 
aren't petscints + PetscCallVoid(PetscSFBcastWithMemTypeBegin(mat_mpi->Mvctx, MPI_INT, + mem_type, cf_markers_send_d_ptr, + mem_type, cf_markers_nonlocal_d_ptr, + MPI_REPLACE)); + + // We can overlap this with setting the non-local dependencies + + // We use the veto arrays here to do this comms + Kokkos::deep_copy(veto_nonlocal_d, false); + Kokkos::deep_copy(veto_local_d, false); + + // Set non-local strong dependencies + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, local_rows, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + // Row + const PetscInt i = t.league_rank(); + + // Check if this node has been assigned during this top loop + if (cf_markers_d(i) == loops_through) + { + PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; + + // For over nonlocal columns + Kokkos::parallel_for( + Kokkos::TeamThreadRange(t, ncols_nonlocal), [&](const PetscInt j) { + + // Needs to be atomic as may being set by many threads + // If false set it with true, if not do nothing + Kokkos::atomic_compare_exchange(&veto_nonlocal_d(device_nonlocal_j[device_nonlocal_i[i] + j]), false, true); + }); + } + }); + + // Finish the forward scatter before we write to cf_markers_d + // Also ensure this broadcast is done before we launch another on the same SF mat_mpi->Mvctx + PetscCallVoid(PetscSFBcastEnd(mat_mpi->Mvctx, MPI_INT, cf_markers_send_d_ptr, cf_markers_nonlocal_d_ptr, MPI_REPLACE)); + + // Ensure everything is done before we comm + exec.fence(); + + // Now we reduce the veto_nonlocal_d with a lor + // Any local node with veto set to true is not in the set + PetscCallVoid(PetscSFReduceWithMemTypeBegin(mat_mpi->Mvctx, MPI_C_BOOL, + mem_type, veto_nonlocal_d_ptr, + mem_type, veto_local_d_ptr, + MPI_LOR)); + // Not sure if there is anywhere to overlap these comms + PetscCallVoid(PetscSFReduceEnd(mat_mpi->Mvctx, MPI_C_BOOL, veto_nonlocal_d_ptr, veto_local_d_ptr, MPI_LOR)); + + // Let's finish the non-local dependencies + // If this node 
has been veto'd, then set it to not in the set + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { + if (veto_local_d(i)) { + cf_markers_d(i) = 1; + } + }); + + // And now we have the information we need to set any of the non-local influences + if (mat_nonlocal_transpose != NULL) + { + Kokkos::parallel_for( + Kokkos::TeamPolicy<>(exec, cols_ao, Kokkos::AUTO()), + KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { + + // Row + const PetscInt i = t.league_rank(); + + // Check if this node has been assigned during this top loop + if (cf_markers_nonlocal_d(i) == loops_through) + { + PetscInt ncols_nonlocal = device_nonlocal_i_transpose[i + 1] - device_nonlocal_i_transpose[i]; + + // For over nonlocal columns + Kokkos::parallel_for( + Kokkos::TeamThreadRange(t, ncols_nonlocal), [&](const PetscInt j) { + + // Needs to be atomic as may being set by many threads + if (cf_markers_d(device_nonlocal_j_transpose[device_nonlocal_i_transpose[i] + j]) != 1) + { + Kokkos::atomic_store(&cf_markers_d(device_nonlocal_j_transpose[device_nonlocal_i_transpose[i] + j]), 1); + } + }); + } + }); + } + } + + // We've done another top level loop + loops_through = loops_through - 1; + + // ~~~~~~~~~~~~ + // Check the total number of undecided in parallel before we loop again + // ~~~~~~~~~~~~ + if (max_luby_steps < 0) { + + counter_undecided = 0; + Kokkos::parallel_reduce ("ReductionCounter_undecided", local_rows, KOKKOS_LAMBDA (const PetscInt i, PetscInt& update) { + if (cf_markers_d(i) == 0) update++; + }, counter_undecided); + + // Parallel reduction! 
+ PetscCallMPIAbort(MPI_COMM_MATRIX, MPI_Allreduce(&counter_undecided, &counter_parallel, 1, MPIU_INT, MPI_SUM, MPI_COMM_MATRIX)); + counter_undecided = counter_parallel; + } else { + // If we're doing a fixed number of steps, then we need an extra fence + // as we don't hit the parallel reduce above (which implicitly fences) + exec.fence(); + } + + } + while (counter_undecided != 0); + + // Cleanup the local transposes + if (destroy_spst) PetscCallVoid(MatDestroy(&mat_local_spst)); + if (destroy_nonlocal_transpose) PetscCallVoid(MatDestroy(&mat_nonlocal_transpose)); + + // ~~~~~~~~~ + // Now assign our final cf markers + // ~~~~~~~~~ + + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { + + if (cf_markers_d(i) == 0) + { + cf_markers_d(i) = 1; + } + else if (cf_markers_d(i) < 0) + { + cf_markers_d(i) = -1; + } + else + { + cf_markers_d(i) = 1; + } + }); + // Ensure we're done before we exit + exec.fence(); + + return; +} + +//------------------------------------------------------------------------------------------------------------------------ + +// PMISR cf splitting but on the device +// This no longer copies back to the host pointer cf_markers_local at the end +// You have to explicitly call copy_cf_markers_d2h(cf_markers_local) to do this +PETSC_INTERN void pmisr_kokkos(Mat *strength_mat, const int max_luby_steps, const int pmis_int, PetscReal *measure_local, const int zero_measure_c_point_int) +{ + + MPI_Comm MPI_COMM_MATRIX; + PetscInt local_rows, local_cols, global_rows, global_cols; + PetscInt rows_ao, cols_ao; + MatType mat_type; + + PetscCallVoid(MatGetType(*strength_mat, &mat_type)); + // Are we in parallel? 
+ const bool mpi = strcmp(mat_type, MATMPIAIJKOKKOS) == 0; + + Mat mat_local = NULL, mat_nonlocal = NULL; + + if (mpi) + { + PetscCallVoid(MatMPIAIJGetSeqAIJ(*strength_mat, &mat_local, &mat_nonlocal, NULL)); + PetscCallVoid(MatGetSize(mat_nonlocal, &rows_ao, &cols_ao)); + } + else + { + mat_local = *strength_mat; + } + + // Get the comm + PetscCallVoid(PetscObjectGetComm((PetscObject)*strength_mat, &MPI_COMM_MATRIX)); + PetscCallVoid(MatGetLocalSize(*strength_mat, &local_rows, &local_cols)); + PetscCallVoid(MatGetSize(*strength_mat, &global_rows, &global_cols)); + + // ~~~~~~~~~~~~ + // Get pointers to the i,j,vals on the device + // ~~~~~~~~~~~~ + const PetscInt *device_local_i = nullptr, *device_local_j = nullptr, *device_nonlocal_i = nullptr, *device_nonlocal_j = nullptr; + PetscMemType mtype; + PetscScalar *device_local_vals = nullptr, *device_nonlocal_vals = nullptr; + PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_local, &device_local_i, &device_local_j, &device_local_vals, &mtype)); + if (mpi) PetscCallVoid(MatSeqAIJGetCSRAndMemType(mat_nonlocal, &device_nonlocal_i, &device_nonlocal_j, &device_nonlocal_vals, &mtype)); + + // Device memory for the global variable cf_markers_local_d - be careful these aren't petsc ints + cf_markers_local_d = intKokkosView("cf_markers_local_d", local_rows); + // Can't use the global directly within the parallel + // regions on the device so just take a shallow copy + intKokkosView cf_markers_d = cf_markers_local_d; + + // Host and device memory for the measure + PetscScalarKokkosViewHost measure_local_h(measure_local, local_rows); + PetscScalarKokkosView measure_local_d("measure_local_d", local_rows); + + auto exec = PetscGetKokkosExecutionSpace(); + + // If you want to generate the randoms on the device + //Kokkos::Random_XorShift64_Pool<> random_pool(/*seed=*/12345); + // Copy the input measure from host to device + Kokkos::deep_copy(measure_local_d, measure_local_h); + // Log copy with petsc + size_t bytes = 
measure_local_h.extent(0) * sizeof(PetscReal); + PetscCallVoid(PetscLogCpuToGpu(bytes)); + + // Compute the measure + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { + + // Randoms on the device + // auto generator = random_pool.get_state(); + // measure_local_d(i) = generator.drand(0., 1.); + // random_pool.free_state(generator); + + const PetscInt ncols_local = device_local_i[i + 1] - device_local_i[i]; + measure_local_d(i) += ncols_local; + + if (mpi) + { + PetscInt ncols_nonlocal = device_nonlocal_i[i + 1] - device_nonlocal_i[i]; + measure_local_d(i) += ncols_nonlocal; + } + // Flip the sign if pmis + if (pmis_int == 1) measure_local_d(i) *= -1; + }); + // Have to ensure the parallel for above finishes before comms + exec.fence(); + + // Call the existing measure cf markers function + pmisr_existing_measure_cf_markers_kokkos(strength_mat, max_luby_steps, pmis_int, measure_local_d, cf_markers_d, zero_measure_c_point_int); + + // If PMIS then we swap the CF markers from PMISR + if (pmis_int) { + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, local_rows), KOKKOS_LAMBDA(PetscInt i) { + cf_markers_d(i) *= -1; + }); + // Ensure we're done before we exit + exec.fence(); + } + + return; +} + +//------------------------------------------------------------------------------------------------------------------------ diff --git a/src/SAbs.F90 b/src/SAbs.F90 new file mode 100644 index 00000000..539a1e5c --- /dev/null +++ b/src/SAbs.F90 @@ -0,0 +1,147 @@ +module sabs + + use petscmat + use petsc_helper, only: MatAXPYWrapper, MatSetAllValues, remove_small_from_sparse + +#include "petsc/finclude/petscmat.h" + + implicit none + public + + contains + +!------------------------------------------------------------------------------------------------------------------------ + + subroutine generate_sabs(input_mat, strong_threshold, symmetrize, square, output_mat, & + allow_drop_diagonal, allow_diag_strength) + + ! 
Generate strength of connection matrix with absolute value + ! Output has no diagonal entries + + ! ~~~~~~~~~~ + ! Input + type(tMat), intent(in) :: input_mat + type(tMat), intent(inout) :: output_mat + PetscReal, intent(in) :: strong_threshold + logical, intent(in) :: symmetrize, square + logical, intent(in), optional :: allow_drop_diagonal, allow_diag_strength + + PetscInt :: ifree + PetscInt :: local_rows, local_cols, global_rows, global_cols + PetscInt :: global_row_start, global_row_end_plus_one + PetscInt :: global_col_start, global_col_end_plus_one, counter + integer :: errorcode, comm_size + PetscErrorCode :: ierr + PetscInt, parameter :: nz_ignore = -1, one=1, zero=0 + MPIU_Comm :: MPI_COMM_MATRIX + type(tMat) :: transpose_mat + type(tIS) :: zero_diags + PetscInt, dimension(:), pointer :: zero_diags_pointer + logical :: drop_diag, diag_strength + + ! ~~~~~~~~~~ + + drop_diag = .TRUE. + diag_strength = .FALSE. + if (present(allow_drop_diagonal)) drop_diag = allow_drop_diagonal + if (present(allow_diag_strength)) diag_strength = allow_diag_strength + + call PetscObjectGetComm(input_mat, MPI_COMM_MATRIX, ierr) + ! Get the comm size + call MPI_Comm_size(MPI_COMM_MATRIX, comm_size, errorcode) + + ! Get the local sizes + call MatGetLocalSize(input_mat, local_rows, local_cols, ierr) + call MatGetSize(input_mat, global_rows, global_cols, ierr) + ! This returns the global index of the local portion of the matrix + call MatGetOwnershipRange(input_mat, global_row_start, global_row_end_plus_one, ierr) + call MatGetOwnershipRangeColumn(input_mat, global_col_start, global_col_end_plus_one, ierr) + + ! Drop entries smaller than the strong_threshold, with a relative tolerance measured + ! against the biggest abs non-diagonal entry, don't lump and always drop the diagonal + if (.NOT. diag_strength) then + call remove_small_from_sparse(input_mat, strong_threshold, output_mat, & + relative_max_row_tol_int = -1, lump=.FALSE., drop_diagonal_int=-1) + else + ! 
Measure the strength of connection relative to the diagonal entry, + ! not the max row value excluding the diagonal + call remove_small_from_sparse(input_mat, strong_threshold, output_mat, & + relative_max_row_tol_int = -1, lump=.FALSE., drop_diagonal_int=-1, & + diag_strength_int = 1) + end if + + ! Now symmetrize if desired + if (symmetrize) then + + ! We could just do a symbolic transpose and add the two sets of indices together, + ! but it's so much simpler to just add the two together - and the symbolic will be the expensive part + ! anyway + call MatTranspose(output_mat, MAT_INITIAL_MATRIX, transpose_mat, ierr) + ! Kokkos + MPI doesn't have a gpu mataxpy yet, so we have a wrapper around our own version + call MatAXPYWrapper(output_mat, 1d0, transpose_mat) + + ! Don't forget to destroy the explicit transpose + call MatDestroy(transpose_mat, ierr) + + end if + + ! Square the strength matrix to aggressively coarsen (gives a distance 2 MIS) + if (square) then + + if (symmetrize) then + call MatMatMult(output_mat, output_mat, & + MAT_INITIAL_MATRIX, 1d0, transpose_mat, ierr) + else + call MatTransposeMatMult(output_mat, output_mat, & + MAT_INITIAL_MATRIX, 1d0, transpose_mat, ierr) + endif + + ! Also have to add in the original distance 1 connections to the square + ! as the dist 1 strength matrix has had the diagonals removed, so the square won't + ! have the dist 1 connections in it + call MatAXPYWrapper(transpose_mat, 1d0, output_mat) + call MatDestroy(output_mat, ierr) + + ! Can end up with diagonal entries we have to remove + ! Let's get the diagonals that are zero or unassigned + call MatFindZeroDiagonals(transpose_mat, zero_diags, ierr) + call ISGetIndices(zero_diags, zero_diags_pointer, ierr) + ! Then let's just set every other row to have a zero diagonal + ! as we know they're already preallocated + counter = 1 + do ifree = 1, local_rows + + if (counter .le. size(zero_diags_pointer)) then + ! 
Skip over any rows that don't have diagonals or are already zero + if (zero_diags_pointer(counter) - global_row_start + 1 == ifree) then + counter = counter + 1 + cycle + end if + end if + + ! Set the diagonal to 0 + call MatSetValue(transpose_mat, ifree - 1 + global_row_start, ifree - 1 + global_row_start, 0d0, INSERT_VALUES, ierr) + end do + + call ISRestoreIndices(zero_diags, zero_diags_pointer, ierr) + + call MatAssemblyBegin(transpose_mat, MAT_FINAL_ASSEMBLY, ierr) + call MatAssemblyEnd(transpose_mat, MAT_FINAL_ASSEMBLY, ierr) + + ! Could call MatEliminateZeros in later versions of petsc, but for here + ! given we know the entries are ==1, we will just create a copy with "small" stuff removed + ! ie the zero diagonal + call remove_small_from_sparse(transpose_mat, 1d-100, output_mat, drop_diagonal_int = 1) + call MatDestroy(transpose_mat, ierr) + + end if + + ! Reset the entries in the strength matrix back to 1 + if (symmetrize .OR. square) call MatSetAllValues(output_mat, 1d0) + + end subroutine generate_sabs + +! 
------------------------------------------------------------------------------------------------------------------------------- + +end module sabs + diff --git a/tests/Makefile b/tests/Makefile index 28ac6ec3..c08fa8b1 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -117,7 +117,7 @@ run_tests_load_serial: -pc_air_coarsest_matrix_free_polys -pc_air_coarsest_inverse_type newton -pc_air_max_luby_steps 3 -ksp_max_it 1 # @echo "" - @echo "Test PMISR DDC CF splitting in C" + @echo "Test PMISR DDC CF splitting and diagonal dominance extract in C" ./ex6_cf_splitting -f data/mat_stream_2364 # @echo "" @@ -203,7 +203,7 @@ run_tests_load_parallel: $(MPIEXEC) -n 4 ./ex12f -f data/mat_stream_2364 -pc_air_subcomm -pc_air_inverse_type arnoldi -pc_air_coarsest_subcomm \ -pc_air_coarsest_inverse_type arnoldi -ksp_max_it 5 -pc_air_a_drop 1e-3 # - @echo "Test PMISR DDC CF splitting in C in parallel" + @echo "Test PMISR DDC CF splitting and diagonal dominance extract in C in parallel" $(MPIEXEC) -n 2 ./ex6_cf_splitting -f data/mat_stream_2364 # @echo "" @@ -255,7 +255,7 @@ run_tests_no_load_short_serial: ./adv_diff_fd -da_grid_x 8 -da_grid_y 8 -pc_type air -ksp_max_it 3 -pc_air_smooth_type fc -pc_air_c_inverse_sparsity_order 0 @echo "" @echo "Test AIRG with advection and fixed dd ratio" - ./adv_diff_fd -da_grid_x 8 -da_grid_y 8 -pc_type air -ksp_max_it 3 -pc_air_max_dd_ratio 0.9 + ./adv_diff_fd -da_grid_x 10 -da_grid_y 10 -pc_type air -ksp_max_it 4 -pc_air_cf_splitting_type diag_dom -pc_air_strong_threshold 0.9 # @echo "" @echo "Test AIRG with SUPG CG FEM in 2D" @@ -602,8 +602,20 @@ run_tests_no_load_serial: @echo "Test AIRG on steady 2D structured advection with 0th order sparsity GMRES poly C smooth and max DD ratio" ./adv_diff_fd -da_grid_x 100 -da_grid_y 100 -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-4 \ - -pc_air_strong_threshold 0.99 -pc_air_max_dd_ratio 0.50001 -pc_air_ddc_fraction 0.01 -ksp_max_it 7 \ + 
-pc_air_cf_splitting_type diag_dom -pc_air_strong_threshold 0.50001 -ksp_max_it 7 \ -pc_air_inverse_type power +# + @echo "" + @echo "Test AIRG with upwind DG FEM in 2D with unstructured quads and a small fixed dd ratio" + ./adv_dg_upwind -dm_plex_filename data/square_unstruc.msh -pc_type air -ksp_type richardson \ + -ksp_norm_type unpreconditioned -pc_air_cf_splitting_type diag_dom -pc_air_strong_threshold 0.1 \ + -ksp_rtol 1e-10 -dm_refine 2 -pc_air_a_drop 1e-6 -pc_air_r_drop 1e-6 -ksp_max_it 6 +# + @echo "" + @echo "Test AIRG with upwind DG FEM in 2D with unstructured quads and a large fixed dd ratio" + ./adv_dg_upwind -dm_plex_filename data/square_unstruc.msh -pc_type air -ksp_type richardson \ + -ksp_norm_type unpreconditioned -pc_air_cf_splitting_type diag_dom -pc_air_strong_threshold 0.9 \ + -ksp_rtol 1e-10 -dm_refine 2 -pc_air_a_drop 1e-6 -pc_air_r_drop 1e-6 -ksp_max_it 11 # @echo "" @echo "Test improving Z" @@ -656,7 +668,7 @@ run_tests_no_load_short_parallel: $(MPIEXEC) -n 2 ./adv_diff_fd -da_grid_x 8 -da_grid_y 8 -pc_type air -ksp_max_it 3 -pc_air_smooth_type fc -pc_air_c_inverse_sparsity_order 0 @echo "" @echo "Test AIRG with advection and fixed dd ratio in parallel" - $(MPIEXEC) -n 2 ./adv_diff_fd -da_grid_x 8 -da_grid_y 8 -pc_type air -ksp_max_it 3 -pc_air_max_dd_ratio 0.9 + $(MPIEXEC) -n 2 ./adv_diff_fd -da_grid_x 10 -da_grid_y 10 -pc_type air -ksp_max_it 4 -pc_air_cf_splitting_type diag_dom -pc_air_strong_threshold 0.9 @echo "" @echo "Test AIRG with SUPG CG FEM in 2D in parallel" $(MPIEXEC) -n 2 ./adv_diff_cg_supg -dm_plex_simplex 0 -dm_refine 2 -pc_type air \ @@ -929,8 +941,20 @@ run_tests_no_load_parallel: @echo "Test AIRG on steady 2D structured advection with 0th order sparsity GMRES poly C smooth and max DD ratio in parallel" $(MPIEXEC) -n 2 ./adv_diff_fd -da_grid_x 100 -da_grid_y 100 -pc_type air -ksp_pc_side right \ -ksp_rtol 1e-10 -ksp_atol 1e-50 -pc_air_a_lump -pc_air_a_drop 1e-4 \ - -pc_air_strong_threshold 0.99 -pc_air_max_dd_ratio 
0.50001 -pc_air_ddc_fraction 0.01 -ksp_max_it 7 \ + -pc_air_cf_splitting_type diag_dom -pc_air_strong_threshold 0.50001 -ksp_max_it 7 \ -pc_air_inverse_type power +# + @echo "" + @echo "Test AIRG with upwind DG FEM in 2D with unstructured quads and a small fixed dd ratio in parallel" + $(MPIEXEC) -n 2 ./adv_dg_upwind -dm_plex_filename data/square_unstruc.msh -pc_type air -ksp_type richardson \ + -ksp_norm_type unpreconditioned -pc_air_cf_splitting_type diag_dom -pc_air_strong_threshold 0.1 \ + -ksp_rtol 1e-10 -dm_refine 2 -pc_air_a_drop 1e-6 -pc_air_r_drop 1e-6 -ksp_max_it 6 +# + @echo "" + @echo "Test AIRG with upwind DG FEM in 2D with unstructured quads and a large fixed dd ratio in parallel" + $(MPIEXEC) -n 2 ./adv_dg_upwind -dm_plex_filename data/square_unstruc.msh -pc_type air -ksp_type richardson \ + -ksp_norm_type unpreconditioned -pc_air_cf_splitting_type diag_dom -pc_air_strong_threshold 0.9 \ + -ksp_rtol 1e-10 -dm_refine 2 -pc_air_a_drop 1e-6 -pc_air_r_drop 1e-6 -ksp_max_it 11 # @echo "" @echo "Test solving isotropic diffusion with fast coarsening and near-nullspace in parallel" diff --git a/tests/ex6_cf_splitting.c b/tests/ex6_cf_splitting.c index be4754fe..8c057092 100644 --- a/tests/ex6_cf_splitting.c +++ b/tests/ex6_cf_splitting.c @@ -1,4 +1,4 @@ -static char help[] = "Reads a PETSc matrix and vector from a file and solves a linear system.\n\ +static char help[] = "Checks we can read in a linear system and compute a CF splitting.\n\ Input arguments are:\n\ -f : file to load. 
For example see $PETSC_DIR/share/petsc/datafiles/matrices\n\n"; @@ -6,6 +6,134 @@ Input arguments are:\n\ #include #include "pflare.h" +// Check that a given CF splitting is valid +static PetscErrorCode CheckSplitting(Mat A, IS is_fine, IS is_coarse, const char *label) +{ + PetscInt n_fine_local, n_coarse_local; + PetscInt n_fine_global, n_coarse_global; + PetscInt local_rows, local_cols; + PetscInt global_rows, global_cols; + PetscInt rstart, rend, i; + PetscInt nlocal; + const PetscInt *idx_fine, *idx_coarse; + PetscInt *seen; + PetscMPIInt rank; + + PetscFunctionBeginUser; + PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank)); + + PetscCall(MatGetLocalSize(A, &local_rows, &local_cols)); + PetscCall(MatGetSize(A, &global_rows, &global_cols)); + PetscCall(MatGetOwnershipRange(A, &rstart, &rend)); + + PetscCall(ISGetLocalSize(is_fine, &n_fine_local)); + PetscCall(ISGetLocalSize(is_coarse, &n_coarse_local)); + PetscCall(ISGetSize(is_fine, &n_fine_global)); + PetscCall(ISGetSize(is_coarse, &n_coarse_global)); + + PetscCheck(n_fine_local + n_coarse_local == local_rows, PETSC_COMM_SELF, PETSC_ERR_PLIB, + "%s: local size mismatch on rank %d: n_fine_local (%" PetscInt_FMT ") + n_coarse_local (%" PetscInt_FMT ") != local_rows (%" PetscInt_FMT ")", + label, rank, n_fine_local, n_coarse_local, local_rows); + PetscCheck(n_fine_global + n_coarse_global == global_rows, PETSC_COMM_SELF, PETSC_ERR_PLIB, + "%s: global size mismatch: n_fine_global (%" PetscInt_FMT ") + n_coarse_global (%" PetscInt_FMT ") != global_rows (%" PetscInt_FMT ")", + label, n_fine_global, n_coarse_global, global_rows); + + PetscCall(PetscCalloc1(local_rows, &seen)); + + PetscCall(ISGetLocalSize(is_fine, &nlocal)); + PetscCall(ISGetIndices(is_fine, &idx_fine)); + for (i = 0; i < nlocal; i++) { + PetscCheck(idx_fine[i] >= rstart && idx_fine[i] < rend, PETSC_COMM_SELF, PETSC_ERR_PLIB, + "%s: fine index %" PetscInt_FMT " is not in local row ownership range [%" PetscInt_FMT ", %" PetscInt_FMT ") on rank %d", 
+ label, idx_fine[i], rstart, rend, rank); + seen[idx_fine[i] - rstart]++; + } + PetscCall(ISRestoreIndices(is_fine, &idx_fine)); + + PetscCall(ISGetLocalSize(is_coarse, &nlocal)); + PetscCall(ISGetIndices(is_coarse, &idx_coarse)); + for (i = 0; i < nlocal; i++) { + PetscCheck(idx_coarse[i] >= rstart && idx_coarse[i] < rend, PETSC_COMM_SELF, PETSC_ERR_PLIB, + "%s: coarse index %" PetscInt_FMT " is not in local row ownership range [%" PetscInt_FMT ", %" PetscInt_FMT ") on rank %d", + label, idx_coarse[i], rstart, rend, rank); + seen[idx_coarse[i] - rstart]++; + } + PetscCall(ISRestoreIndices(is_coarse, &idx_coarse)); + + for (i = 0; i < local_rows; i++) { + PetscCheck(seen[i] == 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, + "%s: owned row %" PetscInt_FMT " appears %" PetscInt_FMT " times in local fine/coarse IS on rank %d", + label, rstart + i, seen[i], rank); + } + + PetscCall(PetscFree(seen)); + + if (!rank) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%s: OK\n", label)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +// Check that a returned diagonally dominant submatrix is diagonally dominant +// ratio = sum(abs(offdiag entries)) / abs(diagonal), with ratio=0 if no diagonal exists. 
+static PetscErrorCode CheckDiagDomSubmatrix(Mat A, Mat A_dd, PetscReal max_dd_ratio, const char *label) +{ + PetscInt a_local_rows, a_local_cols, a_global_rows, a_global_cols; + PetscInt dd_local_rows, dd_local_cols, dd_global_rows, dd_global_cols; + PetscInt rstart, rend, i, j, ncols; + const PetscInt *cols; + const PetscScalar *vals; + PetscReal diag_val, off_diag_sum, row_ratio; + PetscReal max_row_ratio_local = 0.0, max_row_ratio_global = 0.0; + const PetscReal dd_ratio_abs_tol = 1e-12; + const PetscReal dd_ratio_rel_tol = 1e-10; + PetscReal tol; + PetscMPIInt rank; + + PetscFunctionBeginUser; + PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank)); + + PetscCall(MatGetLocalSize(A, &a_local_rows, &a_local_cols)); + PetscCall(MatGetSize(A, &a_global_rows, &a_global_cols)); + PetscCall(MatGetLocalSize(A_dd, &dd_local_rows, &dd_local_cols)); + PetscCall(MatGetSize(A_dd, &dd_global_rows, &dd_global_cols)); + + PetscCall(MatGetOwnershipRange(A_dd, &rstart, &rend)); + for (i = rstart; i < rend; i++) { + diag_val = 0.0; + off_diag_sum = 0.0; + + PetscCall(MatGetRow(A_dd, i, &ncols, &cols, &vals)); + for (j = 0; j < ncols; j++) { + if (cols[j] == i) { + diag_val = PetscAbsScalar(vals[j]); + } else { + off_diag_sum += PetscAbsScalar(vals[j]); + } + } + PetscCall(MatRestoreRow(A_dd, i, &ncols, &cols, &vals)); + + // Ensure we don't divide by zero if no diagonal + row_ratio = (diag_val != 0.0) ? 
(off_diag_sum / diag_val) : 0.0; + max_row_ratio_local = PetscMax(max_row_ratio_local, row_ratio); + + } + + PetscCallMPI(MPI_Allreduce(&max_row_ratio_local, &max_row_ratio_global, 1, MPIU_REAL, MPIU_MAX, PETSC_COMM_WORLD)); + + tol = dd_ratio_abs_tol + dd_ratio_rel_tol * PetscMax(PetscAbsReal(max_dd_ratio), PetscAbsReal(max_row_ratio_global)); + PetscCheck(max_row_ratio_global <= max_dd_ratio + tol, PETSC_COMM_SELF, PETSC_ERR_PLIB, + "%s: max observed dd ratio %.16e > max_dd_ratio %.16e (tol %.3e)", + label, (double)max_row_ratio_global, (double)max_dd_ratio, (double)tol); + + if (!rank) { + PetscCall(PetscPrintf(PETSC_COMM_SELF, + "%s: OK (submatrix size %" PetscInt_FMT " x %" PetscInt_FMT ", max observed ratio %.16e <= %.16e)\n", + label, dd_global_rows, dd_global_cols, (double)max_row_ratio_global, (double)max_dd_ratio)); + } + + PetscFunctionReturn(PETSC_SUCCESS); +} + int main(int argc,char **args) { #if defined(PETSC_USE_LOG) @@ -17,6 +145,7 @@ int main(int argc,char **args) PetscViewer fd; PetscBool flg,b_in_f = PETSC_TRUE; IS is_fine, is_coarse; + Mat A_dd; VecType vtype; PetscCall(PetscInitialize(&argc,&args,(char*)0,help)); @@ -83,9 +212,6 @@ int main(int argc,char **args) int ddc_its = 1; // Fraction of F points to convert to C per ddc it PetscReal ddc_fraction = 0.1; - // If not 0, keep doing ddc its until this diagonal dominance - // ratio is hit - PetscReal max_dd_ratio = 0.0; // As many steps as needed int max_luby_steps = -1; // PMISR DDC @@ -93,29 +219,50 @@ int main(int argc,char **args) // Is the matrix symmetric? 
int symmetric = 0; + compute_cf_splitting(A, \ + symmetric, \ + strong_threshold, max_luby_steps, \ + algorithm, \ + ddc_its, \ + ddc_fraction, \ + &is_fine, &is_coarse); + + PetscCall(CheckSplitting(A, is_fine, is_coarse, "default PMISR_DDC")); + + PetscCall(ISDestroy(&is_fine)); + PetscCall(ISDestroy(&is_coarse)); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Compute a CF splitting with diag_dom target + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + strong_threshold = 0.5; + algorithm = CF_DIAG_DOM; + compute_cf_splitting(A, \ symmetric, \ strong_threshold, max_luby_steps, \ algorithm, \ ddc_its, \ ddc_fraction, \ - max_dd_ratio, \ &is_fine, &is_coarse); - PetscInt n_fine, n_coarse; - PetscInt local_rows, local_cols; - PetscCall(MatGetLocalSize(A, &local_rows, &local_cols)); - PetscCall(ISGetLocalSize(is_fine, &n_fine)); - PetscCall(ISGetLocalSize(is_coarse, &n_coarse)); + PetscCall(CheckSplitting(A, is_fine, is_coarse, "diag_dom strong_threshold=0.5")); - if (n_fine + n_coarse == local_rows) - { - PetscCall(PetscPrintf(PETSC_COMM_WORLD, "OK \n")); - } - else{ - PetscCall(PetscPrintf(PETSC_COMM_WORLD, "NOT OK \n")); - return 1; - } + PetscCall(ISDestroy(&is_fine)); + PetscCall(ISDestroy(&is_coarse)); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Compute a diagonally dominant submatrix + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + compute_diag_dom_submatrix(A, 0.5, &A_dd); + PetscCall(CheckDiagDomSubmatrix(A, A_dd, 0.5, "diag_dom_submatrix max_dd_ratio=0.5")); + PetscCall(MatDestroy(&A_dd)); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -124,8 +271,7 @@ int main(int argc,char **args) PetscCall(VecDestroy(&b)); PetscCall(VecDestroy(&u)); PetscCall(MatDestroy(&A)); - PetscCall(ISDestroy(&is_fine)); - PetscCall(ISDestroy(&is_coarse)); + PetscCall(PetscFinalize()); return 0; } \ No newline at end of file