diff --git a/CMakeLists.txt b/CMakeLists.txt index 536b4080..0b9d4276 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,10 +74,14 @@ option(VPIC_ENABLE_HDF5 "Enable HDF5 for use during IO. VPIC does not help you i option(VPIC_ENABLE_VARIABLE_CHARGE "Enable variable charge for particles" OFF) +option(VPIC_ENABLE_EXTERNAL_FORCE "Enable external force for particles" OFF) + option(VPIC_ENABLE_GPU_AWARE_MPI "Enable GPU-aware MPI. Allows GPUs to communicate with each other directly rather than going through the CPU. If VPIC_ENABLE_GPU_AWARE_MPI is enabled but not supported in MPI, the code may crash." OFF) option(VPIC_ENABLE_HALO_EXCHANGE "Enable halo exchange code path for remote communication. Temporary option until halo exchange code is accurately tested." ON) +option(VPIC_SHAPE_QS "Use quadratic-sum particle shape" OFF) + option(HYB_USE_SEPARATE_PE "Use separate electron energy equation instead of EoS." OFF) option(HYB_USE_STATIC_E "Use electrostatic hybrid field solver." OFF) @@ -371,6 +375,12 @@ if (VPIC_ENABLE_VARIABLE_CHARGE) message("-- VPIC: Enabled variable charge for particles") endif(VPIC_ENABLE_VARIABLE_CHARGE) +if (VPIC_ENABLE_EXTERNAL_FORCE) + add_definitions(-DEXTERNAL_FORCE) + set(VPIC_CPPFLAGS "${VPIC_CPPFLAGS} -DEXTERNAL_FORCE") + message("-- VPIC: Enabled external force for particles") +endif(VPIC_ENABLE_EXTERNAL_FORCE) + if (VPIC_ENABLE_GPU_AWARE_MPI) add_definitions(-DVPIC_ENABLE_GPU_AWARE_MPI) message("-- VPIC: Enabled GPU-Aware MPI") @@ -381,6 +391,16 @@ if (VPIC_ENABLE_HALO_EXCHANGE) message("-- VPIC: Enabled Halo exchange path") endif(VPIC_ENABLE_HALO_EXCHANGE) +if (VPIC_SHAPE_QS) + add_definitions(-DSHAPE_QS) + message("-- VPIC: Using quadratic-sum particle shape") + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DSHAPE_QS") +else() + add_definitions(-DSHAPE_NGP) + message("-- VPIC: Using nearest-grid-point particle shape") + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DSHAPE_NGP") +endif(VPIC_SHAPE_QS) + if (HYB_USE_SEPARATE_PE) add_definitions(-DHYB_USE_SEPARATE_PE) set(VPIC_CPPFLAGS "${VPIC_CPPFLAGS} -DHYB_USE_SEPARATE_PE") diff --git a/examples/iaw/NO-SMOOTH/iaw.cxx b/examples/iaw/NO-SMOOTH/iaw.cxx index 35cb20f0..2fd6745f 100644 --- a/examples/iaw/NO-SMOOTH/iaw.cxx +++ b/examples/iaw/NO-SMOOTH/iaw.cxx @@ -456,7 +456,7 @@ sim_log( "Loading fields" ); * Convenience functions for simlog output *------------------------------------------------------------------------*/ - char varlist[512]; + char varlist[1024]; // accommodate "allvars" for fields create_field_list(varlist, global->fdParams); sim_log ( "Fields variable list: " << varlist ); diff --git a/examples/iaw/NO-SMOOTH/translateIAW.f90 b/examples/iaw/NO-SMOOTH/translateIAW.f90 index 58782252..d855cbae 100755 --- a/examples/iaw/NO-SMOOTH/translateIAW.f90 +++ b/examples/iaw/NO-SMOOTH/translateIAW.f90 @@ -511,7 +511,12 @@ program translate bz(idxstart(n,1):idxstop(n,1), idxstart(n,2):idxstop(n,2), idxstart(n,3):idxstop(n,3)) = & buffer(2:nc(1)-1,2:nc(2)-1,2:nc(3)-1) - read(10)buffer ! skip div_b error + read(10)buffer ! skip pe + + read(10)buffer ! skip magnetic0 + read(10)buffer ! skip + read(10)buffer ! skip + read(10)buffer ! skip te0 read(10)buffer ! skip tca read(10)buffer ! skip diff --git a/examples/iaw/iaw.cxx b/examples/iaw/iaw.cxx index b8ad7260..24d44b17 100644 --- a/examples/iaw/iaw.cxx +++ b/examples/iaw/iaw.cxx @@ -458,7 +458,7 @@ sim_log( "Loading fields" ); * Convenience functions for simlog output *------------------------------------------------------------------------*/ - char varlist[512]; + char varlist[1024]; // accommodate "allvars" for fields create_field_list(varlist, global->fdParams); sim_log ( "Fields variable list: " << varlist ); diff --git a/examples/iaw/translateIAW.f90 b/examples/iaw/translateIAW.f90 index 8c731e8c..6fe6e8c7 100755 --- a/examples/iaw/translateIAW.f90 +++ b/examples/iaw/translateIAW.f90 @@ -511,7 +511,12 @@ program translate bz(idxstart(n,1):idxstop(n,1), idxstart(n,2):idxstop(n,2), idxstart(n,3):idxstop(n,3)) = & buffer(2:nc(1)-1,2:nc(2)-1,2:nc(3)-1) - read(10)buffer ! skip div_b error + read(10)buffer ! skip pe + + read(10)buffer ! skip magnetic0 + read(10)buffer ! skip + read(10)buffer ! skip + read(10)buffer ! skip te0 read(10)buffer ! skip tca read(10)buffer ! skip diff --git a/examples/islands/COLD-ION/islands_hyb.cxx b/examples/islands/COLD-ION/islands_hyb.cxx index f1b0783b..2c93948f 100644 --- a/examples/islands/COLD-ION/islands_hyb.cxx +++ b/examples/islands/COLD-ION/islands_hyb.cxx @@ -586,7 +586,7 @@ sim_log( "Loading fields" ); * Convenience functions for simlog output *------------------------------------------------------------------------*/ - char varlist[512]; + char varlist[1024]; // accommodate "allvars" for fields create_field_list(varlist, global->fdParams); sim_log ( "Fields variable list: " << varlist ); diff --git a/examples/islands/islands_hyb.cxx b/examples/islands/islands_hyb.cxx index 5aa4b075..bec52780 100644 --- a/examples/islands/islands_hyb.cxx +++ b/examples/islands/islands_hyb.cxx @@ -586,7 +586,7 @@ sim_log( "Loading fields" ); * Convenience functions for simlog output *------------------------------------------------------------------------*/ - char varlist[512]; + char varlist[1024]; // accommodate "allvars" for fields create_field_list(varlist, global->fdParams); sim_log ( "Fields variable list: " << varlist ); diff --git a/examples/pcai/pcai.cxx b/examples/pcai/pcai.cxx index 686b6b4c..e2aa984b 100644 --- a/examples/pcai/pcai.cxx +++ b/examples/pcai/pcai.cxx @@ -462,7 +462,7 @@ sim_log( "Loading fields" ); * Convenience functions for simlog output *------------------------------------------------------------------------*/ - char varlist[512]; + char varlist[1024]; // accommodate "allvars" for fields create_field_list(varlist, global->fdParams); sim_log ( "Fields variable list: " << varlist ); diff --git a/examples/pcai/translate_pcai.f90 b/examples/pcai/translate_pcai.f90 index 79f66e99..43524353 100644 --- a/examples/pcai/translate_pcai.f90 +++ b/examples/pcai/translate_pcai.f90 @@ -373,14 +373,14 @@ program translate fnames(5) = 'data/By' fnames(6) = 'data/Bz' -! fnames(7) = 'data/Uix' -! fnames(8) = 'data/Uiy' -! fnames(9) = 'data/Uiz' + fnames(7) = 'data/Uix' + fnames(8) = 'data/Uiy' + fnames(9) = 'data/Uiz' fnames(10) = 'data/ni' - fnames(11) = 'data/Uix' - fnames(12) = 'data/Uiy' - fnames(13) = 'data/Uiz' - fnames(14) = 'data/niold' + !fnames(11) = 'data/Uix' + !fnames(12) = 'data/Uiy' + !fnames(13) = 'data/Uiz' + !fnames(14) = 'data/niold' fnames(15) = 'data/aniso' !fnames(16) = 'data/Pi-xy1' @@ -513,11 +513,17 @@ program translate bz(idxstart(n,1):idxstop(n,1), idxstart(n,2):idxstop(n,2), idxstart(n,3):idxstop(n,3)) = & buffer(2:nc(1)-1,2:nc(2)-1,2:nc(3)-1) - read(10)buffer ! skip div_b error - !read(10)buffer ! skip tca - !read(10)buffer ! skip - !read(10)buffer ! skip - !read(10)buffer ! skip rhob + read(10)buffer ! skip pe + + read(10)buffer ! skip magnetic0 + read(10)buffer ! skip + read(10)buffer ! skip + read(10)buffer ! skip te0 + + read(10)buffer ! skip tca + read(10)buffer ! skip + read(10)buffer ! skip + read(10)buffer ! skip rhob read(10)buffer @@ -537,23 +543,6 @@ program translate buffer(2:nc(1)-1,2:nc(2)-1,2:nc(3)-1) - read(10)buffer - ux(idxstart(n,1):idxstop(n,1), idxstart(n,2):idxstop(n,2), idxstart(n,3):idxstop(n,3)) = & - buffer(2:nc(1)-1,2:nc(2)-1,2:nc(3)-1) - - read(10)buffer - uy(idxstart(n,1):idxstop(n,1), idxstart(n,2):idxstop(n,2), idxstart(n,3):idxstop(n,3)) = & - buffer(2:nc(1)-1,2:nc(2)-1,2:nc(3)-1) - - read(10)buffer - uz(idxstart(n,1):idxstop(n,1), idxstart(n,2):idxstop(n,2), idxstart(n,3):idxstop(n,3)) = & - buffer(2:nc(1)-1,2:nc(2)-1,2:nc(3)-1) - - read(10)buffer - ne(idxstart(n,1):idxstop(n,1), idxstart(n,2):idxstop(n,2), idxstart(n,3):idxstop(n,3)) = & - buffer(2:nc(1)-1,2:nc(2)-1,2:nc(3)-1) - - close(10) @@ -629,14 +618,14 @@ program translate call MPI_FILE_WRITE_AT_ALL(fh(4), offset, bx, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) call MPI_FILE_WRITE_AT_ALL(fh(5), offset, by, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) call MPI_FILE_WRITE_AT_ALL(fh(6), offset, bz, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) -! call MPI_FILE_WRITE_AT_ALL(fh(7), offset, jx, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) -! call MPI_FILE_WRITE_AT_ALL(fh(8), offset, jy, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) -! call MPI_FILE_WRITE_AT_ALL(fh(9), offset, jz, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) + call MPI_FILE_WRITE_AT_ALL(fh(7), offset, jx, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) + call MPI_FILE_WRITE_AT_ALL(fh(8), offset, jy, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) + call MPI_FILE_WRITE_AT_ALL(fh(9), offset, jz, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) call MPI_FILE_WRITE_AT_ALL(fh(10), offset, rho, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) - call MPI_FILE_WRITE_AT_ALL(fh(11), offset, ux, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) - call MPI_FILE_WRITE_AT_ALL(fh(12), offset, uy, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) - call MPI_FILE_WRITE_AT_ALL(fh(13), offset, uz, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) - call MPI_FILE_WRITE_AT_ALL(fh(14), offset, ne, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) +! call MPI_FILE_WRITE_AT_ALL(fh(11), offset, ux, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) +! call MPI_FILE_WRITE_AT_ALL(fh(12), offset, uy, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) +! call MPI_FILE_WRITE_AT_ALL(fh(13), offset, uz, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) +! call MPI_FILE_WRITE_AT_ALL(fh(14), offset, ne, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) call MPI_FILE_WRITE_AT_ALL(fh(15), offset, aniso, ht%nx*ht%ny*ht%nz, MPI_REAL4, status, ierror) diff --git a/examples/shock/shock-hyb.cxx b/examples/shock/shock-hyb.cxx index 8b47a20f..e3a38625 100755 --- a/examples/shock/shock-hyb.cxx +++ b/examples/shock/shock-hyb.cxx @@ -542,7 +542,7 @@ sim_log( "Loading fields" ); * Convenience functions for simlog output *------------------------------------------------------------------------*/ - char varlist[512]; + char varlist[1024]; // accommodate "allvars" for fields create_field_list(varlist, global->fdParams); sim_log ( "Fields variable list: " << varlist ); diff --git a/src/field_advance/field_advance.cc b/src/field_advance/field_advance.cc index ceef50a2..09f8bfa2 100644 --- a/src/field_advance/field_advance.cc +++ b/src/field_advance/field_advance.cc @@ -169,6 +169,16 @@ field_array_t::copy_to_host() { host_field[i].sy = k_field(i, field_var::sy); host_field[i].sz = k_field(i, field_var::sz); host_field[i].se = k_field(i, field_var::se); + +#ifdef EXTERNAL_FORCE + host_field[i].Ex0 = k_field(i, field_var::Ex0); + host_field[i].Ey0 = k_field(i, field_var::Ey0); + host_field[i].Ez0 = k_field(i, field_var::Ez0); + + host_field[i].Gx0 = k_field(i, field_var::Gx0); + host_field[i].Gy0 = k_field(i, field_var::Gy0); + host_field[i].Gz0 = k_field(i, field_var::Gz0); +#endif host_field[i].ematx = k_field_edge(i, field_edge_var::ematx); host_field[i].ematy = k_field_edge(i, field_edge_var::ematy); @@ -254,6 +264,16 @@ field_array_t::copy_to_device() { k_field(i, field_var::sy) = host_field[i].sy; k_field(i, field_var::sz) = host_field[i].sz; k_field(i, field_var::se) = host_field[i].se; + +#ifdef EXTERNAL_FORCE + k_field(i, field_var::Ex0) = host_field[i].Ex0; + k_field(i, field_var::Ey0) = host_field[i].Ey0; + k_field(i, field_var::Ez0) = host_field[i].Ez0; + + k_field(i, field_var::Gx0) = host_field[i].Gx0; + k_field(i, field_var::Gy0) = host_field[i].Gy0; + k_field(i, field_var::Gz0) = host_field[i].Gz0; +#endif k_field_edge(i, field_edge_var::ematx) = host_field[i].ematx; k_field_edge(i, field_edge_var::ematy) = host_field[i].ematy; diff --git a/src/field_advance/field_advance.h b/src/field_advance/field_advance.h index 9d9c3d67..8948df9e 100644 --- a/src/field_advance/field_advance.h +++ b/src/field_advance/field_advance.h @@ -165,6 +165,10 @@ typedef struct field { float pex, pey, pez,div_b_err ; // pressure etc float ux, uy, uz, ue; // Electron bulk flow velocity float sx, sy, sz, se; // Electron momentum and energy sources + #ifdef EXTERNAL_FORCE + float Ex0, Ey0, Ez0, _pad1; // External electric field (mover only, exclude from Ohm's law) + float Gx0, Gy0, Gz0, _pad2; // External gravity field (mover only, exclude from Ohm's law) + #endif material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers diff --git a/src/field_advance/standard/hyb_advance_b.cc b/src/field_advance/standard/hyb_advance_b.cc index c91e47e3..b43cb0ad 100644 --- a/src/field_advance/standard/hyb_advance_b.cc +++ b/src/field_advance/standard/hyb_advance_b.cc @@ -85,12 +85,25 @@ hyb_advance_b(field_array_t * RESTRICT fa, Kokkos::Profiling::pushRegion("HybyridAdvanceB::Smooth_Ion_Moments"); //Only smooth on the first subcycle if (isub==0) { +#ifndef SHAPE_NGP + //fix edge rho/currents for local BCs + //adds adjacent ghost rho/current to cell and sets BC ghosts to 0 + k_hyb_local_adjust_jf(fa, fa->g); + //fix edge rho/currents everywhere else + //sends ghosts across ranks and adds ghosts to adjacent cells + k_begin_remote_edge_hyb_jf(fa, fa->g, *(fa->fb) ); + k_end_remote_edge_hyb_jf (fa, fa->g, *(fa->fb) ); +#else + // NGP accumulates only on live cells; ghosts empty. + // So no need to fix edge rho/currents +#endif - //smooth moments + //refresh all ghosts Kokkos::Profiling::pushRegion("HybyridAdvanceB::Smooth_Ion_Moments::Exchange_JF"); k_begin_remote_ghost_hyb_jf(fa, fa->g, *(fa->fb) ); k_end_remote_ghost_hyb_jf (fa, fa->g, *(fa->fb) ); Kokkos::Profiling::popRegion(); + //fix ghosts for local BCs Kokkos::Profiling::pushRegion("HybyridAdvanceB::Smooth_Ion_Moments::Apply_Local_Ghost_JF"); k_hyb_local_ghost_jf (fa, fa->g); Kokkos::Profiling::popRegion(); diff --git a/src/field_advance/standard/local.cc b/src/field_advance/standard/local.cc index e61dce35..3a86a9d5 100644 --- a/src/field_advance/standard/local.cc +++ b/src/field_advance/standard/local.cc @@ -1755,6 +1755,9 @@ void k_local_adjust_rhob(field_array_t* fa, const grid_t* g) { adjust_rhob(fa, g, 0, 0, 1); } +/***************************************************************************** + * Hybrid local ghosts + *****************************************************************************/ //Hybrid local B field @@ -2107,3 +2110,65 @@ k_hyb_local_ghost_lapl_b( field_array_t * RESTRICT f, #undef APPLYLOCAL +/***************************************************************************** + * Hybrid local adjusts + *****************************************************************************/ + +//Hybrid adjust jf,rhof + +template void adjust_hyb_local_jf(int i, int j, int k, + const int nx, const int ny, const int nz, + field_array_t* RESTRICT f, const grid_t* g) {} + +#define ADJUSTLOCAL(x_,y_,z_) \ + int bc = g->bc[BOUNDARY(i,j,k)]; \ + k_field_t k_field = f->k_f_d; \ + Kokkos::MDRangePolicy > x_##_face({1,1},{n##y_+1,n##z_+1}); \ + if(bc < 0 || bc >= world_size) { \ + int x_ = (i+j+k)<0 ? 1 : n##x_; /* adjust edges */ \ + switch(bc) { \ + case anti_symmetric_fields: \ + case symmetric_fields: \ + Kokkos::parallel_for("adjust_hyb_local_jf: anti_symmetric_fields", \ + x_##_face, KOKKOS_LAMBDA(const int y_, const int z_) { \ + /* add ghost cell to adjacent local BC cell, then set ghost to 0 */ \ + k_field(VOXEL(x,y,z,nx,ny,nz), field_var::jfx) += k_field(VOXEL(x+i,y+j,z+k,nx,ny,nz), field_var::jfx); \ + k_field(VOXEL(x,y,z,nx,ny,nz), field_var::jfy) += k_field(VOXEL(x+i,y+j,z+k,nx,ny,nz), field_var::jfy); \ + k_field(VOXEL(x,y,z,nx,ny,nz), field_var::jfz) += k_field(VOXEL(x+i,y+j,z+k,nx,ny,nz), field_var::jfz); \ + k_field(VOXEL(x,y,z,nx,ny,nz), field_var::rhof) += k_field(VOXEL(x+i,y+j,z+k,nx,ny,nz), field_var::rhof); \ + k_field(VOXEL(x+i,y+j,z+k,nx,ny,nz), field_var::jfx) = 0; \ + k_field(VOXEL(x+i,y+j,z+k,nx,ny,nz), field_var::jfy) = 0; \ + k_field(VOXEL(x+i,y+j,z+k,nx,ny,nz), field_var::jfz) = 0; \ + k_field(VOXEL(x+i,y+j,z+k,nx,ny,nz), field_var::rhof) = 0; \ + }); \ + break; \ + default: \ + ERROR(("Bad boundary condition encountered.")); \ + break; \ + } \ + } + +template<> void adjust_hyb_local_jf(int i, int j, int k, + const int nx, const int ny, const int nz, + field_array_t* RESTRICT f, const grid_t* g) { ADJUSTLOCAL(x,y,z); } +template<> void adjust_hyb_local_jf(int i, int j, int k, + const int nx, const int ny, const int nz, + field_array_t* RESTRICT f, const grid_t* g) { ADJUSTLOCAL(y,z,x); } +template<> void adjust_hyb_local_jf(int i, int j, int k, + const int nx, const int ny, const int nz, + field_array_t* RESTRICT f, const grid_t* g) { ADJUSTLOCAL(z,x,y); } + +void +k_hyb_local_adjust_jf( field_array_t * RESTRICT f, + const grid_t * g ) { + const int nx = g->nx, ny = g->ny, nz = g->nz; + + adjust_hyb_local_jf(-1,0,0,nx,ny,nz,f,g); + adjust_hyb_local_jf(0,-1,0,nx,ny,nz,f,g); + adjust_hyb_local_jf(0,0,-1,nx,ny,nz,f,g); + adjust_hyb_local_jf(1,0,0, nx,ny,nz,f,g); + adjust_hyb_local_jf(0,1,0, nx,ny,nz,f,g); + adjust_hyb_local_jf(0,0,1, nx,ny,nz,f,g); +} + +#undef ADJUSTLOCAL diff --git a/src/field_advance/standard/remote.cc b/src/field_advance/standard/remote.cc index 394c3820..8e47af4c 100644 --- a/src/field_advance/standard/remote.cc +++ b/src/field_advance/standard/remote.cc @@ -3878,6 +3878,283 @@ k_end_remote_ghost_hyb_o(field_array_t* ALIGNED(128) fa, #endif } +/***************************************************************************** + * Edge value communications + * + * Like ghost communication routines, but opposite direction: + * Send ghost cell data, recv increment into live cells. + *****************************************************************************/ + +//Hybrid edge JF +#define BRP(x_,y_,z_) \ + const int n##y_ = fa->g->n##y_, n##z_=fa->g->n##z_; \ + const int size = (4*n##y_*n##z_)*sizeof(float); \ + BEGIN_RECV_PORT_K(i,j,k,size,fa->g, rbuf_d, rbuf_h); + +/** + * @brief Begin non blocking receive for adding remote jf ghost cells into + * local jf edge cells, needed for quadratic-sum or higher-order particle + * shape. + * + * Calculates size of receive buffer based on which face and orientation. + * BRP macro adjusts calculations for different face directions. Template + * function wraps the BRP macro to make profiling clearer. + * + * @tparam Face Enum denoting the face and order of dimensions for calculations + * @param fa Pointer for field array structure containing field and grid data + * @param i X-dim face coordinate (-1.0: neg. x, 0: origin, 1.0: pos. x) + * @param j Y-dim face coordinate (-1.0: neg. y, 0: origin, 1.0: pos. y) + * @param k Z-dim face coordinate (-1.0: neg. z, 0: origin, 1.0: pos. z) + * @param rbuf_d Receive buffer on the device + * @param rbuf_h Mirror of rbuf_d on the Host + */ +template +void +begin_recv_edge_hyb_jf(field_array* fa, const int i, const int j, const int k) { + int src = fa->g->bc[BOUNDARY(-i,-j,-k)]; /**< Source rank */ + // Only recv cells if src is a valid neighbor and not itself + if( 0 <= src && src < world_size ) { + Kokkos::DualView rbuf = fa->fb->recv_buffer[BOUNDARY(i,j,k)]; + auto rbuf_d = rbuf.view(); + auto rbuf_h = rbuf.view(); + if constexpr (std::is_same::value) { + BRP(x,y,z); + } else if constexpr (std::is_same::value) { + BRP(y,z,x); + } else if constexpr (std::is_same::value) { + BRP(z,x,y); + } + } +} + +#undef BRP + +#define BSP(x_,y_,z_) \ + const int nx = fa->g->nx, ny = fa->g->ny, nz = fa->g->nz; \ + const int size = (4*n##y_*n##z_)*sizeof(float); \ + const int face = (i+j+k)<0 ? 0 : n##x_+1; /* send ghosts to edges */ \ + const k_field_t& k_field = fa->k_f_d; \ + Kokkos::MDRangePolicy> x_##_face({1, 1}, {n##z_+1, n##y_+1}); \ + Kokkos::parallel_for("begin_send_edge_hyb_jf", x_##_face, KOKKOS_LAMBDA(const int z_, const int y_) { \ + const int x_ = face; \ + sbuf_d( (z_-1)*n##y_ + (y_-1)) = k_field(VOXEL(x,y,z,nx,ny,nz), field_var::jfx); \ + sbuf_d( n##y_*n##z_ + (z_-1)*n##y_ + (y_-1)) = k_field(VOXEL(x,y,z,nx,ny,nz), field_var::jfy); \ + sbuf_d(2*n##y_*n##z_ + (z_-1)*n##y_ + (y_-1)) = k_field(VOXEL(x,y,z,nx,ny,nz), field_var::jfz); \ + sbuf_d(3*n##y_*n##z_ + (z_-1)*n##y_ + (y_-1)) = k_field(VOXEL(x,y,z,nx,ny,nz), field_var::rhof);\ + }); \ + SYNC_MPI_BUFFER(sbuf_h, sbuf_d); \ + BEGIN_SEND_PORT_K(i,j,k,size,fa->g, sbuf_d, sbuf_h); + +/** + * @brief Begin non blocking send for adding local jf ghost cells into remote + * jf edge cells, needed for quadratic-sum or higher-order particle shape. + * + * Serializes jf data in contiguous buffer and sends data to a neighbors + * ghost cells. BSP macro adjusts calculations for different face directions. + * Template function wraps the BRP macro to make profiling clearer. + * + * @tparam Face Enum denoting the face and order of dimensions for calculations + * @param fa Pointer for field array structure containing field and grid data + * @param i X-dim face coordinate (-1.0: neg. x, 0: origin, 1.0: pos. x) + * @param j Y-dim face coordinate (-1.0: neg. y, 0: origin, 1.0: pos. y) + * @param k Z-dim face coordinate (-1.0: neg. z, 0: origin, 1.0: pos. z) + * @param sbuf_d Send buffer on the device + * @param sbuf_h Mirror of sbuf_d on the Host + */ +template +void +begin_send_edge_hyb_jf(field_array* fa, const int i, const int j, const int k) { + int dst = fa->g->bc[BOUNDARY(i,j,k)]; /**< Destination rank */ + // Only send cells if dst is a valid neighbor and not itself + if( 0 <= dst && dst < world_size ) { + Kokkos::DualView sbuf = fa->fb->send_buffer[BOUNDARY(i,j,k)]; + auto sbuf_d = sbuf.view(); + auto sbuf_h = sbuf.view(); + if constexpr (std::is_same::value) { + BSP(x,y,z); + } else if constexpr (std::is_same::value) { + BSP(y,z,x); + } else if constexpr (std::is_same::value) { + BSP(z,x,y); + } + } +} + +#undef BSP + +/** + * @brief Begin exchanging jf ghost cells between all neighbors to increment + * edge cells for quadratic-sum or higher-order particle particle shape. + * + * Prepares receive buffers, packs face cells into contiguous buffers + * and starts non blocking communication with neighbors. Only performs + * communication when necessary. Will ignore cases where the process is on a + * boundary or if the process topology would make the exchange redundant + * (ex. 1D and 2D grids). + * + * @param fa Pointer for field array structure containing field and grid data + * @param g Pointer to grid structure + * @param fb Reference to field buffers used for MPI communication + */ +void k_begin_remote_edge_hyb_jf(field_array_t* ALIGNED(128) fa, + const grid_t* g, + field_buffers_t& fb) { + // TODO halo exchange not yet implemented for ghost->edge summation. + // Need modified versions of begin/end_halo_exchange(...) + // to pack local ghosts, unpack by adding to remote edges + // instead of pack local edges, unpack by replacing remote ghosts. + // --ATr,2025aug30 +//#ifdef VPIC_ENABLE_HALO_EXCHANGE +// begin_halo_add_exchange(fa, field_var::jfx, field_var::rhof+1); +//#else + // Start receiving + begin_recv_edge_hyb_jf(fa, -1, 0, 0); + begin_recv_edge_hyb_jf(fa, 0, -1, 0); + begin_recv_edge_hyb_jf(fa, 0, 0, -1); + begin_recv_edge_hyb_jf(fa, 1, 0, 0); + begin_recv_edge_hyb_jf(fa, 0, 1, 0); + begin_recv_edge_hyb_jf(fa, 0, 0, 1); + + // Start sending + begin_send_edge_hyb_jf(fa, -1, 0, 0); + begin_send_edge_hyb_jf(fa, 0, -1, 0); + begin_send_edge_hyb_jf(fa, 0, 0, -1); + begin_send_edge_hyb_jf(fa, 1, 0, 0); + begin_send_edge_hyb_jf(fa, 0, 1, 0); + begin_send_edge_hyb_jf(fa, 0, 0, 1); +//#endif +} + +#define ERP(x_,y_,z_) \ + const grid_t* g = fa->g; \ + float* p = reinterpret_cast(end_recv_port_k(i,j,k,g)); \ + if(p) { \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + const int face = (i+j+k) < 0 ? n##x_ : 1; /*add ghosts to edges*/ \ + const k_field_t& k_field = fa->k_f_d; \ + SYNC_MPI_BUFFER(rbuf_d, rbuf_h); \ + Kokkos::MDRangePolicy> x_##_face({1, 1}, {n##z_+1, n##y_+1}); \ + Kokkos::parallel_for("end_recv_edge_hyb_jf", x_##_face, KOKKOS_LAMBDA(const int z_, const int y_) { \ + const int x_ = face; \ + k_field(VOXEL(x,y,z,nx,ny,nz), field_var::jfx) += rbuf_d( (z_-1)*n##y_ + (y_-1)); \ + k_field(VOXEL(x,y,z,nx,ny,nz), field_var::jfy) += rbuf_d( n##y_*n##z_ + (z_-1)*n##y_ + (y_-1)); \ + k_field(VOXEL(x,y,z,nx,ny,nz), field_var::jfz) += rbuf_d(2*n##y_*n##z_ + (z_-1)*n##y_ + (y_-1)); \ + k_field(VOXEL(x,y,z,nx,ny,nz), field_var::rhof) += rbuf_d(3*n##y_*n##z_ + (z_-1)*n##y_ + (y_-1)); \ + }); \ + } + +/** + * @brief End non blocking receive for adding remote jf ghost cells into + * local jf edge cells, needed for quadratic-sum or higher-order particle + * shape. + * + * Wait for non blocking communication to complete and unpack the buffer into + * the local ranks edge cells. Wait and unpacking the buffer may be made to + * overlap between different faces in the future. + * + * @tparam Face Enum denoting the face and order of dimensions for calculations + * @param fa Pointer for field array structure containing field and grid data + * @param i X-dim face coordinate (-1.0: neg. x, 0: origin, 1.0: pos. x) + * @param j Y-dim face coordinate (-1.0: neg. y, 0: origin, 1.0: pos. y) + * @param k Z-dim face coordinate (-1.0: neg. z, 0: origin, 1.0: pos. z) + * @param rbuf_d Receive buffer on the device + * @param rbuf_h Mirror of rbuf_d on the Host + */ +template +void +end_recv_edge_hyb_jf(field_array_t* fa, const int i, const int j, const int k) { + int src = fa->g->bc[BOUNDARY(-i,-j,-k)]; /**< Source rank */ + // Only recv cells if src is a valid neighbor and not itself + if( 0 <= src && src < world_size ) { + Kokkos::DualView rbuf = fa->fb->recv_buffer[BOUNDARY(i,j,k)]; + auto rbuf_d = rbuf.view(); + auto rbuf_h = rbuf.view(); + if constexpr (std::is_same::value) { + ERP(x,y,z); + } else if constexpr (std::is_same::value) { + ERP(y,z,x); + } else if constexpr (std::is_same::value) { + ERP(z,x,y); + } + } +} + +#undef ERP + +/** + * @brief End non blocking send for adding local jf ghost cells into remote jf + * edge cells, needed for quadratic-sum or higher-order particle shape. + * + * Ensures the prior send operation is complete and unsets the send buffer. + * + * @tparam Face Enum denoting the face and order of dimensions for calculations + * @param fa Pointer for field array structure containing field and grid data + * @param i X-dim face coordinate (-1.0: neg. x, 0: origin, 1.0: pos. x) + * @param j Y-dim face coordinate (-1.0: neg. y, 0: origin, 1.0: pos. y) + * @param k Z-dim face coordinate (-1.0: neg. z, 0: origin, 1.0: pos. z) + */ +template +void +end_send_edge_hyb_jf(field_array_t* fa, const int i, const int j, const int k) { + int dst = fa->g->bc[BOUNDARY(i,j,k)]; /**< Destination rank */ + // Only send cells if dst is a valid neighbor and not itself + if( 0 <= dst && dst < world_size ) { + if constexpr (std::is_same::value) { + end_send_port_k(i,j,k,fa->g); + } else if constexpr (std::is_same::value) { + end_send_port_k(i,j,k,fa->g); + } else if constexpr (std::is_same::value) { + end_send_port_k(i,j,k,fa->g); + } + } +} + +/** + * @brief End exchanging jf ghost cells between all neighbors to increment + * edge cells for quadratic-sum or higher-order particle particle shape. + * + * Wait until MPI communication is complete then unpack the buffers and + * fill in the ghost cells with the communicated smoothing variables. Only + * performs communication when necessary. Will ignore cases where the process + * is on a boundary or if the process topology would make the exchange redundant + * (ex. 1D and 2D grids). + * + * @param fa Pointer for field array structure containing field and grid data + * @param g Pointer to grid structure + * @param fb Reference to field buffers used for MPI communication + */ +void +k_end_remote_edge_hyb_jf(field_array_t* ALIGNED(128) fa, + const grid_t* g, + field_buffers_t& fb) { + // TODO halo exchange not yet implemented for ghost->edge summation. + // Need modified versions of begin/end_halo_exchange(...) + // to pack local ghosts, unpack by adding to remote edges + // instead of pack local edges, unpack by replacing remote ghosts. + // --ATr,2025aug30 +//#ifdef VPIC_ENABLE_HALO_EXCHANGE +// end_halo_add_exchange(fa, field_var::jfx, field_var::rhof+1); +//#else + // End receiving + end_recv_edge_hyb_jf(fa, -1, 0, 0); + end_recv_edge_hyb_jf(fa, 0, -1, 0); + end_recv_edge_hyb_jf(fa, 0, 0, -1); + end_recv_edge_hyb_jf(fa, 1, 0, 0); + end_recv_edge_hyb_jf(fa, 0, 1, 0); + end_recv_edge_hyb_jf(fa, 0, 0, 1); + + // End sending + end_send_edge_hyb_jf(fa, -1, 0, 0); + end_send_edge_hyb_jf(fa, 0, -1, 0); + end_send_edge_hyb_jf(fa, 0, 0, -1); + end_send_edge_hyb_jf(fa, 1, 0, 0); + end_send_edge_hyb_jf(fa, 0, 1, 0); + end_send_edge_hyb_jf(fa, 0, 0, 1); + + Kokkos::fence(); +//#endif +} + /***************************************************************************** * Synchronization functions * diff --git a/src/field_advance/standard/sfa_private.h b/src/field_advance/standard/sfa_private.h index afd859af..fdb2768b 100644 --- a/src/field_advance/standard/sfa_private.h +++ b/src/field_advance/standard/sfa_private.h @@ -425,42 +425,44 @@ local_ghost_tang_b( field_t * ALIGNED(128) f, const grid_t * g ); void -k_local_ghost_tang_b(field_array_t * RESTRICT f, - const grid_t * g); +k_local_ghost_tang_b( field_array_t * RESTRICT f, + const grid_t * g ); void local_ghost_norm_e( field_t * ALIGNED(128) f, const grid_t * g ); + void -k_local_ghost_norm_e( field_array_t * ALIGNED(128) f, - const grid_t * g ); +k_local_ghost_norm_e( field_array_t * ALIGNED(128) f, + const grid_t * g ); void local_ghost_div_b( field_t * ALIGNED(128) f, const grid_t * g ); void -k_local_ghost_div_b( field_array_t * ALIGNED(128) f, - const grid_t * g ); +k_local_ghost_div_b( field_array_t * ALIGNED(128) f, + const grid_t * g ); void local_adjust_tang_e( field_t * ALIGNED(128) f, const grid_t * g ); void -k_local_adjust_tang_e(field_array_t* RESTRICT f, - const grid_t* g); +k_local_adjust_tang_e( field_array_t * RESTRICT f, + const grid_t * g ); void local_adjust_div_e( field_t * ALIGNED(128) f, const grid_t * g ); + void -k_local_adjust_div_e( field_array_t * ALIGNED(128) f, - const grid_t * g ); +k_local_adjust_div_e( field_array_t * ALIGNED(128) f, + const grid_t * g ); void k_local_adjust_norm_b( field_array_t * RESTRICT fa, - const grid_t * g ); + const grid_t * g ); void local_adjust_norm_b( field_t * ALIGNED(128) f, @@ -482,36 +484,40 @@ local_adjust_rhob( field_t * ALIGNED(128) f, const grid_t * g ); void -k_local_adjust_rhof(field_array_t* ALIGNED(128) f, - const grid_t* g); +k_local_adjust_rhof( field_array_t * ALIGNED(128) f, + const grid_t * g ); void -k_local_adjust_rhob(field_array_t* ALIGNED(128) f, - const grid_t* g); +k_local_adjust_rhob(field_array_t * ALIGNED(128) f, + const grid_t * g ); void -k_local_adjust_jf( field_array_t * ALIGNED(128) f, - const grid_t * g ); +k_local_adjust_jf( field_array_t * ALIGNED(128) f, + const grid_t * g ); + +void +k_hyb_local_ghost_e( field_array_t * RESTRICT f, + const grid_t * g ); + +void +k_hyb_local_ghost_b( field_array_t * RESTRICT f, + const grid_t * g ); + +void +k_hyb_local_ghost_jf( field_array_t * RESTRICT f, + const grid_t * g ); void -k_hyb_local_ghost_e( field_array_t * RESTRICT f, - const grid_t * g ); +k_hyb_local_ghost_ot( field_array_t * RESTRICT f, + const grid_t * g ); void -k_hyb_local_ghost_b( field_array_t * RESTRICT f, - const grid_t * g ); - - void -k_hyb_local_ghost_jf( field_array_t * RESTRICT f, - const grid_t * g ); - - void -k_hyb_local_ghost_ot( field_array_t * RESTRICT f, - const grid_t * g ); - - void -k_hyb_local_ghost_lapl_b( field_array_t * RESTRICT f, - const grid_t * g ); +k_hyb_local_ghost_lapl_b( field_array_t * RESTRICT f, + const grid_t * g ); + +void +k_hyb_local_adjust_jf( field_array_t * RESTRICT f, + const grid_t * g ); // In remote.c @@ -649,5 +655,14 @@ k_end_remote_ghost_hyb_o( field_array_t * ALIGNED(128) f, const grid_t * g, field_buffers_t& f_buffers ); +void +k_begin_remote_edge_hyb_jf( field_array_t * ALIGNED(128) f, + const grid_t * g, + field_buffers_t& f_buffers ); +void +k_end_remote_edge_hyb_jf( field_array_t * ALIGNED(128) f, + const grid_t * g, + field_buffers_t& f_buffers ); + #endif // _sfa_private_h_ diff --git a/src/sf_interface/interpolator_array.cc b/src/sf_interface/interpolator_array.cc index 64c1e8c0..6dbf6d9a 100644 --- a/src/sf_interface/interpolator_array.cc +++ b/src/sf_interface/interpolator_array.cc @@ -45,171 +45,486 @@ delete_interpolator_array( interpolator_array_t * ia ) { void load_interpolator_array_kokkos(k_interpolator_t k_interp, k_field_t k_field, int nx, int ny, int nz) { #define pi_ex k_interp(pi_index, interpolator_var::ex) + #define pi_dexdx k_interp(pi_index, interpolator_var::dexdx) #define pi_dexdy k_interp(pi_index, interpolator_var::dexdy) #define pi_dexdz k_interp(pi_index, interpolator_var::dexdz) - #define pi_d2exdydz k_interp(pi_index, interpolator_var::d2exdydz) - + #define pi_d2exdx k_interp(pi_index, interpolator_var::d2exdx) + #define pi_d2exdy k_interp(pi_index, interpolator_var::d2exdy) + #define pi_d2exdz k_interp(pi_index, interpolator_var::d2exdz) #define pi_ey k_interp(pi_index, interpolator_var::ey) - #define pi_deydz k_interp(pi_index, interpolator_var::deydz) #define pi_deydx k_interp(pi_index, interpolator_var::deydx) - #define pi_d2eydzdx k_interp(pi_index, interpolator_var::d2eydzdx) - + #define pi_deydy k_interp(pi_index, interpolator_var::deydy) + #define pi_deydz k_interp(pi_index, interpolator_var::deydz) + #define pi_d2eydx k_interp(pi_index, interpolator_var::d2eydx) + #define pi_d2eydy k_interp(pi_index, interpolator_var::d2eydy) + #define pi_d2eydz k_interp(pi_index, interpolator_var::d2eydz) #define pi_ez k_interp(pi_index, interpolator_var::ez) #define pi_dezdx k_interp(pi_index, interpolator_var::dezdx) #define pi_dezdy k_interp(pi_index, interpolator_var::dezdy) - #define pi_d2ezdxdy k_interp(pi_index, interpolator_var::d2ezdxdy) - + #define pi_dezdz k_interp(pi_index, interpolator_var::dezdz) + #define pi_d2ezdx k_interp(pi_index, interpolator_var::d2ezdx) + #define pi_d2ezdy k_interp(pi_index, interpolator_var::d2ezdy) + #define pi_d2ezdz k_interp(pi_index, interpolator_var::d2ezdz) #define pi_cbx k_interp(pi_index, interpolator_var::cbx) #define pi_dcbxdx k_interp(pi_index, interpolator_var::dcbxdx) - + #define pi_dcbxdy k_interp(pi_index, interpolator_var::dcbxdy) + #define pi_dcbxdz k_interp(pi_index, interpolator_var::dcbxdz) + #define pi_d2cbxdx k_interp(pi_index, interpolator_var::d2cbxdx) + #define pi_d2cbxdy k_interp(pi_index, interpolator_var::d2cbxdy) + #define pi_d2cbxdz k_interp(pi_index, interpolator_var::d2cbxdz) #define pi_cby k_interp(pi_index, interpolator_var::cby) + #define pi_dcbydx k_interp(pi_index, interpolator_var::dcbydx) #define pi_dcbydy k_interp(pi_index, interpolator_var::dcbydy) - + #define pi_dcbydz k_interp(pi_index, interpolator_var::dcbydz) + #define pi_d2cbydx k_interp(pi_index, interpolator_var::d2cbydx) + #define pi_d2cbydy k_interp(pi_index, interpolator_var::d2cbydy) + #define pi_d2cbydz k_interp(pi_index, interpolator_var::d2cbydz) #define pi_cbz k_interp(pi_index, interpolator_var::cbz) + #define pi_dcbzdx k_interp(pi_index, interpolator_var::dcbzdx) + #define pi_dcbzdy k_interp(pi_index, interpolator_var::dcbzdy) #define pi_dcbzdz k_interp(pi_index, interpolator_var::dcbzdz) - - const float fourth = 0.25; - const float half = 0.5; + #define pi_d2cbzdx k_interp(pi_index, interpolator_var::d2cbzdx) + #define pi_d2cbzdy k_interp(pi_index, interpolator_var::d2cbzdy) + #define pi_d2cbzdz k_interp(pi_index, interpolator_var::d2cbzdz) + + #define pi_Ex0 k_interp(pi_index, interpolator_var::Ex0) + #define pi_dEx0dx k_interp(pi_index, interpolator_var::dEx0dx) + #define pi_dEx0dy k_interp(pi_index, interpolator_var::dEx0dy) + #define pi_dEx0dz k_interp(pi_index, interpolator_var::dEx0dz) + #define pi_d2Ex0dx k_interp(pi_index, interpolator_var::d2Ex0dx) + #define pi_d2Ex0dy k_interp(pi_index, interpolator_var::d2Ex0dy) + #define pi_d2Ex0dz k_interp(pi_index, interpolator_var::d2Ex0dz) + #define pi_Ey0 k_interp(pi_index, interpolator_var::Ey0) + #define pi_dEy0dx k_interp(pi_index, interpolator_var::dEy0dx) + #define pi_dEy0dy k_interp(pi_index, interpolator_var::dEy0dy) + #define pi_dEy0dz k_interp(pi_index, interpolator_var::dEy0dz) + #define pi_d2Ey0dx k_interp(pi_index, interpolator_var::d2Ey0dx) + #define pi_d2Ey0dy k_interp(pi_index, interpolator_var::d2Ey0dy) + #define pi_d2Ey0dz k_interp(pi_index, interpolator_var::d2Ey0dz) + #define pi_Ez0 k_interp(pi_index, interpolator_var::Ez0) + #define pi_dEz0dx k_interp(pi_index, interpolator_var::dEz0dx) + #define pi_dEz0dy k_interp(pi_index, interpolator_var::dEz0dy) + #define pi_dEz0dz k_interp(pi_index, interpolator_var::dEz0dz) + #define pi_d2Ez0dx k_interp(pi_index, interpolator_var::d2Ez0dx) + #define pi_d2Ez0dy k_interp(pi_index, interpolator_var::d2Ez0dy) + #define pi_d2Ez0dz k_interp(pi_index, interpolator_var::d2Ez0dz) + + #define pi_Gx0 k_interp(pi_index, interpolator_var::Gx0) + #define pi_dGx0dx k_interp(pi_index, interpolator_var::dGx0dx) + #define pi_dGx0dy k_interp(pi_index, interpolator_var::dGx0dy) + #define pi_dGx0dz k_interp(pi_index, interpolator_var::dGx0dz) + #define pi_d2Gx0dx k_interp(pi_index, interpolator_var::d2Gx0dx) + #define pi_d2Gx0dy k_interp(pi_index, interpolator_var::d2Gx0dy) + #define pi_d2Gx0dz k_interp(pi_index, interpolator_var::d2Gx0dz) + #define pi_Gy0 k_interp(pi_index, interpolator_var::Gy0) + #define pi_dGy0dx k_interp(pi_index, interpolator_var::dGy0dx) + #define pi_dGy0dy k_interp(pi_index, interpolator_var::dGy0dy) + #define pi_dGy0dz k_interp(pi_index, interpolator_var::dGy0dz) + #define pi_d2Gy0dx k_interp(pi_index, interpolator_var::d2Gy0dx) + #define pi_d2Gy0dy k_interp(pi_index, interpolator_var::d2Gy0dy) + #define pi_d2Gy0dz k_interp(pi_index, interpolator_var::d2Gy0dz) + #define pi_Gz0 k_interp(pi_index, interpolator_var::Gz0) + #define pi_dGz0dx k_interp(pi_index, interpolator_var::dGz0dx) + #define pi_dGz0dy k_interp(pi_index, interpolator_var::dGz0dy) + #define pi_dGz0dz k_interp(pi_index, interpolator_var::dGz0dz) + #define pi_d2Gz0dx k_interp(pi_index, interpolator_var::d2Gz0dx) + #define pi_d2Gz0dy k_interp(pi_index, interpolator_var::d2Gz0dy) + #define pi_d2Gz0dz k_interp(pi_index, interpolator_var::d2Gz0dz) + + const float twelfth = 1./12.; + const float sixth = 1./6.; + const float half = 0.5; + const float two = 2.0; + const float six = 6.; Kokkos::MDRangePolicy> load_policy({1, 1, 1}, {nz+1, ny+1, nx+1}); Kokkos::parallel_for("load interpolator", load_policy, KOKKOS_LAMBDA(const int z, const int y, const int x) { - //pi = &fi(1,y,z); - int pi_index = VOXEL(1, y, z, nx,ny,nz) + x-1; - - //pf0 = &f(1,y,z); - int pf0_index = VOXEL(1, y, z, nx,ny,nz) + x-1; - - //pfx = &f(2,y,z); - int pfx_index = VOXEL(2, y, z, nx,ny,nz) + x-1; - - //pfy = &f(1,y+1,z); - int pfy_index = VOXEL(1, y+1, z, nx,ny,nz) + x-1; - - //pfz = &f(1,y,z+1); - int pfz_index = VOXEL(1, y, z+1, nx,ny,nz) + x-1; - - //pfyz = &f(1,y+1,z+1); - int pfyz_index = VOXEL(1, y+1, z+1, nx,ny,nz) + x-1; - - //pfzx = &f(2,y,z+1); - int pfzx_index = VOXEL(2, y, z+1, nx,ny,nz) + x-1; - - //pfxy = &f(2,y+1,z); - int pfxy_index = VOXEL(2, y+1, z, nx,ny,nz) + x-1; + int pi_index = VOXEL(1, y, z, nx,ny,nz) + x-1; //pi = &fi(1,y,z); + int pf0_index = VOXEL(1, y, z, nx,ny,nz) + x-1; //pf0 = &f(1,y,z); + int pfx_index = VOXEL(2, y, z, nx,ny,nz) + x-1; //pfx = &f(2,y,z); + int pfy_index = VOXEL(1, y+1, z, nx,ny,nz) + x-1; //pfy = &f(1,y+1,z); + int pfz_index = VOXEL(1, y, z+1, nx,ny,nz) + x-1; //pfz = &f(1,y,z+1); + int pfmx_index = VOXEL(x-1, y, z, nx,ny,nz); + int pfmy_index = VOXEL(x, y-1, z, nx,ny,nz); + int pfmz_index = VOXEL(x, y, z-1, nx,ny,nz); // ex interpolation coefficients // ex->tx from hyb_smooth_eb(...) - //w0 = pf0->ex; - #define w0 k_field(pf0_index, field_var::tx) - //w1 = pfy->ex; - #define w1 k_field(pfy_index, field_var::tx) - //w2 = pfz->ex; - #define w2 k_field(pfz_index, field_var::tx) - //w3 = pfyz->ex; - #define w3 k_field(pfyz_index, field_var::tx) - - pi_ex = w0;//fourth*( (w3 + w0) + (w1 + w2) ); - pi_dexdy = fourth*( (w3 - w0) + (w1 - w2) ); - pi_dexdz = fourth*( (w3 - w0) - (w1 - w2) ); - pi_d2exdydz = fourth*( (w3 + w0) - (w1 + w2) ); - - #undef w0 - #undef w1 - #undef w2 - #undef w3 + auto w0 = k_field(pf0_index, field_var::tx); + auto wx = k_field(pfx_index, field_var::tx); + auto wy = k_field(pfy_index, field_var::tx); + auto wz = k_field(pfz_index, field_var::tx); + auto wmx = k_field(pfmx_index, field_var::tx); + auto wmy = k_field(pfmy_index, field_var::tx); + auto wmz = k_field(pfmz_index, field_var::tx); + +#ifdef SHAPE_NGP + pi_ex = w0; +#else +#ifdef SHAPE_QS + pi_ex = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dexdx = sixth*(wx - wmx); + pi_dexdy = sixth*(wy - wmy); + pi_dexdz = sixth*(wz - wmz); + pi_d2exdx = twelfth*(wx + wmx - two*w0); + pi_d2exdy = twelfth*(wy + wmy - two*w0); + pi_d2exdz = twelfth*(wz + wmz - two*w0); +#endif +#endif // ey interpolation coefficients // ey->ty from hyb_smooth_eb(...) - - //w0 = pf0->ey; - #define w0 k_field(pf0_index, field_var::ty) - //w1 = pfz->ey; - #define w1 k_field(pfz_index, field_var::ty) - //w2 = pfx->ey; - #define w2 k_field(pfx_index, field_var::ty) - //w3 = pfzx->ey; - #define w3 k_field(pfzx_index, field_var::ty) - - pi_ey = w0;//fourth*( (w3 + w0) + (w1 + w2) ); - pi_deydz = fourth*( (w3 - w0) + (w1 - w2) ); - pi_deydx = fourth*( (w3 - w0) - (w1 - w2) ); - pi_d2eydzdx = fourth*( (w3 + w0) - (w1 + w2) ); - - #undef w0 - #undef w1 - #undef w2 - #undef w3 + w0 = k_field(pf0_index, field_var::ty); + wx = k_field(pfx_index, field_var::ty); + wy = k_field(pfy_index, field_var::ty); + wz = k_field(pfz_index, field_var::ty); + wmx = k_field(pfmx_index, field_var::ty); + wmy = k_field(pfmy_index, field_var::ty); + wmz = k_field(pfmz_index, field_var::ty); + +#ifdef SHAPE_NGP + pi_ey = w0; +#else +#ifdef SHAPE_QS + pi_ey = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_deydx = sixth*(wx - wmx); + pi_deydy = sixth*(wy - wmy); + pi_deydz = sixth*(wz - wmz); + pi_d2eydx = twelfth*(wx + wmx - two*w0); + pi_d2eydy = twelfth*(wy + wmy - two*w0); + pi_d2eydz = twelfth*(wz + wmz - two*w0); +#endif +#endif // ez interpolation coefficients // ez->tz from hyb_smooth_eb(...) - - // w0 = pf0->ez; - #define w0 k_field(pf0_index, field_var::tz) - // w1 = pfx->ez; - #define w1 k_field(pfx_index, field_var::tz) - // w2 = pfy->ez; - #define w2 k_field(pfy_index, field_var::tz) - // w3 = pfxy->ez; - #define w3 k_field(pfxy_index, field_var::tz) - pi_ez = w0;//fourth*( (w3 + w0) + (w1 + w2) ); - pi_dezdx = fourth*( (w3 - w0) + (w1 - w2) ); - pi_dezdy = fourth*( (w3 - w0) - (w1 - w2) ); - pi_d2ezdxdy = fourth*( (w3 + w0) - (w1 + w2) ); - - #undef w0 - #undef w1 - #undef w2 - #undef w3 + w0 = k_field(pf0_index, field_var::tz); + wx = k_field(pfx_index, field_var::tz); + wy = k_field(pfy_index, field_var::tz); + wz = k_field(pfz_index, field_var::tz); + wmx = k_field(pfmx_index, field_var::tz); + wmy = k_field(pfmy_index, field_var::tz); + wmz = k_field(pfmz_index, field_var::tz); + +#ifdef SHAPE_NGP + pi_ez = w0; +#else +#ifdef SHAPE_QS + pi_ez = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dezdx = sixth*(wx - wmx); + pi_dezdy = sixth*(wy - wmy); + pi_dezdz = sixth*(wz - wmz); + pi_d2ezdx = twelfth*(wx + wmx - two*w0); + pi_d2ezdy = twelfth*(wy + wmy - two*w0); + pi_d2ezdz = twelfth*(wz + wmz - two*w0); +#endif +#endif // bx interpolation coefficients // cbx->ox from hyb_smooth_eb(...) - - //w0 = pf0->cbx; - #define w0 k_field(pf0_index, field_var::ox) - #define w0b k_field(pf0_index, field_var::cbx0) - //w1 = pfx->cbx; - #define w1 k_field(pfx_index, field_var::ox) - // #define w1b k_field(pfx_index, field_var::cbx0) - pi_cbx = w0 + w0b;//half*( w1 + w0 ); - pi_dcbxdx = half*( w1 - w0 ); // To-do: Fix for QS - - #undef w0 - #undef w0b - #undef w1 + w0 = k_field(pf0_index, field_var::ox) + k_field(pf0_index, field_var::cbx0); + wx = k_field(pfx_index, field_var::ox) + k_field(pfx_index, field_var::cbx0); + wy = k_field(pfy_index, field_var::ox) + k_field(pfy_index, field_var::cbx0); + wz = k_field(pfz_index, field_var::ox) + k_field(pfz_index, field_var::cbx0); + wmx = k_field(pfmx_index, field_var::ox) + k_field(pfmx_index, field_var::cbx0); + wmy = k_field(pfmy_index, field_var::ox) + k_field(pfmy_index, field_var::cbx0); + wmz = k_field(pfmz_index, field_var::ox) + k_field(pfmz_index, field_var::cbx0); + +#ifdef SHAPE_NGP + pi_cbx = w0; +#else +#ifdef SHAPE_QS + pi_cbx = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dcbxdx = sixth*(wx - wmx); + pi_dcbxdy = sixth*(wy - wmy); + pi_dcbxdz = sixth*(wz - wmz); + pi_d2cbxdx = twelfth*(wx + wmx - two*w0); + pi_d2cbxdy = twelfth*(wy + wmy - two*w0); + pi_d2cbxdz = twelfth*(wz + wmz - two*w0); +#endif +#endif // by interpolation coefficients // cby->oy from hyb_smooth_eb(...) - - // w0 = pf0->cby; - #define w0 k_field(pf0_index, field_var::oy) - #define w0b k_field(pf0_index, field_var::cby0) - // w1 = pfy->cby; - #define w1 k_field(pfy_index, field_var::oy) - - pi_cby = w0 + w0b;//half*( w1 + w0 ); - pi_dcbydy = half*( w1 - w0 ); // To-do: Fix for QS - - #undef w0 - #undef w0b - #undef w1 + w0 = k_field(pf0_index, field_var::oy) + k_field(pf0_index, field_var::cby0); + wx = k_field(pfx_index, field_var::oy) + k_field(pfx_index, field_var::cby0); + wy = k_field(pfy_index, field_var::oy) + k_field(pfy_index, field_var::cby0); + wz = k_field(pfz_index, field_var::oy) + k_field(pfz_index, field_var::cby0); + wmx = k_field(pfmx_index, field_var::oy) + k_field(pfmx_index, field_var::cby0); + wmy = k_field(pfmy_index, field_var::oy) + k_field(pfmy_index, field_var::cby0); + wmz = k_field(pfmz_index, field_var::oy) + k_field(pfmz_index, field_var::cby0); + +#ifdef SHAPE_NGP + pi_cby = w0; +#else +#ifdef SHAPE_QS + pi_cby = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dcbydx = sixth*(wx - wmx); + pi_dcbydy = sixth*(wy - wmy); + pi_dcbydz = sixth*(wz - wmz); + pi_d2cbydx = twelfth*(wx + wmx - two*w0); + pi_d2cbydy = twelfth*(wy + wmy - two*w0); + pi_d2cbydz = twelfth*(wz + wmz - two*w0); +#endif +#endif // bz interpolation coefficients // cbz->oz from hyb_smooth_eb(...) - - // w0 = pf0->cbz; - #define w0 k_field(pf0_index, field_var::oz) - #define w0b k_field(pf0_index, field_var::cbz0) - // w1 = pfz->cbz; - #define w1 k_field(pfz_index, field_var::oz) - pi_cbz = w0 + w0b;//half*( w1 + w0 ); - pi_dcbzdz = half*( w1 - w0 ); // To-do: Fix for QS - - #undef w0 - #undef w0b - #undef w1 + w0 = k_field(pf0_index, field_var::oz) + k_field(pf0_index, field_var::cbz0); + wx = k_field(pfx_index, field_var::oz) + k_field(pfx_index, field_var::cbz0); + wy = k_field(pfy_index, field_var::oz) + k_field(pfy_index, field_var::cbz0); + wz = k_field(pfz_index, field_var::oz) + k_field(pfz_index, field_var::cbz0); + wmx = k_field(pfmx_index, field_var::oz) + k_field(pfmx_index, field_var::cbz0); + wmy = k_field(pfmy_index, field_var::oz) + k_field(pfmy_index, field_var::cbz0); + wmz = k_field(pfmz_index, field_var::oz) + k_field(pfmz_index, field_var::cbz0); + +#ifdef SHAPE_NGP + pi_cbz = w0; +#else +#ifdef SHAPE_QS + pi_cbz = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dcbzdx = sixth*(wx - wmx); + pi_dcbzdy = sixth*(wy - wmy); + pi_dcbzdz = sixth*(wz - wmz); + pi_d2cbzdx = twelfth*(wx + wmx - two*w0); + pi_d2cbzdy = twelfth*(wy + wmy - two*w0); + pi_d2cbzdz = twelfth*(wz + wmz - two*w0); +#endif +#endif + +#ifdef EXTERNAL_FORCE + + // Ex0 interpolation coefficients + w0 = k_field(pf0_index, field_var::Ex0); + wx = k_field(pfx_index, field_var::Ex0); + wy = k_field(pfy_index, field_var::Ex0); + wz = k_field(pfz_index, field_var::Ex0); + wmx = k_field(pfmx_index, field_var::Ex0); + wmy = k_field(pfmy_index, field_var::Ex0); + wmz = k_field(pfmz_index, field_var::Ex0); + +#ifdef SHAPE_NGP + pi_Ex0 = w0; +#else +#ifdef SHAPE_QS + pi_Ex0 = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dEx0dx = sixth*(wx - wmx); + pi_dEx0dy = sixth*(wy - wmy); + pi_dEx0dz = sixth*(wz - wmz); + pi_d2Ex0dx = twelfth*(wx + wmx - two*w0); + pi_d2Ex0dy = twelfth*(wy + wmy - two*w0); + pi_d2Ex0dz = twelfth*(wz + wmz - two*w0); +#endif +#endif + + // Ey0 interpolation coefficients + w0 = k_field(pf0_index, field_var::Ey0); + wx = k_field(pfx_index, field_var::Ey0); + wy = k_field(pfy_index, field_var::Ey0); + wz = k_field(pfz_index, field_var::Ey0); + wmx = k_field(pfmx_index, field_var::Ey0); + wmy = k_field(pfmy_index, field_var::Ey0); + wmz = k_field(pfmz_index, field_var::Ey0); + +#ifdef SHAPE_NGP + pi_Ey0 = w0; +#else +#ifdef SHAPE_QS + pi_Ey0 = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dEy0dx = sixth*(wx - wmx); + pi_dEy0dy = sixth*(wy - wmy); + pi_dEy0dz = sixth*(wz - wmz); + pi_d2Ey0dx = twelfth*(wx + wmx - two*w0); + pi_d2Ey0dy = twelfth*(wy + wmy - two*w0); + pi_d2Ey0dz = twelfth*(wz + wmz - two*w0); +#endif +#endif + + // Ez0 interpolation coefficients + w0 = k_field(pf0_index, field_var::Ez0); + wx = k_field(pfx_index, field_var::Ez0); + wy = k_field(pfy_index, field_var::Ez0); + wz = k_field(pfz_index, field_var::Ez0); + wmx = k_field(pfmx_index, field_var::Ez0); + wmy = k_field(pfmy_index, field_var::Ez0); + wmz = k_field(pfmz_index, field_var::Ez0); + +#ifdef SHAPE_NGP + pi_Ez0 = w0; +#else +#ifdef SHAPE_QS + pi_Ez0 = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dEz0dx = sixth*(wx - wmx); + pi_dEz0dy = sixth*(wy - wmy); + pi_dEz0dz = sixth*(wz - wmz); + pi_d2Ez0dx = twelfth*(wx + wmx - two*w0); + pi_d2Ez0dy = twelfth*(wy + wmy - two*w0); + pi_d2Ez0dz = twelfth*(wz + wmz - two*w0); +#endif +#endif + + // Gx0 interpolation coefficients + w0 = k_field(pf0_index, field_var::Gx0); + wx = k_field(pfx_index, field_var::Gx0); + wy = k_field(pfy_index, field_var::Gx0); + wz = k_field(pfz_index, field_var::Gx0); + wmx = k_field(pfmx_index, field_var::Gx0); + wmy = k_field(pfmy_index, field_var::Gx0); + wmz = k_field(pfmz_index, field_var::Gx0); + +#ifdef SHAPE_NGP + pi_Gx0 = w0; +#else +#ifdef SHAPE_QS + pi_Gx0 = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dGx0dx = sixth*(wx - wmx); + pi_dGx0dy = sixth*(wy - wmy); + pi_dGx0dz = sixth*(wz - wmz); + pi_d2Gx0dx = twelfth*(wx + wmx - two*w0); + pi_d2Gx0dy = twelfth*(wy + wmy - two*w0); + pi_d2Gx0dz = twelfth*(wz + wmz - two*w0); +#endif +#endif + + // Gy0 interpolation coefficients + w0 = k_field(pf0_index, field_var::Gy0); + wx = k_field(pfx_index, field_var::Gy0); + wy = k_field(pfy_index, field_var::Gy0); + wz = k_field(pfz_index, field_var::Gy0); + wmx = k_field(pfmx_index, field_var::Gy0); + wmy = k_field(pfmy_index, field_var::Gy0); + wmz = k_field(pfmz_index, field_var::Gy0); + +#ifdef SHAPE_NGP + pi_Gy0 = w0; +#else +#ifdef SHAPE_QS + pi_Gy0 = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dGy0dx = sixth*(wx - wmx); + pi_dGy0dy = sixth*(wy - wmy); + pi_dGy0dz = sixth*(wz - wmz); + pi_d2Gy0dx = twelfth*(wx + wmx - two*w0); + pi_d2Gy0dy = twelfth*(wy + wmy - two*w0); + pi_d2Gy0dz = twelfth*(wz + wmz - two*w0); +#endif +#endif + + // Gz0 interpolation coefficients + w0 = k_field(pf0_index, field_var::Gz0); + wx = k_field(pfx_index, field_var::Gz0); + wy = k_field(pfy_index, field_var::Gz0); + wz = k_field(pfz_index, field_var::Gz0); + wmx = k_field(pfmx_index, field_var::Gz0); + wmy = k_field(pfmy_index, field_var::Gz0); + wmz = k_field(pfmz_index, field_var::Gz0); + +#ifdef SHAPE_NGP + pi_Gz0 = w0; +#else +#ifdef SHAPE_QS + pi_Gz0 = twelfth*(six*w0 + wx + wy + wz + wmx + wmy + wmz); + pi_dGz0dx = sixth*(wx - wmx); + pi_dGz0dy = sixth*(wy - wmy); + pi_dGz0dz = sixth*(wz - wmz); + pi_d2Gz0dx = twelfth*(wx + wmx - two*w0); + pi_d2Gz0dy = twelfth*(wy + wmy - two*w0); + pi_d2Gz0dz = twelfth*(wz + wmz - two*w0); +#endif +#endif + +// close EXTERNAL_FORCE macro block +#endif //pi++; pf0++; pfx++; pfy++; pfz++; pfyz++; pfzx++; pfxy++; - }); + + }); // end Kokkos::parallel_for("load interpolator") + + #undef pi_ex + #undef pi_dexdx + #undef pi_dexdy + #undef pi_dexdz + #undef pi_d2exdx + #undef pi_d2exdy + #undef pi_d2exdz + #undef pi_ey + #undef pi_deydx + #undef pi_deydy + #undef pi_deydz + #undef pi_d2eydx + #undef pi_d2eydy + #undef pi_d2eydz + #undef pi_ez + #undef pi_dezdx + #undef pi_dezdy + #undef pi_dezdz + #undef pi_d2ezdx + #undef pi_d2ezdy + #undef pi_d2ezdz + #undef pi_cbx + #undef pi_dcbxdx + #undef pi_dcbxdy + #undef pi_dcbxdz + #undef pi_d2cbxdx + #undef pi_d2cbxdy + #undef pi_d2cbxdz + #undef pi_cby + #undef pi_dcbydx + #undef pi_dcbydy + #undef pi_dcbydz + #undef pi_d2cbydx + #undef pi_d2cbydy + #undef pi_d2cbydz + #undef pi_cbz + #undef pi_dcbzdx + #undef pi_dcbzdy + #undef pi_dcbzdz + #undef pi_d2cbzdx + #undef pi_d2cbzdy + #undef pi_d2cbzdz + + #undef pi_Ex0 + #undef pi_dEx0dx + #undef pi_dEx0dy + #undef pi_dEx0dz + #undef pi_d2Ex0dx + #undef pi_d2Ex0dy + #undef pi_d2Ex0dz + #undef pi_Ey0 + #undef pi_dEy0dx + #undef pi_dEy0dy + #undef pi_dEy0dz + #undef pi_d2Ey0dx + #undef pi_d2Ey0dy + #undef pi_d2Ey0dz + #undef pi_Ez0 + #undef pi_dEz0dx + #undef pi_dEz0dy + #undef pi_dEz0dz + #undef pi_d2Ez0dx + #undef pi_d2Ez0dy + #undef pi_d2Ez0dz + + #undef pi_Gx0 + #undef pi_dGx0dx + #undef pi_dGx0dy + #undef pi_dGx0dz + #undef pi_d2Gx0dx + #undef pi_d2Gx0dy + #undef pi_d2Gx0dz + #undef pi_Gy0 + #undef pi_dGy0dx + #undef pi_dGy0dy + #undef pi_dGy0dz + #undef pi_d2Gy0dx + #undef pi_d2Gy0dy + #undef pi_d2Gy0dz + #undef pi_Gz0 + #undef pi_dGz0dx + #undef pi_dGz0dy + #undef pi_dGz0dz + #undef pi_d2Gz0dx + #undef pi_d2Gz0dy + #undef pi_d2Gz0dz /* Kokkos::parallel_for("load interpolator", KOKKOS_TEAM_POLICY_DEVICE @@ -384,24 +699,113 @@ interpolator_array_t::copy_to_host() { Kokkos::parallel_for("Copy interpolators to host", host_execution_policy(0, g->nv) , KOKKOS_LAMBDA (int i) { +#ifdef SHAPE_NGP host_interp[i].ex = k_interpolator_h(i, interpolator_var::ex); host_interp[i].ey = k_interpolator_h(i, interpolator_var::ey); host_interp[i].ez = k_interpolator_h(i, interpolator_var::ez); - host_interp[i].dexdy = k_interpolator_h(i, interpolator_var::dexdy); - host_interp[i].dexdz = k_interpolator_h(i, interpolator_var::dexdz); - host_interp[i].d2exdydz = k_interpolator_h(i, interpolator_var::d2exdydz); - host_interp[i].deydz = k_interpolator_h(i, interpolator_var::deydz); - host_interp[i].deydx = k_interpolator_h(i, interpolator_var::deydx); - host_interp[i].d2eydzdx = k_interpolator_h(i, interpolator_var::d2eydzdx); - host_interp[i].dezdx = k_interpolator_h(i, interpolator_var::dezdx); - host_interp[i].dezdy = k_interpolator_h(i, interpolator_var::dezdy); - host_interp[i].d2ezdxdy = k_interpolator_h(i, interpolator_var::d2ezdxdy); host_interp[i].cbx = k_interpolator_h(i, interpolator_var::cbx); host_interp[i].cby = k_interpolator_h(i, interpolator_var::cby); host_interp[i].cbz = k_interpolator_h(i, interpolator_var::cbz); - host_interp[i].dcbxdx = k_interpolator_h(i, interpolator_var::dcbxdx); - host_interp[i].dcbydy = k_interpolator_h(i, interpolator_var::dcbydy); - host_interp[i].dcbzdz = k_interpolator_h(i, interpolator_var::dcbzdz); + #ifdef EXTERNAL_FORCE + host_interp[i].Ex0 = k_interpolator_h(i, interpolator_var::Ex0); + host_interp[i].Ey0 = k_interpolator_h(i, interpolator_var::Ey0); + host_interp[i].Ez0 = k_interpolator_h(i, interpolator_var::Ez0); + host_interp[i].Gx0 = k_interpolator_h(i, interpolator_var::Gx0); + host_interp[i].Gy0 = k_interpolator_h(i, interpolator_var::Gy0); + host_interp[i].Gz0 = k_interpolator_h(i, interpolator_var::Gz0); + #endif +#else +#ifdef SHAPE_QS + host_interp[i].ex = k_interpolator_h(i, interpolator_var::ex ); + host_interp[i].dexdx = k_interpolator_h(i, interpolator_var::dexdx ); + host_interp[i].dexdy = k_interpolator_h(i, interpolator_var::dexdy ); + host_interp[i].dexdz = k_interpolator_h(i, interpolator_var::dexdz ); + host_interp[i].d2exdx = k_interpolator_h(i, interpolator_var::d2exdx ); + host_interp[i].d2exdy = k_interpolator_h(i, interpolator_var::d2exdy ); + host_interp[i].d2exdz = k_interpolator_h(i, interpolator_var::d2exdz ); + host_interp[i].ey = k_interpolator_h(i, interpolator_var::ey ); + host_interp[i].deydx = k_interpolator_h(i, interpolator_var::deydx ); + host_interp[i].deydy = k_interpolator_h(i, interpolator_var::deydy ); + host_interp[i].deydz = k_interpolator_h(i, interpolator_var::deydz ); + host_interp[i].d2eydx = k_interpolator_h(i, interpolator_var::d2eydx ); + host_interp[i].d2eydy = k_interpolator_h(i, interpolator_var::d2eydy ); + host_interp[i].d2eydz = k_interpolator_h(i, interpolator_var::d2eydz ); + host_interp[i].ez = k_interpolator_h(i, interpolator_var::ez ); + host_interp[i].dezdx = k_interpolator_h(i, interpolator_var::dezdx ); + host_interp[i].dezdy = k_interpolator_h(i, interpolator_var::dezdy ); + host_interp[i].dezdz = k_interpolator_h(i, interpolator_var::dezdz ); + host_interp[i].d2ezdx = k_interpolator_h(i, interpolator_var::d2ezdx ); + host_interp[i].d2ezdy = k_interpolator_h(i, interpolator_var::d2ezdy ); + host_interp[i].d2ezdz = k_interpolator_h(i, interpolator_var::d2ezdz ); + host_interp[i].cbx = k_interpolator_h(i, interpolator_var::cbx ); + host_interp[i].dcbxdx = k_interpolator_h(i, interpolator_var::dcbxdx ); + host_interp[i].dcbxdy = k_interpolator_h(i, interpolator_var::dcbxdy ); + host_interp[i].dcbxdz = k_interpolator_h(i, interpolator_var::dcbxdz ); + host_interp[i].d2cbxdx = k_interpolator_h(i, interpolator_var::d2cbxdx); + host_interp[i].d2cbxdy = k_interpolator_h(i, interpolator_var::d2cbxdy); + host_interp[i].d2cbxdz = k_interpolator_h(i, interpolator_var::d2cbxdz); + host_interp[i].cby = k_interpolator_h(i, interpolator_var::cby ); + host_interp[i].dcbydx = k_interpolator_h(i, interpolator_var::dcbydx ); + host_interp[i].dcbydy = k_interpolator_h(i, interpolator_var::dcbydy ); + host_interp[i].dcbydz = k_interpolator_h(i, interpolator_var::dcbydz ); + host_interp[i].d2cbydx = k_interpolator_h(i, interpolator_var::d2cbydx); + host_interp[i].d2cbydy = k_interpolator_h(i, interpolator_var::d2cbydy); + host_interp[i].d2cbydz = k_interpolator_h(i, interpolator_var::d2cbydz); + host_interp[i].cbz = k_interpolator_h(i, interpolator_var::cbz ); + host_interp[i].dcbzdx = k_interpolator_h(i, interpolator_var::dcbzdx ); + host_interp[i].dcbzdy = k_interpolator_h(i, interpolator_var::dcbzdy ); + host_interp[i].dcbzdz = k_interpolator_h(i, interpolator_var::dcbzdz ); + host_interp[i].d2cbzdx = k_interpolator_h(i, interpolator_var::d2cbzdx); + host_interp[i].d2cbzdy = k_interpolator_h(i, interpolator_var::d2cbzdy); + host_interp[i].d2cbzdz = k_interpolator_h(i, interpolator_var::d2cbzdz); + + #ifdef EXTERNAL_FORCE + host_interp[i].Ex0 = k_interpolator_h(i, interpolator_var::Ex0 ); + host_interp[i].dEx0dx = k_interpolator_h(i, interpolator_var::dEx0dx ); + host_interp[i].dEx0dy = k_interpolator_h(i, interpolator_var::dEx0dy ); + host_interp[i].dEx0dz = k_interpolator_h(i, interpolator_var::dEx0dz ); + host_interp[i].d2Ex0dx = k_interpolator_h(i, interpolator_var::d2Ex0dx ); + host_interp[i].d2Ex0dy = k_interpolator_h(i, interpolator_var::d2Ex0dy ); + host_interp[i].d2Ex0dz = k_interpolator_h(i, interpolator_var::d2Ex0dz ); + host_interp[i].Ey0 = k_interpolator_h(i, interpolator_var::Ey0 ); + host_interp[i].dEy0dx = k_interpolator_h(i, interpolator_var::dEy0dx ); + host_interp[i].dEy0dy = k_interpolator_h(i, interpolator_var::dEy0dy ); + host_interp[i].dEy0dz = k_interpolator_h(i, interpolator_var::dEy0dz ); + host_interp[i].d2Ey0dx = k_interpolator_h(i, interpolator_var::d2Ey0dx ); + host_interp[i].d2Ey0dy = k_interpolator_h(i, interpolator_var::d2Ey0dy ); + host_interp[i].d2Ey0dz = k_interpolator_h(i, interpolator_var::d2Ey0dz ); + host_interp[i].Ez0 = k_interpolator_h(i, interpolator_var::Ez0 ); + host_interp[i].dEz0dx = k_interpolator_h(i, interpolator_var::dEz0dx ); + host_interp[i].dEz0dy = k_interpolator_h(i, interpolator_var::dEz0dy ); + host_interp[i].dEz0dz = k_interpolator_h(i, interpolator_var::dEz0dz ); + host_interp[i].d2Ez0dx = k_interpolator_h(i, interpolator_var::d2Ez0dx ); + host_interp[i].d2Ez0dy = k_interpolator_h(i, interpolator_var::d2Ez0dy ); + host_interp[i].d2Ez0dz = k_interpolator_h(i, interpolator_var::d2Ez0dz ); + + host_interp[i].Gx0 = k_interpolator_h(i, interpolator_var::Gx0 ); + host_interp[i].dGx0dx = k_interpolator_h(i, interpolator_var::dGx0dx ); + host_interp[i].dGx0dy = k_interpolator_h(i, interpolator_var::dGx0dy ); + host_interp[i].dGx0dz = k_interpolator_h(i, interpolator_var::dGx0dz ); + host_interp[i].d2Gx0dx = k_interpolator_h(i, interpolator_var::d2Gx0dx ); + host_interp[i].d2Gx0dy = k_interpolator_h(i, interpolator_var::d2Gx0dy ); + host_interp[i].d2Gx0dz = k_interpolator_h(i, interpolator_var::d2Gx0dz ); + host_interp[i].Gy0 = k_interpolator_h(i, interpolator_var::Gy0 ); + host_interp[i].dGy0dx = k_interpolator_h(i, interpolator_var::dGy0dx ); + host_interp[i].dGy0dy = k_interpolator_h(i, interpolator_var::dGy0dy ); + host_interp[i].dGy0dz = k_interpolator_h(i, interpolator_var::dGy0dz ); + host_interp[i].d2Gy0dx = k_interpolator_h(i, interpolator_var::d2Gy0dx ); + host_interp[i].d2Gy0dy = k_interpolator_h(i, interpolator_var::d2Gy0dy ); + host_interp[i].d2Gy0dz = k_interpolator_h(i, interpolator_var::d2Gy0dz ); + host_interp[i].Gz0 = k_interpolator_h(i, interpolator_var::Gz0 ); + host_interp[i].dGz0dx = k_interpolator_h(i, interpolator_var::dGz0dx ); + host_interp[i].dGz0dy = k_interpolator_h(i, interpolator_var::dGz0dy ); + host_interp[i].dGz0dz = k_interpolator_h(i, interpolator_var::dGz0dz ); + host_interp[i].d2Gz0dx = k_interpolator_h(i, interpolator_var::d2Gz0dx ); + host_interp[i].d2Gz0dy = k_interpolator_h(i, interpolator_var::d2Gz0dy ); + host_interp[i].d2Gz0dz = k_interpolator_h(i, interpolator_var::d2Gz0dz ); + #endif +#endif +#endif }); } @@ -416,24 +820,113 @@ interpolator_array_t::copy_to_device() { Kokkos::parallel_for("Copy interpolators to device", host_execution_policy(0, g->nv) , KOKKOS_LAMBDA (int i) { +#ifdef SHAPE_NGP k_interpolator_h(i, interpolator_var::ex) = host_interp[i].ex; k_interpolator_h(i, interpolator_var::ey) = host_interp[i].ey; k_interpolator_h(i, interpolator_var::ez) = host_interp[i].ez; - k_interpolator_h(i, interpolator_var::dexdy) = host_interp[i].dexdy; - k_interpolator_h(i, interpolator_var::dexdz) = host_interp[i].dexdz; - k_interpolator_h(i, interpolator_var::d2exdydz) = host_interp[i].d2exdydz; - k_interpolator_h(i, interpolator_var::deydz) = host_interp[i].deydz; - k_interpolator_h(i, interpolator_var::deydx) = host_interp[i].deydx; - k_interpolator_h(i, interpolator_var::d2eydzdx) = host_interp[i].d2eydzdx; - k_interpolator_h(i, interpolator_var::dezdx) = host_interp[i].dezdx; - k_interpolator_h(i, interpolator_var::dezdy) = host_interp[i].dezdy; - k_interpolator_h(i, interpolator_var::d2ezdxdy) = host_interp[i].d2ezdxdy; k_interpolator_h(i, interpolator_var::cbx) = host_interp[i].cbx; k_interpolator_h(i, interpolator_var::cby) = host_interp[i].cby; k_interpolator_h(i, interpolator_var::cbz) = host_interp[i].cbz; - k_interpolator_h(i, interpolator_var::dcbxdx) = host_interp[i].dcbxdx; - k_interpolator_h(i, interpolator_var::dcbydy) = host_interp[i].dcbydy; - k_interpolator_h(i, interpolator_var::dcbzdz) = host_interp[i].dcbzdz; + #ifdef EXTERNAL_FORCE + k_interpolator_h(i, interpolator_var::Ex0) = host_interp[i].Ex0; + k_interpolator_h(i, interpolator_var::Ey0) = host_interp[i].Ey0; + k_interpolator_h(i, interpolator_var::Ez0) = host_interp[i].Ez0; + k_interpolator_h(i, interpolator_var::Gx0) = host_interp[i].Gx0; + k_interpolator_h(i, interpolator_var::Gy0) = host_interp[i].Gy0; + k_interpolator_h(i, interpolator_var::Gz0) = host_interp[i].Gz0; + #endif +#else +#ifdef SHAPE_QS + k_interpolator_h(i, interpolator_var::ex ) = host_interp[i].ex ; + k_interpolator_h(i, interpolator_var::dexdx ) = host_interp[i].dexdx ; + k_interpolator_h(i, interpolator_var::dexdy ) = host_interp[i].dexdy ; + k_interpolator_h(i, interpolator_var::dexdz ) = host_interp[i].dexdz ; + k_interpolator_h(i, interpolator_var::d2exdx ) = host_interp[i].d2exdx ; + k_interpolator_h(i, interpolator_var::d2exdy ) = host_interp[i].d2exdy ; + k_interpolator_h(i, interpolator_var::d2exdz ) = host_interp[i].d2exdz ; + k_interpolator_h(i, interpolator_var::ey ) = host_interp[i].ey ; + k_interpolator_h(i, interpolator_var::deydx ) = host_interp[i].deydx ; + k_interpolator_h(i, interpolator_var::deydy ) = host_interp[i].deydy ; + k_interpolator_h(i, interpolator_var::deydz ) = host_interp[i].deydz ; + k_interpolator_h(i, interpolator_var::d2eydx ) = host_interp[i].d2eydx ; + k_interpolator_h(i, interpolator_var::d2eydy ) = host_interp[i].d2eydy ; + k_interpolator_h(i, interpolator_var::d2eydz ) = host_interp[i].d2eydz ; + k_interpolator_h(i, interpolator_var::ez ) = host_interp[i].ez ; + k_interpolator_h(i, interpolator_var::dezdx ) = host_interp[i].dezdx ; + k_interpolator_h(i, interpolator_var::dezdy ) = host_interp[i].dezdy ; + k_interpolator_h(i, interpolator_var::dezdz ) = host_interp[i].dezdz ; + k_interpolator_h(i, interpolator_var::d2ezdx ) = host_interp[i].d2ezdx ; + k_interpolator_h(i, interpolator_var::d2ezdy ) = host_interp[i].d2ezdy ; + k_interpolator_h(i, interpolator_var::d2ezdz ) = host_interp[i].d2ezdz ; + k_interpolator_h(i, interpolator_var::cbx ) = host_interp[i].cbx ; + k_interpolator_h(i, interpolator_var::dcbxdx ) = host_interp[i].dcbxdx ; + k_interpolator_h(i, interpolator_var::dcbxdy ) = host_interp[i].dcbxdy ; + k_interpolator_h(i, interpolator_var::dcbxdz ) = host_interp[i].dcbxdz ; + k_interpolator_h(i, interpolator_var::d2cbxdx ) = host_interp[i].d2cbxdx; + k_interpolator_h(i, interpolator_var::d2cbxdy ) = host_interp[i].d2cbxdy; + k_interpolator_h(i, interpolator_var::d2cbxdz ) = host_interp[i].d2cbxdz; + k_interpolator_h(i, interpolator_var::cby ) = host_interp[i].cby ; + k_interpolator_h(i, interpolator_var::dcbydx ) = host_interp[i].dcbydx ; + k_interpolator_h(i, interpolator_var::dcbydy ) = host_interp[i].dcbydy ; + k_interpolator_h(i, interpolator_var::dcbydz ) = host_interp[i].dcbydz ; + k_interpolator_h(i, interpolator_var::d2cbydx ) = host_interp[i].d2cbydx; + k_interpolator_h(i, interpolator_var::d2cbydy ) = host_interp[i].d2cbydy; + k_interpolator_h(i, interpolator_var::d2cbydz ) = host_interp[i].d2cbydz; + k_interpolator_h(i, interpolator_var::cbz ) = host_interp[i].cbz ; + k_interpolator_h(i, interpolator_var::dcbzdx ) = host_interp[i].dcbzdx ; + k_interpolator_h(i, interpolator_var::dcbzdy ) = host_interp[i].dcbzdy ; + k_interpolator_h(i, interpolator_var::dcbzdz ) = host_interp[i].dcbzdz ; + k_interpolator_h(i, interpolator_var::d2cbzdx ) = host_interp[i].d2cbzdx; + k_interpolator_h(i, interpolator_var::d2cbzdy ) = host_interp[i].d2cbzdy; + k_interpolator_h(i, interpolator_var::d2cbzdz ) = host_interp[i].d2cbzdz; + + #ifdef EXTERNAL_FORCE + k_interpolator_h(i, interpolator_var::Ex0 ) = host_interp[i].Ex0 ; + k_interpolator_h(i, interpolator_var::dEx0dx ) = host_interp[i].dEx0dx ; + k_interpolator_h(i, interpolator_var::dEx0dy ) = host_interp[i].dEx0dy ; + k_interpolator_h(i, interpolator_var::dEx0dz ) = host_interp[i].dEx0dz ; + k_interpolator_h(i, interpolator_var::d2Ex0dx ) = host_interp[i].d2Ex0dx; + k_interpolator_h(i, interpolator_var::d2Ex0dy ) = host_interp[i].d2Ex0dy; + k_interpolator_h(i, interpolator_var::d2Ex0dz ) = host_interp[i].d2Ex0dz; + k_interpolator_h(i, interpolator_var::Ey0 ) = host_interp[i].Ey0 ; + k_interpolator_h(i, interpolator_var::dEy0dx ) = host_interp[i].dEy0dx ; + k_interpolator_h(i, interpolator_var::dEy0dy ) = host_interp[i].dEy0dy ; + k_interpolator_h(i, interpolator_var::dEy0dz ) = host_interp[i].dEy0dz ; + k_interpolator_h(i, interpolator_var::d2Ey0dx ) = host_interp[i].d2Ey0dx; + k_interpolator_h(i, interpolator_var::d2Ey0dy ) = host_interp[i].d2Ey0dy; + k_interpolator_h(i, interpolator_var::d2Ey0dz ) = host_interp[i].d2Ey0dz; + k_interpolator_h(i, interpolator_var::Ez0 ) = host_interp[i].Ez0 ; + k_interpolator_h(i, interpolator_var::dEz0dx ) = host_interp[i].dEz0dx ; + k_interpolator_h(i, interpolator_var::dEz0dy ) = host_interp[i].dEz0dy ; + k_interpolator_h(i, interpolator_var::dEz0dz ) = host_interp[i].dEz0dz ; + k_interpolator_h(i, interpolator_var::d2Ez0dx ) = host_interp[i].d2Ez0dx; + k_interpolator_h(i, interpolator_var::d2Ez0dy ) = host_interp[i].d2Ez0dy; + k_interpolator_h(i, interpolator_var::d2Ez0dz ) = host_interp[i].d2Ez0dz; + + k_interpolator_h(i, interpolator_var::Gx0 ) = host_interp[i].Gx0 ; + k_interpolator_h(i, interpolator_var::dGx0dx ) = host_interp[i].dGx0dx ; + k_interpolator_h(i, interpolator_var::dGx0dy ) = host_interp[i].dGx0dy ; + k_interpolator_h(i, interpolator_var::dGx0dz ) = host_interp[i].dGx0dz ; + k_interpolator_h(i, interpolator_var::d2Gx0dx ) = host_interp[i].d2Gx0dx; + k_interpolator_h(i, interpolator_var::d2Gx0dy ) = host_interp[i].d2Gx0dy; + k_interpolator_h(i, interpolator_var::d2Gx0dz ) = host_interp[i].d2Gx0dz; + k_interpolator_h(i, interpolator_var::Gy0 ) = host_interp[i].Gy0 ; + k_interpolator_h(i, interpolator_var::dGy0dx ) = host_interp[i].dGy0dx ; + k_interpolator_h(i, interpolator_var::dGy0dy ) = host_interp[i].dGy0dy ; + k_interpolator_h(i, interpolator_var::dGy0dz ) = host_interp[i].dGy0dz ; + k_interpolator_h(i, interpolator_var::d2Gy0dx ) = host_interp[i].d2Gy0dx; + k_interpolator_h(i, interpolator_var::d2Gy0dy ) = host_interp[i].d2Gy0dy; + k_interpolator_h(i, interpolator_var::d2Gy0dz ) = host_interp[i].d2Gy0dz; + k_interpolator_h(i, interpolator_var::Gz0 ) = host_interp[i].Gz0 ; + k_interpolator_h(i, interpolator_var::dGz0dx ) = host_interp[i].dGz0dx ; + k_interpolator_h(i, interpolator_var::dGz0dy ) = host_interp[i].dGz0dy ; + k_interpolator_h(i, interpolator_var::dGz0dz ) = host_interp[i].dGz0dz ; + k_interpolator_h(i, interpolator_var::d2Gz0dx ) = host_interp[i].d2Gz0dx; + k_interpolator_h(i, interpolator_var::d2Gz0dy ) = host_interp[i].d2Gz0dy; + k_interpolator_h(i, interpolator_var::d2Gz0dz ) = host_interp[i].d2Gz0dz; + #endif +#endif +#endif }); Kokkos::deep_copy(k_i_d, k_i_h); diff --git a/src/sf_interface/sf_interface.h b/src/sf_interface/sf_interface.h index c5a30318..2202a38d 100644 --- a/src/sf_interface/sf_interface.h +++ b/src/sf_interface/sf_interface.h @@ -21,13 +21,44 @@ // fi(0,:,:) or fi(nx+1,:,:)) are not used. typedef struct interpolator { - float ex, dexdy, dexdz, d2exdydz; - float ey, deydz, deydx, d2eydzdx; - float ez, dezdx, dezdy, d2ezdxdy; - float cbx, dcbxdx; - float cby, dcbydy; - float cbz, dcbzdz; +#ifdef SHAPE_NGP + //float ex, dexdy, dexdz, d2exdydz; + //float ey, deydz, deydx, d2eydzdx; + //float ez, dezdx, dezdy, d2ezdxdy; + //float cbx, dcbxdx; + //float cby, dcbydy; + //float cbz, dcbzdz; + float ex, ey, ez; + float cbx, cby, cbz; + #ifdef EXTERNAL_FORCE + float Ex0, Ey0, Ez0; + float Gx0, Gy0, Gz0; + #else float _pad[2]; // 16-byte align + #endif +#else +#ifdef SHAPE_QS + // TODO(low-priority) TEST LAYOUT - is it better to interleave padding so ex,ey,ez;bx,by,bz + // are cleanly spaced on 32-byte boundaries, or only pad end of struct???? + // --ATr,2024nov08 + float ex, dexdx, dexdy, dexdz, d2exdx, d2exdy, d2exdz; + float ey, deydx, deydy, deydz, d2eydx, d2eydy, d2eydz; + float ez, dezdx, dezdy, dezdz, d2ezdx, d2ezdy, d2ezdz; + float cbx, dcbxdx, dcbxdy, dcbxdz, d2cbxdx, d2cbxdy, d2cbxdz; + float cby, dcbydx, dcbydy, dcbydz, d2cbydx, d2cbydy, d2cbydz; + float cbz, dcbzdx, dcbzdy, dcbzdz, d2cbzdx, d2cbzdy, d2cbzdz; + #ifdef EXTERNAL_FORCE + float Ex0, dEx0dx, dEx0dy, dEx0dz, d2Ex0dx, d2Ex0dy, d2Ex0dz; + float Ey0, dEy0dx, dEy0dy, dEy0dz, d2Ey0dx, d2Ey0dy, d2Ey0dz; + float Ez0, dEz0dx, dEz0dy, dEz0dz, d2Ez0dx, d2Ez0dy, d2Ez0dz; + float Gx0, dGx0dx, dGx0dy, dGx0dz, d2Gx0dx, d2Gx0dy, d2Gx0dz; + float Gy0, dGy0dx, dGy0dy, dGy0dz, d2Gy0dx, d2Gy0dy, d2Gy0dz; + float Gz0, dGz0dx, dGz0dy, dGz0dz, d2Gz0dx, d2Gz0dy, d2Gz0dz; + #else + float _pad[2]; // 16-byte align + #endif +#endif +#endif } interpolator_t; typedef struct interpolator_array { diff --git a/src/species_advance/species_advance.h b/src/species_advance/species_advance.h index 1d141642..4c11473e 100644 --- a/src/species_advance/species_advance.h +++ b/src/species_advance/species_advance.h @@ -461,14 +461,16 @@ move_p_kokkos( float s_dispx, s_dispy, s_dispz; float s_dir[3]; float v0, v1, v2, v3, v4, v5, q; + float w0, wx, wy, wz, wmx, wmy, wmz; int axis, face; int64_t neighbor; //int pi = int(local_pm_i); int pi = pm->i; float ux,uy,uz,u,absdisp,x_half,y_half,z_half,fracdt; - const float one=1., two=2., three=3.; + const float one=1., two=2., three=3., one_twelfth=1./12.; //const float gdx=g->dx, gdy=g->dy, gdz=g->dz, gdt=g->dt; const float rV = 1.0/gdx/gdy/gdz; + const float rV12 = rV*one_twelfth; // auto k_field_scatter_access = k_f_sa.access(); // auto accum_sa = accum_sv.access(); auto scatter_access = scatter_view.access(); @@ -476,10 +478,11 @@ move_p_kokkos( //printf("in move_p %d \n", pi); #ifdef VARIABLE_CHARGE - q = rV*p_q*p_w; + q = p_q*p_w; #else - q = rV*qsp*p_w; + q = qsp*p_w; #endif + //printf("in move %d \n", pi); for(;;) { @@ -525,21 +528,82 @@ move_p_kokkos( // Accumulate the particle current density - //int iii = ii; - //int zi = iii/((nx+2)*(ny+2)); - //iii -= zi*(nx+2)*(ny+2); - //int yi = iii/(nx+2); - //int xi = iii-yi*(nx+2); - //printf("move_p accumulate here"); //if (std::is_same::value) { - scatter_access(ii, field_var::jfx) += q*ux; - scatter_access(ii, field_var::jfy) += q*uy; - scatter_access(ii, field_var::jfz) += q*uz; - scatter_access(ii, field_var::rhof) += q; +#ifdef SHAPE_NGP + scatter_access(ii, field_var::jfx) += q*rV*ux; + scatter_access(ii, field_var::jfy) += q*rV*uy; + scatter_access(ii, field_var::jfz) += q*rV*uz; + scatter_access(ii, field_var::rhof) += q*rV; +#else +#ifdef SHAPE_QS + // stencil coefficients + // ... OLD hybrid-VPIC with QS shape, the accumulator stores + // ... ... p->w*qsp * two*(three - ...) + // ... ... hyb_unload_accumulator(...) applies factor rV/12. + // ... NEW HVPIC-K not using accumulator (yet), scatter directly to mesh, + // ... ... so include all factors + w0 = q*rV12*two*( three - x_half*x_half - y_half*y_half - z_half*z_half ); + wx = q*rV12*( x_half + one )*( x_half + one ); + wy = q*rV12*( y_half + one )*( y_half + one ); + wz = q*rV12*( z_half + one )*( z_half + one ); + wmx = q*rV12*( x_half - one )*( x_half - one ); + wmy = q*rV12*( y_half - one )*( y_half - one ); + wmz = q*rV12*( z_half - one )*( z_half - one ); + + // Voxel indices + int iii = ii; + int zi = iii/((nx+2)*(ny+2)); + iii -= zi*(nx+2)*(ny+2); + int yi = iii/(nx+2); + int xi = iii-yi*(nx+2); + // Neighboring voxel 1D (flattened) indices + int iix = VOXEL(xi+1,yi,zi,nx,ny,nz); + int iiy = VOXEL(xi,yi+1,zi,nx,ny,nz); + int iiz = VOXEL(xi,yi,zi+1,nx,ny,nz); + int iimx = VOXEL(xi-1,yi,zi,nx,ny,nz); + int iimy = VOXEL(xi,yi-1,zi,nx,ny,nz); + int iimz = VOXEL(xi,yi,zi-1,nx,ny,nz); + + scatter_access(ii, field_var::jfx) += w0*ux; + scatter_access(ii, field_var::jfy) += w0*uy; + scatter_access(ii, field_var::jfz) += w0*uz; + scatter_access(ii, field_var::rhof) += w0; + + scatter_access(iix, field_var::jfx) += wx*ux; + scatter_access(iix, field_var::jfy) += wx*uy; + scatter_access(iix, field_var::jfz) += wx*uz; + scatter_access(iix, field_var::rhof) += wx; + + scatter_access(iiy, field_var::jfx) += wy*ux; + scatter_access(iiy, field_var::jfy) += wy*uy; + scatter_access(iiy, field_var::jfz) += wy*uz; + scatter_access(iiy, field_var::rhof) += wy; + + scatter_access(iiz, field_var::jfx) += wz*ux; + scatter_access(iiz, field_var::jfy) += wz*uy; + scatter_access(iiz, field_var::jfz) += wz*uz; + scatter_access(iiz, field_var::rhof) += wz; + + scatter_access(iimx, field_var::jfx) += wmx*ux; + scatter_access(iimx, field_var::jfy) += wmx*uy; + scatter_access(iimx, field_var::jfz) += wmx*uz; + scatter_access(iimx, field_var::rhof) += wmx; + + scatter_access(iimy, field_var::jfx) += wmy*ux; + scatter_access(iimy, field_var::jfy) += wmy*uy; + scatter_access(iimy, field_var::jfz) += wmy*uz; + scatter_access(iimy, field_var::rhof) += wmy; + + scatter_access(iimz, field_var::jfx) += wmz*ux; + scatter_access(iimz, field_var::jfy) += wmz*uy; + scatter_access(iimz, field_var::jfz) += wmz*uz; + scatter_access(iimz, field_var::rhof) += wmz; +#endif // defined(SHAPE_QS) +#endif // defined(SHAPE_NGP) //} @@ -759,9 +823,10 @@ move_p_kokkos_host_serial( const int nz = g->nz; float ux,uy,uz,u,absdisp,x_half,y_half,z_half,fracdt; - const float one=1., two=2., three=3.; + const float one=1., two=2., three=3., one_twelfth=1./12.; const float gdx=g->dx, gdy=g->dy, gdz=g->dz, gdt=g->dt; const float rV = g->rdx * g->rdy * g->rdz; + const float rV12 = rV*one_twelfth; float cx = 0.25 * g->rdy * g->rdz / g->dt; float cy = 0.25 * g->rdz * g->rdx / g->dt; @@ -789,6 +854,7 @@ move_p_kokkos_host_serial( float s_dispx, s_dispy, s_dispz; float s_dir[3]; float v0, v1, v2, v3, v4, v5, q; + float w0, wx, wy, wz, wmx, wmy, wmz; int axis, face; int64_t neighbor; //int pi = int(local_pm_i); @@ -799,6 +865,7 @@ move_p_kokkos_host_serial( #else q = qsp*p_w; #endif + //printf("in move %d \n", pi); for(;;) { @@ -843,17 +910,79 @@ move_p_kokkos_host_serial( && -x_half<=one && -y_half<=one && -z_half<=one) { // Accumulate the particle current density - - //int iii = ii; - //int zi = iii/((nx+2)*(ny+2)); - //iii -= zi*(nx+2)*(ny+2); - //int yi = iii/(nx+2); - //int xi = iii-yi*(nx+2); - - k_jf_accum(ii, accumulator_var::jx) += rV*q*ux; - k_jf_accum(ii, accumulator_var::jy) += rV*q*uy; - k_jf_accum(ii, accumulator_var::jz) += rV*q*uz; - k_jf_accum(ii, accumulator_var::rho) += rV*q; + +#ifdef SHAPE_NGP + k_jf_accum(ii, accumulator_var::jx) += q*rV*ux; + k_jf_accum(ii, accumulator_var::jy) += q*rV*uy; + k_jf_accum(ii, accumulator_var::jz) += q*rV*uz; + k_jf_accum(ii, accumulator_var::rho) += q*rV; +#else +#ifdef SHAPE_QS + // stencil coefficients + // ... OLD hybrid-VPIC with QS shape, the accumulator stores + // ... ... p->w*qsp * two*(three - ...) + // ... ... hyb_unload_accumulator(...) applies factor rV/12. + // ... NEW HVPIC-K not using accumulator (yet), scatter directly to mesh, + // ... ... so include all factors + w0 = q*rV12*two*( three - x_half*x_half - y_half*y_half - z_half*z_half ); + wx = q*rV12*( x_half + one )*( x_half + one ); + wy = q*rV12*( y_half + one )*( y_half + one ); + wz = q*rV12*( z_half + one )*( z_half + one ); + wmx = q*rV12*( x_half - one )*( x_half - one ); + wmy = q*rV12*( y_half - one )*( y_half - one ); + wmz = q*rV12*( z_half - one )*( z_half - one ); + + // Voxel indices + int iii = ii; + int zi = iii/((nx+2)*(ny+2)); + iii -= zi*(nx+2)*(ny+2); + int yi = iii/(nx+2); + int xi = iii-yi*(nx+2); + // Neighboring voxel 1D (flattened) indices + int iix = VOXEL(xi+1,yi,zi,nx,ny,nz); + int iiy = VOXEL(xi,yi+1,zi,nx,ny,nz); + int iiz = VOXEL(xi,yi,zi+1,nx,ny,nz); + int iimx = VOXEL(xi-1,yi,zi,nx,ny,nz); + int iimy = VOXEL(xi,yi-1,zi,nx,ny,nz); + int iimz = VOXEL(xi,yi,zi-1,nx,ny,nz); + + k_jf_accum(ii, accumulator_var::jx) += w0*ux; + k_jf_accum(ii, accumulator_var::jy) += w0*uy; + k_jf_accum(ii, accumulator_var::jz) += w0*uz; + k_jf_accum(ii, accumulator_var::rho) += w0; + + k_jf_accum(iix, accumulator_var::jx) += wx*ux; + k_jf_accum(iix, accumulator_var::jy) += wx*uy; + k_jf_accum(iix, accumulator_var::jz) += wx*uz; + k_jf_accum(iix, accumulator_var::rho) += wx; + + k_jf_accum(iiy, accumulator_var::jx) += wy*ux; + k_jf_accum(iiy, accumulator_var::jy) += wy*uy; + k_jf_accum(iiy, accumulator_var::jz) += wy*uz; + k_jf_accum(iiy, accumulator_var::rho) += wy; + + k_jf_accum(iiz, accumulator_var::jx) += wz*ux; + k_jf_accum(iiz, accumulator_var::jy) += wz*uy; + k_jf_accum(iiz, accumulator_var::jz) += wz*uz; + k_jf_accum(iiz, accumulator_var::rho) += wz; + + k_jf_accum(iimx, accumulator_var::jx) += wmx*ux; + k_jf_accum(iimx, accumulator_var::jy) += wmx*uy; + k_jf_accum(iimx, accumulator_var::jz) += wmx*uz; + k_jf_accum(iimx, accumulator_var::rho) += wmx; + + k_jf_accum(iimy, accumulator_var::jx) += wmy*ux; + k_jf_accum(iimy, accumulator_var::jy) += wmy*uy; + k_jf_accum(iimy, accumulator_var::jz) += wmy*uz; + k_jf_accum(iimy, accumulator_var::rho) += wmy; + + k_jf_accum(iimz, accumulator_var::jx) += wmz*ux; + k_jf_accum(iimz, accumulator_var::jy) += wmz*uy; + k_jf_accum(iimz, accumulator_var::jz) += wmz*uz; + k_jf_accum(iimz, accumulator_var::rho) += wmz; +#endif // defined(SHAPE_QS) +#endif // defined(SHAPE_NGP) + } //if indbds } //ifmore than half dt left diff --git a/src/species_advance/standard/advance_p.cc b/src/species_advance/standard/advance_p.cc index 0cd97179..7e3f9386 100644 --- a/src/species_advance/standard/advance_p.cc +++ b/src/species_advance/standard/advance_p.cc @@ -17,25 +17,83 @@ accumulate_current(CurrentScatterAccess& current_sa, int ii, const float v0, const float v1, const float v2, const float v3, const float v4, const float v5, const float v6, const float v7, const float v8, const float v9, const float v10, const float v11) { -#ifdef VPIC_ENABLE_ACCUMULATORS - current_sa(ii, 0) += v0; - //current_sa(ii, 1) += cx*v1; - //current_sa(ii, 2) += cx*v2; - //current_sa(ii, 3) += cx*v3; - current_sa(ii, 4) += v1; - current_sa(ii, 8) += v2; - +#ifdef SHAPE_NGP +# ifdef VPIC_ENABLE_ACCUMULATORS + current_sa(ii, 0) += v0; + //current_sa(ii, 1) += cx*v1; + //current_sa(ii, 2) += cx*v2; + //current_sa(ii, 3) += cx*v3; + current_sa(ii, 4) += v1; + current_sa(ii, 8) += v2; +# else + int iii = ii; + int zi = iii/((nx+2)*(ny+2)); + iii -= zi*(nx+2)*(ny+2); + int yi = iii/(nx+2); + int xi = iii - yi*(nx+2); + current_sa(ii, field_var::jfx) += v0; + current_sa(ii, field_var::jfy) += v1; + current_sa(ii, field_var::jfz) += v2; + current_sa(ii, field_var::rhof) += v3; +# endif #else - int iii = ii; - int zi = iii/((nx+2)*(ny+2)); - iii -= zi*(nx+2)*(ny+2); - int yi = iii/(nx+2); - int xi = iii - yi*(nx+2); - current_sa(ii, field_var::jfx) += v0; - current_sa(ii, field_var::jfy) += v1; - current_sa(ii, field_var::jfz) += v2; - current_sa(ii, field_var::rhof) += v3; -#endif +#ifdef SHAPE_QS +# ifdef VPIC_ENABLE_ACCUMULATORS + // not handling VPIC_ENABLE_ACCUMULATORS case + // so fail loudly by not depositing anything + Kokkos::abort("shape_qs lacks support for VPIC_ENABLE_ACCUMULATORS"); +# else + // Voxel indices + int iii = ii; + int zi = iii/((nx+2)*(ny+2)); + iii -= zi*(nx+2)*(ny+2); + int yi = iii/(nx+2); + int xi = iii - yi*(nx+2); + // Neighboring voxel 1D (flattened) indices + int iix = VOXEL(xi+1,yi,zi,nx,ny,nz); + int iiy = VOXEL(xi,yi+1,zi,nx,ny,nz); + int iiz = VOXEL(xi,yi,zi+1,nx,ny,nz); + int iimx = VOXEL(xi-1,yi,zi,nx,ny,nz); + int iimy = VOXEL(xi,yi-1,zi,nx,ny,nz); + int iimz = VOXEL(xi,yi,zi-1,nx,ny,nz); + + current_sa(ii, field_var::jfx) += v3*v6; // w0*ux; + current_sa(ii, field_var::jfy) += v3*v7; // w0*uy; + current_sa(ii, field_var::jfz) += v3*v8; // w0*uz; + current_sa(ii, field_var::rhof) += v3; // w0; + + current_sa(iix, field_var::jfx) += v4*v6; // wx*ux; + current_sa(iix, field_var::jfy) += v4*v7; // wx*uy; + current_sa(iix, field_var::jfz) += v4*v8; // wx*uz; + current_sa(iix, field_var::rhof) += v4; // wx; + + current_sa(iiy, field_var::jfx) += v5*v6; // wy*ux; + current_sa(iiy, field_var::jfy) += v5*v7; // wy*uy; + current_sa(iiy, field_var::jfz) += v5*v8; // wy*uz; + current_sa(iiy, field_var::rhof) += v5; // wy; + + current_sa(iiz, field_var::jfx) += v9*v6; // wz*ux; + current_sa(iiz, field_var::jfy) += v9*v7; // wz*uy; + current_sa(iiz, field_var::jfz) += v9*v8; // wz*uz; + current_sa(iiz, field_var::rhof) += v9; // wz; + + current_sa(iimx, field_var::jfx) += v0*v6; // wmx*ux; + current_sa(iimx, field_var::jfy) += v0*v7; // wmx*uy; + current_sa(iimx, field_var::jfz) += v0*v8; // wmx*uz; + current_sa(iimx, field_var::rhof) += v0; // wmx; + + current_sa(iimy, field_var::jfx) += v1*v6; // wmy*ux; + current_sa(iimy, field_var::jfy) += v1*v7; // wmy*uy; + current_sa(iimy, field_var::jfz) += v1*v8; // wmy*uz; + current_sa(iimy, field_var::rhof) += v1; // wmy; + + current_sa(iimz, field_var::jfx) += v2*v6; // wmz*ux; + current_sa(iimz, field_var::jfy) += v2*v7; // wmz*uy; + current_sa(iimz, field_var::jfz) += v2*v8; // wmz*uz; + current_sa(iimz, field_var::rhof) += v2; // wmz; +# endif +#endif // defined(SHAPE_QS) +#endif // defined(SHAPE_NGP) } // Reduce the current for all active threads/lanes to reduce the number of writes to memory @@ -185,7 +243,7 @@ template KOKKOS_INLINE_FUNCTION void unrolled_simd_load(float* vals, const int* ii, const k_interpolator_t& k_interp, int len) { unrolled_simd_load(vals, ii, k_interp, len); - simd_load_interpolator_var(vals+(N-1)*18, ii[N-1], k_interp, len); + simd_load_interpolator_var(vals+(N-1)*INTERPOLATOR_VAR_COUNT, ii[N-1], k_interp, len); } template<> KOKKOS_INLINE_FUNCTION @@ -199,28 +257,18 @@ void unrolled_simd_load(float* vals, const int* ii, const k_interpolator_t& k_in } } +#ifdef SHAPE_NGP + // Load interpolators template KOKKOS_INLINE_FUNCTION void load_interpolators( float* fex, - float* fdexdy, - float* fdexdz, - float* fd2exdydz, float* fey, - float* fdeydz, - float* fdeydx, - float* fd2eydzdx, float* fez, - float* fdezdx, - float* fdezdy, - float* fd2ezdxdy, float* fcbx, - float* fdcbxdx, float* fcby, - float* fdcbydy, float* fcbz, - float* fdcbzdz, const int* ii, const int num_part, const k_interpolator_t& k_interp @@ -236,84 +284,266 @@ void load_interpolators( // Try to reduce the number of loads if all particles are in the same cell if(same_cell) { - float vals[18]; + float vals[INTERPOLATOR_VAR_COUNT]; - simd_load_interpolator_var(vals, ii[0], k_interp, 18); + simd_load_interpolator_var(vals, ii[0], k_interp, INTERPOLATOR_VAR_COUNT); #pragma omp simd for(int i=0; i(vals, ii, k_interp, 18); + float vals[INTERPOLATOR_VAR_COUNT*NumLanes]; + unrolled_simd_load(vals, ii, k_interp, INTERPOLATOR_VAR_COUNT, num_part); +// unrolled_simd_load(vals, ii, k_interp, INTERPOLATOR_VAR_COUNT); // Essentially a transpose #pragma omp simd for(int i=0; i +KOKKOS_INLINE_FUNCTION +void load_interpolators( + float* fex, + float* fdexdx, + float* fdexdy, + float* fdexdz, + float* fd2exdx, + float* fd2exdy, + float* fd2exdz, + float* fey, + float* fdeydx, + float* fdeydy, + float* fdeydz, + float* fd2eydx, + float* fd2eydy, + float* fd2eydz, + float* fez, + float* fdezdx, + float* fdezdy, + float* fdezdz, + float* fd2ezdx, + float* fd2ezdy, + float* fd2ezdz, + float* fcbx, + float* fdcbxdx, + float* fdcbxdy, + float* fdcbxdz, + float* fd2cbxdx, + float* fd2cbxdy, + float* fd2cbxdz, + float* fcby, + float* fdcbydx, + float* fdcbydy, + float* fdcbydz, + float* fd2cbydx, + float* fd2cbydy, + float* fd2cbydz, + float* fcbz, + float* fdcbzdx, + float* fdcbzdy, + float* fdcbzdz, + float* fd2cbzdx, + float* fd2cbzdy, + float* fd2cbzdz, + const int* ii, + const int num_part, + const k_interpolator_t& k_interp + ) { +#if defined(VPIC_ENABLE_VECTORIZATION) && !defined(USE_GPU) + int same_cell = 1; + for(int lane=0; lane(vals, ii, k_interp, INTERPOLATOR_VAR_COUNT); + + // Essentially a transpose + #pragma omp simd + for(int i=0; ik_f_d; float cx = 0.25 * g->rdy * g->rdz / g->dt; @@ -357,6 +590,12 @@ advance_p_kokkos_unified( float rV = g->rdx * g->rdy * g->rdz; float gdx=g->dx, gdy=g->dy, gdz=g->dz, gdt=g->dt; +#ifdef EXTERNAL_FORCE + // Don't futz with interpolator loading code that we won't even use + // --ATr,2026feb25 + Kokkos::abort("External force is not implemented for CPU particle advance"); +#endif + #define p_dx k_particles(p_index, particle_var::dx) #define p_dy k_particles(p_index, particle_var::dy) #define p_dz k_particles(p_index, particle_var::dz) @@ -369,29 +608,6 @@ advance_p_kokkos_unified( #endif #define pii k_particles_i(p_index) - #define f_cbx k_interp(ii[LANE], interpolator_var::cbx) - #define f_cby k_interp(ii[LANE], interpolator_var::cby) - #define f_cbz k_interp(ii[LANE], interpolator_var::cbz) - #define f_ex k_interp(ii[LANE], interpolator_var::ex) - #define f_ey k_interp(ii[LANE], interpolator_var::ey) - #define f_ez k_interp(ii[LANE], interpolator_var::ez) - - #define f_dexdy k_interp(ii[LANE], interpolator_var::dexdy) - #define f_dexdz k_interp(ii[LANE], interpolator_var::dexdz) - - #define f_d2exdydz k_interp(ii[LANE], interpolator_var::d2exdydz) - #define f_deydx k_interp(ii[LANE], interpolator_var::deydx) - #define f_deydz k_interp(ii[LANE], interpolator_var::deydz) - - #define f_d2eydzdx k_interp(ii[LANE], interpolator_var::d2eydzdx) - #define f_dezdx k_interp(ii[LANE], interpolator_var::dezdx) - #define f_dezdy k_interp(ii[LANE], interpolator_var::dezdy) - - #define f_d2ezdxdy k_interp(ii[LANE], interpolator_var::d2ezdxdy) - #define f_dcbxdx k_interp(ii[LANE], interpolator_var::dcbxdx) - #define f_dcbydy k_interp(ii[LANE], interpolator_var::dcbydy) - #define f_dcbzdz k_interp(ii[LANE], interpolator_var::dcbzdz) - auto rangel = g->rangel; auto rangeh = g->rangeh; @@ -476,33 +692,77 @@ advance_p_kokkos_unified( int inbnds[num_lanes]; //int midbnds[num_lanes]; - +#ifdef SHAPE_NGP float fcbx[num_lanes]; float fcby[num_lanes]; float fcbz[num_lanes]; float fex[num_lanes]; float fey[num_lanes]; float fez[num_lanes]; + float tmp0[num_lanes]; + float tmp1[num_lanes]; + float *v6 = fex; + float *v7 = fey; + float *v8 = fez; + float *v9 = fcbx; + float *v10 = fcby; + float *v11 = fcbz; + float *v12 = tmp0; + float *v13 = tmp1; +#else +#ifdef SHAPE_QS + float fex[num_lanes]; + float fdexdx[num_lanes]; float fdexdy[num_lanes]; float fdexdz[num_lanes]; - float fd2exdydz[num_lanes]; + float fd2exdx[num_lanes]; + float fd2exdy[num_lanes]; + float fd2exdz[num_lanes]; + float fey[num_lanes]; float fdeydx[num_lanes]; + float fdeydy[num_lanes]; float fdeydz[num_lanes]; - float fd2eydzdx[num_lanes]; + float fd2eydx[num_lanes]; + float fd2eydy[num_lanes]; + float fd2eydz[num_lanes]; + float fez[num_lanes]; float fdezdx[num_lanes]; float fdezdy[num_lanes]; - float fd2ezdxdy[num_lanes]; + float fdezdz[num_lanes]; + float fd2ezdx[num_lanes]; + float fd2ezdy[num_lanes]; + float fd2ezdz[num_lanes]; + float fcbx[num_lanes]; float fdcbxdx[num_lanes]; + float fdcbxdy[num_lanes]; + float fdcbxdz[num_lanes]; + float fd2cbxdx[num_lanes]; + float fd2cbxdy[num_lanes]; + float fd2cbxdz[num_lanes]; + float fcby[num_lanes]; + float fdcbydx[num_lanes]; float fdcbydy[num_lanes]; + float fdcbydz[num_lanes]; + float fd2cbydx[num_lanes]; + float fd2cbydy[num_lanes]; + float fd2cbydz[num_lanes]; + float fcbz[num_lanes]; + float fdcbzdx[num_lanes]; + float fdcbzdy[num_lanes]; float fdcbzdz[num_lanes]; + float fd2cbzdx[num_lanes]; + float fd2cbzdy[num_lanes]; + float fd2cbzdz[num_lanes]; float *v6 = fex; - float *v7 = fdexdy; - float *v8 = fdexdz; - float *v9 = fd2exdydz; + float *v7 = fdexdx; + float *v8 = fdexdy; + float *v9 = fdexdz; float *v10 = fey; - float *v11 = fdeydz; - float *v12 = fdeydx; - float *v13 = fd2eydzdx; + float *v11 = fdeydx; + float *v12 = fdeydy; + float *v13 = fdeydz; +#endif // defined(SHAPE_QS) +#endif // defined(SHAPE_NGP) size_t p_index = pi_offset; @@ -526,21 +786,29 @@ advance_p_kokkos_unified( ii[LANE] = pii; } END_VECTOR_BLOCK; - load_interpolators( fex, fdexdy, fdexdz, fd2exdydz, - fey, fdeydz, fdeydx, fd2eydzdx, - fez, fdezdx, fdezdy, fd2ezdxdy, - fcbx, fdcbxdx, - fcby, fdcbydy, - fcbz, fdcbzdz, +#ifdef SHAPE_NGP + load_interpolators( fex, fey, fez, fcbx, fcby, fcbz, + ii, num_particles, k_interp); +#else +#ifdef SHAPE_QS + load_interpolators( fex, fdexdx, fdexdy, fdexdz, fd2exdx, fd2exdy, fd2exdz, + fey, fdeydx, fdeydy, fdeydz, fd2eydx, fd2eydy, fd2eydz, + fez, fdezdx, fdezdy, fdezdz, fd2ezdx, fd2ezdy, fd2ezdz, + fcbx, fdcbxdx, fdcbxdy, fdcbxdz, fd2cbxdx, fd2cbxdy, fd2cbxdz, + fcby, fdcbydx, fdcbydy, fdcbydz, fd2cbydx, fd2cbydy, fd2cbydz, + fcbz, fdcbzdx, fdcbzdy, fdcbzdz, fd2cbzdx, fd2cbzdy, fd2cbzdz, ii, num_particles, k_interp); +#endif // defined(SHAPE_QS) +#endif // defined(SHAPE_NGP) BEGIN_VECTOR_BLOCK { +#ifdef SHAPE_NGP + // Interpolate E #ifdef VARIABLE_CHARGE - hax[LANE] = dt_2mc*qp[LANE]*( (fex[LANE] ) ); - hay[LANE] = dt_2mc*qp[LANE]*( (fey[LANE] ) ); - haz[LANE] = dt_2mc*qp[LANE]*( (fez[LANE] ) ); + hax[LANE] = dt_2mc*qp[LANE]*( (fex[LANE] ) ); + hay[LANE] = dt_2mc*qp[LANE]*( (fey[LANE] ) ); + haz[LANE] = dt_2mc*qp[LANE]*( (fez[LANE] ) ); #else - // Interpolate E hax[LANE] = qdt_2mc*( (fex[LANE] ) ); hay[LANE] = qdt_2mc*( (fey[LANE] ) ); haz[LANE] = qdt_2mc*( (fez[LANE] ) ); @@ -549,6 +817,45 @@ advance_p_kokkos_unified( cbx[LANE] = fcbx[LANE];// + dx[LANE]*fdcbxdx[LANE]; cby[LANE] = fcby[LANE];// + dy[LANE]*fdcbydy[LANE]; cbz[LANE] = fcbz[LANE];// + dz[LANE]*fdcbzdz[LANE]; +#else +#ifdef SHAPE_QS + // Interpolate E +#ifdef VARIABLE_CHARGE + hax[LANE] = dt_2mc*qp[LANE]*( fex[LANE] + + dx[LANE]*( fdexdx[LANE] + dx[LANE]*fd2exdx[LANE] ) + + dy[LANE]*( fdexdy[LANE] + dy[LANE]*fd2exdy[LANE] ) + + dz[LANE]*( fdexdz[LANE] + dz[LANE]*fd2exdz[LANE] ) ); + hay[LANE] = dt_2mc*qp[LANE]*( fey[LANE] + + dx[LANE]*( fdeydx[LANE] + dx[LANE]*fd2eydx[LANE] ) + + dy[LANE]*( fdeydy[LANE] + dy[LANE]*fd2eydy[LANE] ) + + dz[LANE]*( fdeydz[LANE] + dz[LANE]*fd2eydz[LANE] ) ); + haz[LANE] = dt_2mc*qp[LANE]*( fez[LANE] + + dx[LANE]*( fdezdx[LANE] + dx[LANE]*fd2ezdx[LANE] ) + + dy[LANE]*( fdezdy[LANE] + dy[LANE]*fd2ezdy[LANE] ) + + dz[LANE]*( fdezdz[LANE] + dz[LANE]*fd2ezdz[LANE] ) ); +#else + hax[LANE] = qdt_2mc*( fex[LANE] + dx[LANE]*( fdexdx[LANE] + dx[LANE]*fd2exdx[LANE] ) + + dy[LANE]*( fdexdy[LANE] + dy[LANE]*fd2exdy[LANE] ) + + dz[LANE]*( fdexdz[LANE] + dz[LANE]*fd2exdz[LANE] ) ); + hay[LANE] = qdt_2mc*( fey[LANE] + dx[LANE]*( fdeydx[LANE] + dx[LANE]*fd2eydx[LANE] ) + + dy[LANE]*( fdeydy[LANE] + dy[LANE]*fd2eydy[LANE] ) + + dz[LANE]*( fdeydz[LANE] + dz[LANE]*fd2eydz[LANE] ) ); + haz[LANE] = qdt_2mc*( fez[LANE] + dx[LANE]*( fdezdx[LANE] + dx[LANE]*fd2ezdx[LANE] ) + + dy[LANE]*( fdezdy[LANE] + dy[LANE]*fd2ezdy[LANE] ) + + dz[LANE]*( fdezdz[LANE] + dz[LANE]*fd2ezdz[LANE] ) ); +#endif + // Interpolate B + cbx[LANE] = fcbx[LANE] + dx[LANE]*( fdcbxdx[LANE] + dx[LANE]*fd2cbxdx[LANE] ) + + dy[LANE]*( fdcbxdy[LANE] + dy[LANE]*fd2cbxdy[LANE] ) + + dz[LANE]*( fdcbxdz[LANE] + dz[LANE]*fd2cbxdz[LANE] ); + cby[LANE] = fcby[LANE] + dx[LANE]*( fdcbydx[LANE] + dx[LANE]*fd2cbydx[LANE] ) + + dy[LANE]*( fdcbydy[LANE] + dy[LANE]*fd2cbydy[LANE] ) + + dz[LANE]*( fdcbydz[LANE] + dz[LANE]*fd2cbydz[LANE] ); + cbz[LANE] = fcbz[LANE] + dx[LANE]*( fdcbzdx[LANE] + dx[LANE]*fd2cbzdx[LANE] ) + + dy[LANE]*( fdcbzdy[LANE] + dy[LANE]*fd2cbzdy[LANE] ) + + dz[LANE]*( fdcbzdz[LANE] + dz[LANE]*fd2cbzdz[LANE] ); +#endif // defined(SHAPE_QS) +#endif // defined(SHAPE_NGP) // Half advance e ux[LANE] += hax[LANE]; @@ -646,14 +953,23 @@ advance_p_kokkos_unified( //dz[LANE] = v2[LANE]; //v5[LANE] = q[LANE]*ux[LANE]*uy[LANE]*uz[LANE]*one_third; -# define ACCUMULATE_J() \ - v0[LANE] = q[LANE]*v6[LANE]; /* = q ux */ \ - v1[LANE] = q[LANE]*v7[LANE]; /* = q uy */ \ - v2[LANE] = q[LANE]*v8[LANE]; /* = q uz */ \ - v3[LANE] = q[LANE]; /* v2 = q */ \ - - ACCUMULATE_J(); - +#ifdef SHAPE_NGP + v0[LANE] = q[LANE]*v6[LANE]; // q*ux + v1[LANE] = q[LANE]*v7[LANE]; // q*uy + v2[LANE] = q[LANE]*v8[LANE]; // q*uz + v3[LANE] = q[LANE]; +#else +#ifdef SHAPE_QS + q[LANE] *= one_twelfth; + v3[LANE] = q[LANE]*two*( three - v0[LANE]*v0[LANE] - v1[LANE]*v1[LANE] - v2[LANE]*v2[LANE] ); // w0 + v4[LANE] = q[LANE]*( v0[LANE] + one )*( v0[LANE] + one ); // wx + v5[LANE] = q[LANE]*( v1[LANE] + one )*( v1[LANE] + one ); // wy + v9[LANE] = q[LANE]*( v2[LANE] + one )*( v2[LANE] + one ); // wz + v0[LANE] = q[LANE]*( v0[LANE] - one )*( v0[LANE] - one ); // wmx // re-use due to limited number of slots + v1[LANE] = q[LANE]*( v1[LANE] - one )*( v1[LANE] - one ); // wmy // in accumulate_current(...) signature + v2[LANE] = q[LANE]*( v2[LANE] - one )*( v2[LANE] - one ); // wmz +#endif +#endif } END_VECTOR_BLOCK; @@ -671,8 +987,8 @@ advance_p_kokkos_unified( accumulate_current(current_sa, ii[LANE], nx, ny, nz, rV, v0[LANE], v1[LANE], v2[LANE], v3[LANE], - v6[LANE], v7[LANE], v8[LANE], v9[LANE], - v10[LANE], v11[LANE], v12[LANE], v13[LANE]); + v4[LANE], v5[LANE], v6[LANE], v7[LANE], + v8[LANE], v9[LANE], v10[LANE], v11[LANE]); } END_VECTOR_BLOCK; #ifdef VPIC_ENABLE_TEAM_REDUCTION } @@ -767,29 +1083,6 @@ advance_p_kokkos_unified( #endif #undef pii - -#undef f_cbx -#undef f_cby -#undef f_cbz -#undef f_ex -#undef f_ey -#undef f_ez - -#undef f_dexdy -#undef f_dexdz - -#undef f_d2exdydz -#undef f_deydx -#undef f_deydz - -#undef f_d2eydzdx -#undef f_dezdx -#undef f_dezdy - -#undef f_d2ezdxdy -#undef f_dcbxdx -#undef f_dcbydy -#undef f_dcbzdz } void @@ -822,17 +1115,24 @@ advance_p_kokkos_gpu( const int nz) { + constexpr float three = 3.; + constexpr float two = 2.; constexpr float one = 1.; constexpr float one_third = 1./3.; constexpr float two_fifteenths = 2./15.; + constexpr float one_twelfth = 1./12.; k_field_t k_field = fa->k_f_d; k_field_sa_t k_f_sv = Kokkos::Experimental::create_scatter_view<>(k_field); float cx = 0.25 * g->rdy * g->rdz / g->dt; float cy = 0.25 * g->rdz * g->rdx / g->dt; float cz = 0.25 * g->rdx * g->rdy / g->dt; float rV = g->rdx*g->rdy*g->rdz; + float rV12 = rV*one_twelfth; float gdx=g->dx, gdy=g->dy, gdz = g->dz, gdt = g->dt; +#ifdef EXTERNAL_FORCE + const float dt_2c = (g->dt)/(2*g->cvac); +#endif // Process particles for this pipeline @@ -848,28 +1148,92 @@ advance_p_kokkos_gpu( #endif #define pii k_particles_i(p_index) - #define f_cbx k_interp(ii, interpolator_var::cbx) - #define f_cby k_interp(ii, interpolator_var::cby) - #define f_cbz k_interp(ii, interpolator_var::cbz) - #define f_ex k_interp(ii, interpolator_var::ex) - #define f_ey k_interp(ii, interpolator_var::ey) - #define f_ez k_interp(ii, interpolator_var::ez) - + #define f_ex k_interp(ii, interpolator_var::ex) + #define f_dexdx k_interp(ii, interpolator_var::dexdx) #define f_dexdy k_interp(ii, interpolator_var::dexdy) #define f_dexdz k_interp(ii, interpolator_var::dexdz) - - #define f_d2exdydz k_interp(ii, interpolator_var::d2exdydz) + #define f_d2exdx k_interp(ii, interpolator_var::d2exdx) + #define f_d2exdy k_interp(ii, interpolator_var::d2exdy) + #define f_d2exdz k_interp(ii, interpolator_var::d2exdz) + #define f_ey k_interp(ii, interpolator_var::ey) #define f_deydx k_interp(ii, interpolator_var::deydx) + #define f_deydy k_interp(ii, interpolator_var::deydy) #define f_deydz k_interp(ii, interpolator_var::deydz) - - #define f_d2eydzdx k_interp(ii, interpolator_var::d2eydzdx) + #define f_d2eydx k_interp(ii, interpolator_var::d2eydx) + #define f_d2eydy k_interp(ii, interpolator_var::d2eydy) + #define f_d2eydz k_interp(ii, interpolator_var::d2eydz) + #define f_ez k_interp(ii, interpolator_var::ez) #define f_dezdx k_interp(ii, interpolator_var::dezdx) #define f_dezdy k_interp(ii, interpolator_var::dezdy) - - #define f_d2ezdxdy k_interp(ii, interpolator_var::d2ezdxdy) + #define f_dezdz k_interp(ii, interpolator_var::dezdz) + #define f_d2ezdx k_interp(ii, interpolator_var::d2ezdx) + #define f_d2ezdy k_interp(ii, interpolator_var::d2ezdy) + #define f_d2ezdz k_interp(ii, interpolator_var::d2ezdz) + #define f_cbx k_interp(ii, interpolator_var::cbx) #define f_dcbxdx k_interp(ii, interpolator_var::dcbxdx) + #define f_dcbxdy k_interp(ii, interpolator_var::dcbxdy) + #define f_dcbxdz k_interp(ii, interpolator_var::dcbxdz) + #define f_d2cbxdx k_interp(ii, interpolator_var::d2cbxdx) + #define f_d2cbxdy k_interp(ii, interpolator_var::d2cbxdy) + #define f_d2cbxdz k_interp(ii, interpolator_var::d2cbxdz) + #define f_cby k_interp(ii, interpolator_var::cby) + #define f_dcbydx k_interp(ii, interpolator_var::dcbydx) #define f_dcbydy k_interp(ii, interpolator_var::dcbydy) + #define f_dcbydz k_interp(ii, interpolator_var::dcbydz) + #define f_d2cbydx k_interp(ii, interpolator_var::d2cbydx) + #define f_d2cbydy k_interp(ii, interpolator_var::d2cbydy) + #define f_d2cbydz k_interp(ii, interpolator_var::d2cbydz) + #define f_cbz k_interp(ii, interpolator_var::cbz) + #define f_dcbzdx k_interp(ii, interpolator_var::dcbzdx) + #define f_dcbzdy k_interp(ii, interpolator_var::dcbzdy) #define f_dcbzdz k_interp(ii, interpolator_var::dcbzdz) + #define f_d2cbzdx k_interp(ii, interpolator_var::d2cbzdx) + #define f_d2cbzdy k_interp(ii, interpolator_var::d2cbzdy) + #define f_d2cbzdz k_interp(ii, interpolator_var::d2cbzdz) + + #define f_Ex0 k_interp(ii, interpolator_var::Ex0) + #define f_dEx0dx k_interp(ii, interpolator_var::dEx0dx) + #define f_dEx0dy k_interp(ii, interpolator_var::dEx0dy) + #define f_dEx0dz k_interp(ii, interpolator_var::dEx0dz) + #define f_d2Ex0dx k_interp(ii, interpolator_var::d2Ex0dx) + #define f_d2Ex0dy k_interp(ii, interpolator_var::d2Ex0dy) + #define f_d2Ex0dz k_interp(ii, interpolator_var::d2Ex0dz) + #define f_Ey0 k_interp(ii, interpolator_var::Ey0) + #define f_dEy0dx k_interp(ii, interpolator_var::dEy0dx) + #define f_dEy0dy k_interp(ii, interpolator_var::dEy0dy) + #define f_dEy0dz k_interp(ii, interpolator_var::dEy0dz) + #define f_d2Ey0dx k_interp(ii, interpolator_var::d2Ey0dx) + #define f_d2Ey0dy k_interp(ii, interpolator_var::d2Ey0dy) + #define f_d2Ey0dz k_interp(ii, interpolator_var::d2Ey0dz) + #define f_Ez0 k_interp(ii, interpolator_var::Ez0) + #define f_dEz0dx k_interp(ii, interpolator_var::dEz0dx) + #define f_dEz0dy k_interp(ii, interpolator_var::dEz0dy) + #define f_dEz0dz k_interp(ii, interpolator_var::dEz0dz) + #define f_d2Ez0dx k_interp(ii, interpolator_var::d2Ez0dx) + #define f_d2Ez0dy k_interp(ii, interpolator_var::d2Ez0dy) + #define f_d2Ez0dz k_interp(ii, interpolator_var::d2Ez0dz) + + #define f_Gx0 k_interp(ii, interpolator_var::Gx0) + #define f_dGx0dx k_interp(ii, interpolator_var::dGx0dx) + #define f_dGx0dy k_interp(ii, interpolator_var::dGx0dy) + #define f_dGx0dz k_interp(ii, interpolator_var::dGx0dz) + #define f_d2Gx0dx k_interp(ii, interpolator_var::d2Gx0dx) + #define f_d2Gx0dy k_interp(ii, interpolator_var::d2Gx0dy) + #define f_d2Gx0dz k_interp(ii, interpolator_var::d2Gx0dz) + #define f_Gy0 k_interp(ii, interpolator_var::Gy0) + #define f_dGy0dx k_interp(ii, interpolator_var::dGy0dx) + #define f_dGy0dy k_interp(ii, interpolator_var::dGy0dy) + #define f_dGy0dz k_interp(ii, interpolator_var::dGy0dz) + #define f_d2Gy0dx k_interp(ii, interpolator_var::d2Gy0dx) + #define f_d2Gy0dy k_interp(ii, interpolator_var::d2Gy0dy) + #define f_d2Gy0dz k_interp(ii, interpolator_var::d2Gy0dz) + #define f_Gz0 k_interp(ii, interpolator_var::Gz0) + #define f_dGz0dx k_interp(ii, interpolator_var::dGz0dx) + #define f_dGz0dy k_interp(ii, interpolator_var::dGz0dy) + #define f_dGz0dz k_interp(ii, interpolator_var::dGz0dz) + #define f_d2Gz0dx k_interp(ii, interpolator_var::d2Gz0dx) + #define f_d2Gz0dy k_interp(ii, interpolator_var::d2Gz0dy) + #define f_d2Gz0dz k_interp(ii, interpolator_var::d2Gz0dz) // copy local memmbers from grid //auto nfaces_per_voxel = 6; @@ -900,6 +1264,7 @@ advance_p_kokkos_gpu( #endif float v0, v1, v2, v3, v4, v5, v6; + float w0, wx, wy, wz, wmx, wmy, wmz; auto k_field_scatter_access = k_f_sv.access(); #ifdef VARIABLE_CHARGE @@ -911,13 +1276,72 @@ advance_p_kokkos_gpu( float dy = p_dy; float dz = p_dz; int ii = pii; +#ifdef SHAPE_NGP + #ifdef EXTERNAL_FORCE + float hax = qdt_2mc*( f_ex + f_Ex0 ) + dt_2c * f_Gx0; + float hay = qdt_2mc*( f_ey + f_Ey0 ) + dt_2c * f_Gy0; + float haz = qdt_2mc*( f_ez + f_Ez0 ) + dt_2c * f_Gz0; + #else float hax = qdt_2mc*( ( f_ex ) ); float hay = qdt_2mc*( ( f_ey ) ); float haz = qdt_2mc*( ( f_ez ) ); - + #endif float cbx = f_cbx;// + dx*f_dcbxdx; // Interpolate B float cby = f_cby;// + dy*f_dcbydy; float cbz = f_cbz;// + dz*f_dcbzdz; +#elif defined( SHAPE_QS ) + #ifdef EXTERNAL_FORCE + // Interpolate E, E0, G0 + float hax = qdt_2mc*( f_ex + dx*( f_dexdx + dx*f_d2exdx ) + + dy*( f_dexdy + dy*f_d2exdy ) + + dz*( f_dexdz + dz*f_d2exdz ) + + f_Ex0 + dx*( f_dEx0dx + dx*f_d2Ex0dx ) + + dy*( f_dEx0dy + dy*f_d2Ex0dy ) + + dz*( f_dEx0dz + dz*f_d2Ex0dz ) ); + float hay = qdt_2mc*( f_ey + dx*( f_deydx + dx*f_d2eydx ) + + dy*( f_deydy + dy*f_d2eydy ) + + dz*( f_deydz + dz*f_d2eydz ) + + f_Ey0 + dx*( f_dEy0dx + dx*f_d2Ey0dx ) + + dy*( f_dEy0dy + dy*f_d2Ey0dy ) + + dz*( f_dEy0dz + dz*f_d2Ey0dz ) ); + float haz = qdt_2mc*( f_ez + dx*( f_dezdx + dx*f_d2ezdx ) + + dy*( f_dezdy + dy*f_d2ezdy ) + + dz*( f_dezdz + dz*f_d2ezdz ) + + f_Ez0 + dx*( f_dEz0dx + dx*f_d2Ez0dx ) + + dy*( f_dEz0dy + dy*f_d2Ez0dy ) + + dz*( f_dEz0dz + dz*f_d2Ez0dz ) ); + hax += dt_2c *( f_Gx0 + dx*( f_dGx0dx + dx*f_d2Gx0dx ) + + dy*( f_dGx0dy + dy*f_d2Gx0dy ) + + dz*( f_dGx0dz + dz*f_d2Gx0dz ) ); + hay += dt_2c *( f_Gy0 + dx*( f_dGy0dx + dx*f_d2Gy0dx ) + + dy*( f_dGy0dy + dy*f_d2Gy0dy ) + + dz*( f_dGy0dz + dz*f_d2Gy0dz ) ); + haz += dt_2c *( f_Gz0 + dx*( f_dGz0dx + dx*f_d2Gz0dx ) + + dy*( f_dGz0dy + dy*f_d2Gz0dy ) + + dz*( f_dGz0dz + dz*f_d2Gz0dz ) ); + #else + // Interpolate E + float hax = qdt_2mc*( f_ex + dx*( f_dexdx + dx*f_d2exdx ) + + dy*( f_dexdy + dy*f_d2exdy ) + + dz*( f_dexdz + dz*f_d2exdz ) ); + float hay = qdt_2mc*( f_ey + dx*( f_deydx + dx*f_d2eydx ) + + dy*( f_deydy + dy*f_d2eydy ) + + dz*( f_deydz + dz*f_d2eydz ) ); + float haz = qdt_2mc*( f_ez + dx*( f_dezdx + dx*f_d2ezdx ) + + dy*( f_dezdy + dy*f_d2ezdy ) + + dz*( f_dezdz + dz*f_d2ezdz ) ); + #endif + // Interpolate B + float cbx = f_cbx + dx*( f_dcbxdx + dx*f_d2cbxdx ) + + dy*( f_dcbxdy + dy*f_d2cbxdy ) + + dz*( f_dcbxdz + dz*f_d2cbxdz ); + float cby = f_cby + dx*( f_dcbydx + dx*f_d2cbydx ) + + dy*( f_dcbydy + dy*f_d2cbydy ) + + dz*( f_dcbydz + dz*f_d2cbydz ); + float cbz = f_cbz + dx*( f_dcbzdx + dx*f_d2cbzdx ) + + dy*( f_dcbzdy + dy*f_d2cbzdy ) + + dz*( f_dcbzdz + dz*f_d2cbzdz ); +#endif float ux = p_ux; // Load momentum float uy = p_uy; float uz = p_uz; @@ -1040,10 +1464,78 @@ advance_p_kokkos_gpu( //int yi = iii/(nx+2); //int xi = iii - yi*(nx+2); - k_field_scatter_access(ii, field_var::jfx) += q*rV*ux; - k_field_scatter_access(ii, field_var::jfy) += q*rV*uy; - k_field_scatter_access(ii, field_var::jfz) += q*rV*uz; - k_field_scatter_access(ii, field_var::rhof) += q*rV; +#ifdef SHAPE_NGP + q *= rV; + k_field_scatter_access(ii, field_var::jfx) += q*ux; + k_field_scatter_access(ii, field_var::jfy) += q*uy; + k_field_scatter_access(ii, field_var::jfz) += q*uz; + k_field_scatter_access(ii, field_var::rhof) += q; +#elif defined( SHAPE_QS ) + // stencil coefficients + // ... OLD hybrid-VPIC with QS shape, the accumulator stores + // ... ... p->w*qsp * two*(three - ...) + // ... ... hyb_unload_accumulator(...) applies factor rV/12. + // ... NEW HVPIC-K not using accumulator (yet), scatter directly to mesh, + // ... ... so include all factors + q *= rV12; + w0 = q*two*( three - v0*v0 - v1*v1 - v2*v2 ); + wx = q*( v0 + one )*( v0 + one ); + wy = q*( v1 + one )*( v1 + one ); + wz = q*( v2 + one )*( v2 + one ); + wmx = q*( v0 - one )*( v0 - one ); + wmy = q*( v1 - one )*( v1 - one ); + wmz = q*( v2 - one )*( v2 - one ); + + // Voxel indices + int iii = ii; + int zi = iii/((nx+2)*(ny+2)); + iii -= zi*(nx+2)*(ny+2); + int yi = iii/(nx+2); + int xi = iii - yi*(nx+2); + // Neighboring voxel 1D (flattened) indices + int iix = VOXEL(xi+1,yi,zi,nx,ny,nz); + int iiy = VOXEL(xi,yi+1,zi,nx,ny,nz); + int iiz = VOXEL(xi,yi,zi+1,nx,ny,nz); + int iimx = VOXEL(xi-1,yi,zi,nx,ny,nz); + int iimy = VOXEL(xi,yi-1,zi,nx,ny,nz); + int iimz = VOXEL(xi,yi,zi-1,nx,ny,nz); + + k_field_scatter_access(ii, field_var::jfx) += w0*ux; + k_field_scatter_access(ii, field_var::jfy) += w0*uy; + k_field_scatter_access(ii, field_var::jfz) += w0*uz; + k_field_scatter_access(ii, field_var::rhof) += w0; + + k_field_scatter_access(iix, field_var::jfx) += wx*ux; + k_field_scatter_access(iix, field_var::jfy) += wx*uy; + k_field_scatter_access(iix, field_var::jfz) += wx*uz; + k_field_scatter_access(iix, field_var::rhof) += wx; + + k_field_scatter_access(iiy, field_var::jfx) += wy*ux; + k_field_scatter_access(iiy, field_var::jfy) += wy*uy; + k_field_scatter_access(iiy, field_var::jfz) += wy*uz; + k_field_scatter_access(iiy, field_var::rhof) += wy; + + k_field_scatter_access(iiz, field_var::jfx) += wz*ux; + k_field_scatter_access(iiz, field_var::jfy) += wz*uy; + k_field_scatter_access(iiz, field_var::jfz) += wz*uz; + k_field_scatter_access(iiz, field_var::rhof) += wz; + + k_field_scatter_access(iimx, field_var::jfx) += wmx*ux; + k_field_scatter_access(iimx, field_var::jfy) += wmx*uy; + k_field_scatter_access(iimx, field_var::jfz) += wmx*uz; + k_field_scatter_access(iimx, field_var::rhof) += wmx; + + k_field_scatter_access(iimy, field_var::jfx) += wmy*ux; + k_field_scatter_access(iimy, field_var::jfy) += wmy*uy; + k_field_scatter_access(iimy, field_var::jfz) += wmy*uz; + k_field_scatter_access(iimy, field_var::rhof) += wmy; + + k_field_scatter_access(iimz, field_var::jfx) += wmz*ux; + k_field_scatter_access(iimz, field_var::jfy) += wmz*uy; + k_field_scatter_access(iimz, field_var::jfz) += wmz*uz; + k_field_scatter_access(iimz, field_var::rhof) += wmz; + +#endif } else { @@ -1115,6 +1607,93 @@ advance_p_kokkos_gpu( //delete(k_local_particle_movers_p); //return h_nm(0); + #undef f_ex + #undef f_dexdx + #undef f_dexdy + #undef f_dexdz + #undef f_d2exdx + #undef f_d2exdy + #undef f_d2exdz + #undef f_ey + #undef f_deydx + #undef f_deydy + #undef f_deydz + #undef f_d2eydx + #undef f_d2eydy + #undef f_d2eydz + #undef f_ez + #undef f_dezdx + #undef f_dezdy + #undef f_dezdz + #undef f_d2ezdx + #undef f_d2ezdy + #undef f_d2ezdz + #undef f_cbx + #undef f_dcbxdx + #undef f_dcbxdy + #undef f_dcbxdz + #undef f_d2cbxdx + #undef f_d2cbxdy + #undef f_d2cbxdz + #undef f_cby + #undef f_dcbydx + #undef f_dcbydy + #undef f_dcbydz + #undef f_d2cbydx + #undef f_d2cbydy + #undef f_d2cbydz + #undef f_cbz + #undef f_dcbzdx + #undef f_dcbzdy + #undef f_dcbzdz + #undef f_d2cbzdx + #undef f_d2cbzdy + #undef f_d2cbzdz + + #undef f_Ex0 + #undef f_dEx0dx + #undef f_dEx0dy + #undef f_dEx0dz + #undef f_d2Ex0dx + #undef f_d2Ex0dy + #undef f_d2Ex0dz + #undef f_Ey0 + #undef f_dEy0dx + #undef f_dEy0dy + #undef f_dEy0dz + #undef f_d2Ey0dx + #undef f_d2Ey0dy + #undef f_d2Ey0dz + #undef f_Ez0 + #undef f_dEz0dx + #undef f_dEz0dy + #undef f_dEz0dz + #undef f_d2Ez0dx + #undef f_d2Ez0dy + #undef f_d2Ez0dz + + #undef f_Gx0 + #undef f_dGx0dx + #undef f_dGx0dy + #undef f_dGx0dz + #undef f_d2Gx0dx + #undef f_d2Gx0dy + #undef f_d2Gx0dz + #undef f_Gy0 + #undef f_dGy0dx + #undef f_dGy0dy + #undef f_dGy0dz + #undef f_d2Gy0dx + #undef f_d2Gy0dy + #undef f_d2Gy0dz + #undef f_Gz0 + #undef f_dGz0dx + #undef f_dGz0dy + #undef f_dGz0dz + #undef f_d2Gz0dx + #undef f_d2Gz0dy + #undef f_d2Gz0dz + } //advance_p_kokkos_gpu void @@ -1153,7 +1732,8 @@ advance_p( /**/ species_t * RESTRICT sp, #define ADVANCE_P advance_p_kokkos_gpu #else // Portable kernel with additional vectorization options - #define ADVANCE_P advance_p_kokkos_unified + //#define ADVANCE_P advance_p_kokkos_unified + #define ADVANCE_P advance_p_kokkos_gpu #endif KOKKOS_TIC(); ADVANCE_P( diff --git a/src/species_advance/standard/center_p.cc b/src/species_advance/standard/center_p.cc index 5f43a248..9c95e995 100644 --- a/src/species_advance/standard/center_p.cc +++ b/src/species_advance/standard/center_p.cc @@ -18,6 +18,9 @@ center_p_pipeline( center_p_pipeline_args_t * args, float qp; float qdt_2mc; float qdt_4mc; +#endif +#ifdef EXTERNAL_FORCE + const float dt_2c = args->dt_2c; #endif const float one = 1.; const float one_third = 1./3.; @@ -46,13 +49,72 @@ center_p_pipeline( center_p_pipeline_args_t * args, qdt_4mc = 0.5*qdt_2mc; #endif ii = p->i; - f = f0 + ii; // Interpolate E - hax = qdt_2mc*( ( f->ex ) ); + f = f0 + ii; // Load interpolator +#ifdef SHAPE_NGP + #ifdef EXTERNAL_FORCE + hax = qdt_2mc*( f->ex + f->Ex0 ) + dt_2c * f->Gx0; // Interpolate E, E0, G0 + hay = qdt_2mc*( f->ey + f->Ey0 ) + dt_2c * f->Gy0; + haz = qdt_2mc*( f->ez + f->Ez0 ) + dt_2c * f->Gz0; + #else + hax = qdt_2mc*( ( f->ex ) ); // Interpolate E hay = qdt_2mc*( ( f->ey ) ); haz = qdt_2mc*( ( f->ez ) ); + #endif cbx = f->cbx;// + dx*f->dcbxdx; // Interpolate B cby = f->cby;// + dy*f->dcbydy; cbz = f->cbz;// + dz*f->dcbzdz; +#else +#ifdef SHAPE_QS + #ifdef EXTERNAL_FORCE + hax = qdt_2mc*( f->ex + dx*( f->dexdx + dx*f->d2exdx ) // Interpolate E, E0 + + dy*( f->dexdy + dy*f->d2exdy ) + + dz*( f->dexdz + dz*f->d2exdz ) + + f->Ex0 + dx*( f->dEx0dx + dx*f->d2Ex0dx ) + + dy*( f->dEx0dy + dy*f->d2Ex0dy ) + + dz*( f->dEx0dz + dz*f->d2Ex0dz ) ); + hay = qdt_2mc*( f->ey + dx*( f->deydx + dx*f->d2eydx ) + + dy*( f->deydy + dy*f->d2eydy ) + + dz*( f->deydz + dz*f->d2eydz ) + + f->Ey0 + dx*( f->dEy0dx + dx*f->d2Ey0dx ) + + dy*( f->dEy0dy + dy*f->d2Ey0dy ) + + dz*( f->dEy0dz + dz*f->d2Ey0dz ) ); + haz = qdt_2mc*( f->ez + dx*( f->dezdx + dx*f->d2ezdx ) + + dy*( f->dezdy + dy*f->d2ezdy ) + + dz*( f->dezdz + dz*f->d2ezdz ) + + f->Ez0 + dx*( f->dEz0dx + dx*f->d2Ez0dx ) + + dy*( f->dEz0dy + dy*f->d2Ez0dy ) + + dz*( f->dEz0dz + dz*f->d2Ez0dz ) ); + hax += dt_2c *( f->Gx0 + dx*( f->dGx0dx + dx*f->d2Gx0dx ) // Interpolate G0 + + dy*( f->dGx0dy + dy*f->d2Gx0dy ) + + dz*( f->dGx0dz + dz*f->d2Gx0dz ) ); + hay += dt_2c *( f->Gy0 + dx*( f->dGy0dx + dx*f->d2Gy0dx ) + + dy*( f->dGy0dy + dy*f->d2Gy0dy ) + + dz*( f->dGy0dz + dz*f->d2Gy0dz ) ); + haz += dt_2c *( f->Gz0 + dx*( f->dGz0dx + dx*f->d2Gz0dx ) + + dy*( f->dGz0dy + dy*f->d2Gz0dy ) + + dz*( f->dGz0dz + dz*f->d2Gz0dz ) ); + #else + hax = qdt_2mc*( f->ex + dx*( f->dexdx + dx*f->d2exdx ) // Interpolate E + + dy*( f->dexdy + dy*f->d2exdy ) + + dz*( f->dexdz + dz*f->d2exdz ) ); + hay = qdt_2mc*( f->ey + dx*( f->deydx + dx*f->d2eydx ) + + dy*( f->deydy + dy*f->d2eydy ) + + dz*( f->deydz + dz*f->d2eydz ) ); + haz = qdt_2mc*( f->ez + dx*( f->dezdx + dx*f->d2ezdx ) + + dy*( f->dezdy + dy*f->d2ezdy ) + + dz*( f->dezdz + dz*f->d2ezdz ) ); + #endif + cbx = f->cbx + dx*( f->dcbxdx + dx*f->d2cbxdx ) // Interpolate B + + dy*( f->dcbxdy + dy*f->d2cbxdy ) + + dz*( f->dcbxdz + dz*f->d2cbxdz ); + cby = f->cby + dx*( f->dcbydx + dx*f->d2cbydx ) + + dy*( f->dcbydy + dy*f->d2cbydy ) + + dz*( f->dcbydz + dz*f->d2cbydz ); + cbz = f->cbz + dx*( f->dcbzdx + dx*f->d2cbzdx ) + + dy*( f->dcbzdy + dy*f->d2cbzdy ) + + dz*( f->dcbzdz + dz*f->d2cbzdz ); +#endif +#endif ux = p->ux; // Load momentum uy = p->uy; uz = p->uz; @@ -172,6 +234,9 @@ center_p( /**/ species_t * RESTRICT sp, args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); #endif args->np = sp->np; +#ifdef EXTERNAL_FORCE + args->dt_2c = (sp->g->dt)/(2*sp->g->cvac); +#endif EXEC_PIPELINES( center_p, args, 0 ); WAIT_PIPELINES(); diff --git a/src/species_advance/standard/energy_p.cc b/src/species_advance/standard/energy_p.cc index bc42f7ed..7487f722 100644 --- a/src/species_advance/standard/energy_p.cc +++ b/src/species_advance/standard/energy_p.cc @@ -11,6 +11,9 @@ energy_p_pipeline( energy_p_pipeline_args_t * RESTRICT args, const particle_t * RESTRICT ALIGNED(32) p = args->p; const float qdt_2mc = args->qdt_2mc; const float msp = args->msp; +#ifdef EXTERNAL_FORCE + const float dt_2c = args->dt_2c; +#endif const float one = 1; float dx, dy, dz; @@ -31,12 +34,59 @@ energy_p_pipeline( energy_p_pipeline_args_t * RESTRICT args, dy = p[n].dy; dz = p[n].dz; i = p[n].i; - v0 = p[n].ux + qdt_2mc*( ( f[i].ex + dy*f[i].dexdy ) + - dz*( f[i].dexdz + dy*f[i].d2exdydz ) ); - v1 = p[n].uy + qdt_2mc*( ( f[i].ey + dz*f[i].deydz ) + - dx*( f[i].deydx + dz*f[i].d2eydzdx ) ); - v2 = p[n].uz + qdt_2mc*( ( f[i].ez + dx*f[i].dezdx ) + - dy*( f[i].dezdy + dx*f[i].d2ezdxdy ) ); +#ifdef SHAPE_NGP + #ifdef EXTERNAL_FORCE + v0 = p[n].ux + qdt_2mc * ( f[i].ex + f[i].Ex0 ) + dt_2c * f[i].Gx0; + v1 = p[n].uy + qdt_2mc * ( f[i].ey + f[i].Ey0 ) + dt_2c * f[i].Gy0; + v2 = p[n].uz + qdt_2mc * ( f[i].ez + f[i].Ez0 ) + dt_2c * f[i].Gz0; + #else + v0 = p[n].ux + qdt_2mc * f[i].ex; + v1 = p[n].uy + qdt_2mc * f[i].ey; + v2 = p[n].uz + qdt_2mc * f[i].ez; + #endif +#else +#ifdef SHAPE_QS + #ifdef EXTERNAL_FORCE + v0 = p[n].ux + qdt_2mc*( f[i].ex + dx*( f[i].dexdx + dx*f[i].d2exdx ) + + dy*( f[i].dexdy + dy*f[i].d2exdy ) + + dz*( f[i].dexdz + dz*f[i].d2exdz ) + + f[i].Ex0 + dx*( f[i].dEx0dx + dx*f[i].d2Ex0dx ) + + dy*( f[i].dEx0dy + dy*f[i].d2Ex0dy ) + + dz*( f[i].dEx0dz + dz*f[i].d2Ex0dz ) ); + v1 = p[n].uy + qdt_2mc*( f[i].ey + dx*( f[i].deydx + dx*f[i].d2eydx ) + + dy*( f[i].deydy + dy*f[i].d2eydy ) + + dz*( f[i].deydz + dz*f[i].d2eydz ) + + f[i].Ey0 + dx*( f[i].dEy0dx + dx*f[i].d2Ey0dx ) + + dy*( f[i].dEy0dy + dy*f[i].d2Ey0dy ) + + dz*( f[i].dEy0dz + dz*f[i].d2Ey0dz ) ); + v2 = p[n].uz + qdt_2mc*( f[i].ez + dx*( f[i].dezdx + dx*f[i].d2ezdx ) + + dy*( f[i].dezdy + dy*f[i].d2ezdy ) + + dz*( f[i].dezdz + dz*f[i].d2ezdz ) + + f[i].Ez0 + dx*( f[i].dEz0dx + dx*f[i].d2Ez0dx ) + + dy*( f[i].dEz0dy + dy*f[i].d2Ez0dy ) + + dz*( f[i].dEz0dz + dz*f[i].d2Ez0dz ) ); + v0 += dt_2c *( f[i].Gx0 + dx*( f[i].dGx0dx + dx*f[i].d2Gx0dx ) + + dy*( f[i].dGx0dy + dy*f[i].d2Gx0dy ) + + dz*( f[i].dGx0dz + dz*f[i].d2Gx0dz ) ); + v1 += dt_2c *( f[i].Gy0 + dx*( f[i].dGy0dx + dx*f[i].d2Gy0dx ) + + dy*( f[i].dGy0dy + dy*f[i].d2Gy0dy ) + + dz*( f[i].dGy0dz + dz*f[i].d2Gy0dz ) ); + v2 += dt_2c *( f[i].Gz0 + dx*( f[i].dGz0dx + dx*f[i].d2Gz0dx ) + + dy*( f[i].dGz0dy + dy*f[i].d2Gz0dy ) + + dz*( f[i].dGz0dz + dz*f[i].d2Gz0dz ) ); + #else + v0 = p[n].ux + qdt_2mc*( f[i].ex + dx*( f[i].dexdx + dx*f[i].d2exdx ) + + dy*( f[i].dexdy + dy*f[i].d2exdy ) + + dz*( f[i].dexdz + dz*f[i].d2exdz ) ); + v1 = p[n].uy + qdt_2mc*( f[i].ey + dx*( f[i].deydx + dx*f[i].d2eydx ) + + dy*( f[i].deydy + dy*f[i].d2eydy ) + + dz*( f[i].deydz + dz*f[i].d2eydz ) ); + v2 = p[n].uz + qdt_2mc*( f[i].ez + dx*( f[i].dezdx + dx*f[i].d2ezdx ) + + dy*( f[i].dezdy + dy*f[i].d2ezdy ) + + dz*( f[i].dezdz + dz*f[i].d2ezdz ) ); + #endif +#endif +#endif v0 = v0*v0 + v1*v1 + v2*v2; v0 = (msp * p[n].w) * (v0 / (one + sqrtf(one + v0))); en += (double)v0; @@ -119,7 +169,7 @@ energy_p_pipeline_v4( energy_p_pipeline_args_t * args, #endif double -energy_p_kernel(const k_interpolator_t& k_interp, const k_particles_t& k_particles, const k_particles_i_t& k_particles_i, const float qdt_2mc, const float msp, const int np) { +energy_p_kernel(const k_interpolator_t& k_interp, const k_particles_t& k_particles, const k_particles_i_t& k_particles_i, const float qdt_2mc, const float dt_2c, const float msp, const int np) { // const interpolator_t * RESTRICT ALIGNED(128) f = args->f; // const particle_t * RESTRICT ALIGNED(32) p = args->p; // const float qdt_2mc = args->qdt_2mc; @@ -157,24 +207,208 @@ energy_p_kernel(const k_interpolator_t& k_interp, const k_particles_t& k_particl en += (double)v0; } */ + + #define f_ex k_interp(ii, interpolator_var::ex) + #define f_dexdx k_interp(ii, interpolator_var::dexdx) + #define f_dexdy k_interp(ii, interpolator_var::dexdy) + #define f_dexdz k_interp(ii, interpolator_var::dexdz) + #define f_d2exdx k_interp(ii, interpolator_var::d2exdx) + #define f_d2exdy k_interp(ii, interpolator_var::d2exdy) + #define f_d2exdz k_interp(ii, interpolator_var::d2exdz) + #define f_ey k_interp(ii, interpolator_var::ey) + #define f_deydx k_interp(ii, interpolator_var::deydx) + #define f_deydy k_interp(ii, interpolator_var::deydy) + #define f_deydz k_interp(ii, interpolator_var::deydz) + #define f_d2eydx k_interp(ii, interpolator_var::d2eydx) + #define f_d2eydy k_interp(ii, interpolator_var::d2eydy) + #define f_d2eydz k_interp(ii, interpolator_var::d2eydz) + #define f_ez k_interp(ii, interpolator_var::ez) + #define f_dezdx k_interp(ii, interpolator_var::dezdx) + #define f_dezdy k_interp(ii, interpolator_var::dezdy) + #define f_dezdz k_interp(ii, interpolator_var::dezdz) + #define f_d2ezdx k_interp(ii, interpolator_var::d2ezdx) + #define f_d2ezdy k_interp(ii, interpolator_var::d2ezdy) + #define f_d2ezdz k_interp(ii, interpolator_var::d2ezdz) + + #define f_Ex0 k_interp(ii, interpolator_var::Ex0) + #define f_dEx0dx k_interp(ii, interpolator_var::dEx0dx) + #define f_dEx0dy k_interp(ii, interpolator_var::dEx0dy) + #define f_dEx0dz k_interp(ii, interpolator_var::dEx0dz) + #define f_d2Ex0dx k_interp(ii, interpolator_var::d2Ex0dx) + #define f_d2Ex0dy k_interp(ii, interpolator_var::d2Ex0dy) + #define f_d2Ex0dz k_interp(ii, interpolator_var::d2Ex0dz) + #define f_Ey0 k_interp(ii, interpolator_var::Ey0) + #define f_dEy0dx k_interp(ii, interpolator_var::dEy0dx) + #define f_dEy0dy k_interp(ii, interpolator_var::dEy0dy) + #define f_dEy0dz k_interp(ii, interpolator_var::dEy0dz) + #define f_d2Ey0dx k_interp(ii, interpolator_var::d2Ey0dx) + #define f_d2Ey0dy k_interp(ii, interpolator_var::d2Ey0dy) + #define f_d2Ey0dz k_interp(ii, interpolator_var::d2Ey0dz) + #define f_Ez0 k_interp(ii, interpolator_var::Ez0) + #define f_dEz0dx k_interp(ii, interpolator_var::dEz0dx) + #define f_dEz0dy k_interp(ii, interpolator_var::dEz0dy) + #define f_dEz0dz k_interp(ii, interpolator_var::dEz0dz) + #define f_d2Ez0dx k_interp(ii, interpolator_var::d2Ez0dx) + #define f_d2Ez0dy k_interp(ii, interpolator_var::d2Ez0dy) + #define f_d2Ez0dz k_interp(ii, interpolator_var::d2Ez0dz) + + #define f_Gx0 k_interp(ii, interpolator_var::Gx0) + #define f_dGx0dx k_interp(ii, interpolator_var::dGx0dx) + #define f_dGx0dy k_interp(ii, interpolator_var::dGx0dy) + #define f_dGx0dz k_interp(ii, interpolator_var::dGx0dz) + #define f_d2Gx0dx k_interp(ii, interpolator_var::d2Gx0dx) + #define f_d2Gx0dy k_interp(ii, interpolator_var::d2Gx0dy) + #define f_d2Gx0dz k_interp(ii, interpolator_var::d2Gx0dz) + #define f_Gy0 k_interp(ii, interpolator_var::Gy0) + #define f_dGy0dx k_interp(ii, interpolator_var::dGy0dx) + #define f_dGy0dy k_interp(ii, interpolator_var::dGy0dy) + #define f_dGy0dz k_interp(ii, interpolator_var::dGy0dz) + #define f_d2Gy0dx k_interp(ii, interpolator_var::d2Gy0dx) + #define f_d2Gy0dy k_interp(ii, interpolator_var::d2Gy0dy) + #define f_d2Gy0dz k_interp(ii, interpolator_var::d2Gy0dz) + #define f_Gz0 k_interp(ii, interpolator_var::Gz0) + #define f_dGz0dx k_interp(ii, interpolator_var::dGz0dx) + #define f_dGz0dy k_interp(ii, interpolator_var::dGz0dy) + #define f_dGz0dz k_interp(ii, interpolator_var::dGz0dz) + #define f_d2Gz0dx k_interp(ii, interpolator_var::d2Gz0dx) + #define f_d2Gz0dy k_interp(ii, interpolator_var::d2Gz0dy) + #define f_d2Gz0dz k_interp(ii, interpolator_var::d2Gz0dz) + Kokkos::parallel_reduce(np, KOKKOS_LAMBDA(const int n, double& update) { + float ux = k_particles(n, particle_var::ux); + float uy = k_particles(n, particle_var::uy); + float uz = k_particles(n, particle_var::uz); float dx = k_particles(n, particle_var::dx); float dy = k_particles(n, particle_var::dy); float dz = k_particles(n, particle_var::dz); - int i = k_particles_i(n); - float v0 = k_particles(n, particle_var::ux) + qdt_2mc*( ( k_interp(i, interpolator_var::ex) + dy*k_interp(i, interpolator_var::dexdy) ) + - dz*( k_interp(i, interpolator_var::dexdz) + dy*k_interp(i, interpolator_var::d2exdydz) ) ); - float v1 = k_particles(n, particle_var::uy) + qdt_2mc*( ( k_interp(i, interpolator_var::ey) + dz*k_interp(i, interpolator_var::deydz) ) + - dx*( k_interp(i, interpolator_var::deydx) + dz*k_interp(i, interpolator_var::d2eydzdx) ) ); - float v2 = k_particles(n, particle_var::uz) + qdt_2mc*( ( k_interp(i, interpolator_var::ez) + dx*k_interp(i, interpolator_var::dezdx) ) + - dy*( k_interp(i, interpolator_var::dezdy) + dx*k_interp(i, interpolator_var::d2ezdxdy) ) ); + int ii = k_particles_i(n); +#ifdef SHAPE_NGP + #ifdef EXTERNAL_FORCE + float v0 = ux + qdt_2mc * (f_ex + f_Ex0) + dt_2c * f_Gx0; + float v1 = uy + qdt_2mc * (f_ey + f_Ey0) + dt_2c * f_Gy0; + float v2 = uz + qdt_2mc * (f_ez + f_Ez0) + dt_2c * f_Gz0; + #else + float v0 = ux + qdt_2mc * f_ex; + float v1 = uy + qdt_2mc * f_ey; + float v2 = uz + qdt_2mc * f_ez; + #endif +#else +#ifdef SHAPE_QS + // Interpolate E + #ifdef EXTERNAL_FORCE + float v0 = ux + qdt_2mc*( f_ex + dx*( f_dexdx + dx*f_d2exdx ) + + dy*( f_dexdy + dy*f_d2exdy ) + + dz*( f_dexdz + dz*f_d2exdz ) + + f_Ex0 + dx*( f_dEx0dx + dx*f_d2Ex0dx ) + + dy*( f_dEx0dy + dy*f_d2Ex0dy ) + + dz*( f_dEx0dz + dz*f_d2Ex0dz ) ); + float v1 = uy + qdt_2mc*( f_ey + dx*( f_deydx + dx*f_d2eydx ) + + dy*( f_deydy + dy*f_d2eydy ) + + dz*( f_deydz + dz*f_d2eydz ) + + f_Ey0 + dx*( f_dEy0dx + dx*f_d2Ey0dx ) + + dy*( f_dEy0dy + dy*f_d2Ey0dy ) + + dz*( f_dEy0dz + dz*f_d2Ey0dz ) ); + float v2 = uz + qdt_2mc*( f_ez + dx*( f_dezdx + dx*f_d2ezdx ) + + dy*( f_dezdy + dy*f_d2ezdy ) + + dz*( f_dezdz + dz*f_d2ezdz ) + + f_Ez0 + dx*( f_dEz0dx + dx*f_d2Ez0dx ) + + dy*( f_dEz0dy + dy*f_d2Ez0dy ) + + dz*( f_dEz0dz + dz*f_d2Ez0dz ) ); + v0 += dt_2c *( f_Gx0 + dx*( f_dGx0dx + dx*f_d2Gx0dx ) + + dy*( f_dGx0dy + dy*f_d2Gx0dy ) + + dz*( f_dGx0dz + dz*f_d2Gx0dz ) ); + v1 += dt_2c *( f_Gy0 + dx*( f_dGy0dx + dx*f_d2Gy0dx ) + + dy*( f_dGy0dy + dy*f_d2Gy0dy ) + + dz*( f_dGy0dz + dz*f_d2Gy0dz ) ); + v2 += dt_2c *( f_Gz0 + dx*( f_dGz0dx + dx*f_d2Gz0dx ) + + dy*( f_dGz0dy + dy*f_d2Gz0dy ) + + dz*( f_dGz0dz + dz*f_d2Gz0dz ) ); + #else + float v0 = ux + qdt_2mc*( f_ex + dx*( f_dexdx + dx*f_d2exdx ) + + dy*( f_dexdy + dy*f_d2exdy ) + + dz*( f_dexdz + dz*f_d2exdz ) ); + float v1 = uy + qdt_2mc*( f_ey + dx*( f_deydx + dx*f_d2eydx ) + + dy*( f_deydy + dy*f_d2eydy ) + + dz*( f_deydz + dz*f_d2eydz ) ); + float v2 = uz + qdt_2mc*( f_ez + dx*( f_dezdx + dx*f_d2ezdx ) + + dy*( f_dezdy + dy*f_d2ezdy ) + + dz*( f_dezdz + dz*f_d2ezdz ) ); + #endif +#endif +#endif v0 = v0*v0 + v1*v1 + v2*v2; //v0 = (msp * k_particles(n, particle_var::w)) * (v0 / (1 + sqrtf(1 + v0))); // Relativistic kinetic energy v0 *= 0.5 * (msp * k_particles(n, particle_var::w)); // Non-relativistic kinetic energy update += static_cast(v0); }, en); return en; -} + + #undef f_ex + #undef f_dexdx + #undef f_dexdy + #undef f_dexdz + #undef f_d2exdx + #undef f_d2exdy + #undef f_d2exdz + #undef f_ey + #undef f_deydx + #undef f_deydy + #undef f_deydz + #undef f_d2eydx + #undef f_d2eydy + #undef f_d2eydz + #undef f_ez + #undef f_dezdx + #undef f_dezdy + #undef f_dezdz + #undef f_d2ezdx + #undef f_d2ezdy + #undef f_d2ezdz + + #undef f_Ex0 + #undef f_dEx0dx + #undef f_dEx0dy + #undef f_dEx0dz + #undef f_d2Ex0dx + #undef f_d2Ex0dy + #undef f_d2Ex0dz + #undef f_Ey0 + #undef f_dEy0dx + #undef f_dEy0dy + #undef f_dEy0dz + #undef f_d2Ey0dx + #undef f_d2Ey0dy + #undef f_d2Ey0dz + #undef f_Ez0 + #undef f_dEz0dx + #undef f_dEz0dy + #undef f_dEz0dz + #undef f_d2Ez0dx + #undef f_d2Ez0dy + #undef f_d2Ez0dz + + #undef f_Gx0 + #undef f_dGx0dx + #undef f_dGx0dy + #undef f_dGx0dz + #undef f_d2Gx0dx + #undef f_d2Gx0dy + #undef f_d2Gx0dz + #undef f_Gy0 + #undef f_dGy0dx + #undef f_dGy0dy + #undef f_dGy0dz + #undef f_d2Gy0dx + #undef f_d2Gy0dy + #undef f_d2Gy0dz + #undef f_Gz0 + #undef f_dGz0dx + #undef f_dGz0dy + #undef f_dGz0dz + #undef f_d2Gz0dx + #undef f_d2Gz0dy + #undef f_d2Gz0dz +} // energy_p_kernel(...) double energy_p( const species_t * RESTRICT sp, @@ -195,6 +429,9 @@ energy_p( const species_t * RESTRICT sp, args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); args->msp = sp->m; args->np = sp->np; +#ifdef EXTERNAL_FORCE + args->dt_2c = (sp->g->dt)/(2*sp->g->cvac); +#endif EXEC_PIPELINES( energy_p, args, 0 ); WAIT_PIPELINES(); @@ -213,8 +450,9 @@ energy_p_kokkos(const species_t* RESTRICT sp, if(!sp || !ia || sp->g != ia->g) ERROR(("Bad args")); float qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); + float dt_2c = (sp->g->dt)/(2*sp->g->cvac); - local = energy_p_kernel(ia->k_i_d, sp->k_p_d, sp->k_p_i_d, qdt_2mc, sp->m, sp->np); + local = energy_p_kernel(ia->k_i_d, sp->k_p_d, sp->k_p_i_d, qdt_2mc, dt_2c, sp->m, sp->np); Kokkos::fence(); mp_allsum_d( &local, &global, 1 ); diff --git a/src/species_advance/standard/hydro_p.cc b/src/species_advance/standard/hydro_p.cc index acdf3093..99fd5af7 100644 --- a/src/species_advance/standard/hydro_p.cc +++ b/src/species_advance/standard/hydro_p.cc @@ -30,6 +30,9 @@ accumulate_hydro_p( hydro_array_t * RESTRICT ha, const particle_t * RESTRICT ALIGNED(128) p; const interpolator_t * RESTRICT ALIGNED(128) f; float c, qsp, msp, qdt_2mc, qdt_4mc, rV; +#ifdef EXTERNAL_FORCE + float dt_2c; +#endif int np, stride_10, stride_21, stride_43; float dx, dy, dz, ux, uy, uz, w; @@ -73,15 +76,77 @@ accumulate_hydro_p( hydro_array_t * RESTRICT ha, uz = p[n].uz; w = p[n].w; +#ifdef SHAPE_NGP + #ifdef EXTERNAL_FORCE + // Half advance E, E0, G0 + ux += qdt_2mc*( f[i].ex + f[i].Ex0 ) + dt_2c * f[i].Gx0; + uy += qdt_2mc*( f[i].ey + f[i].Ey0 ) + dt_2c * f[i].Gy0; + uz += qdt_2mc*( f[i].ez + f[i].Ez0 ) + dt_2c * f[i].Gz0; + #else // Half advance E - ux += qdt_2mc*((f[i].ex)); //+dy*f[i].dexdy) + dz*(f[i].dexdz+dy*f[i].d2exdydz)); - uy += qdt_2mc*((f[i].ey)); //+dz*f[i].deydz) + dx*(f[i].deydx+dz*f[i].d2eydzdx)); - uz += qdt_2mc*((f[i].ez)); //+dx*f[i].dezdx) + dy*(f[i].dezdy+dx*f[i].d2ezdxdy)); - + ux += qdt_2mc*f[i].ex; + uy += qdt_2mc*f[i].ey; + uz += qdt_2mc*f[i].ez; + #endif + // Boris rotation - Interpolate B field + w5 = f[i].cbx; + w6 = f[i].cby; + w7 = f[i].cbz; +#else +#ifdef SHAPE_QS + #ifdef EXTERNAL_FORCE + // Half advance E, E0, G0 + ux += qdt_2mc*( f[i].ex + dx*( f[i].dexdx + dx*f[i].d2exdx ) + + dy*( f[i].dexdy + dy*f[i].d2exdy ) + + dz*( f[i].dexdz + dz*f[i].d2exdz ) + + f[i].Ex0 + dx*( f[i].dEx0dx + dx*f[i].d2Ex0dx ) + + dy*( f[i].dEx0dy + dy*f[i].d2Ex0dy ) + + dz*( f[i].dEx0dz + dz*f[i].d2Ex0dz ) ); + uy += qdt_2mc*( f[i].ey + dx*( f[i].deydx + dx*f[i].d2eydx ) + + dy*( f[i].deydy + dy*f[i].d2eydy ) + + dz*( f[i].deydz + dz*f[i].d2eydz ) + + f[i].Ey0 + dx*( f[i].dEy0dx + dx*f[i].d2Ey0dx ) + + dy*( f[i].dEy0dy + dy*f[i].d2Ey0dy ) + + dz*( f[i].dEy0dz + dz*f[i].d2Ey0dz ) ); + uz += qdt_2mc*( f[i].ez + dx*( f[i].dezdx + dx*f[i].d2ezdx ) + + dy*( f[i].dezdy + dy*f[i].d2ezdy ) + + dz*( f[i].dezdz + dz*f[i].d2ezdz ) + + f[i].Ey0 + dx*( f[i].dEy0dx + dx*f[i].d2Ey0dx ) + + dy*( f[i].dEy0dy + dy*f[i].d2Ey0dy ) + + dz*( f[i].dEy0dz + dz*f[i].d2Ey0dz ) ); + ux += dt_2c *( f[i].Gx0 + dx*( f[i].dGx0dx + dx*f[i].d2Gx0dx ) + + dy*( f[i].dGx0dy + dy*f[i].d2Gx0dy ) + + dz*( f[i].dGx0dz + dz*f[i].d2Gx0dz ) ); + uy += dt_2c *( f[i].Gy0 + dx*( f[i].dGy0dx + dx*f[i].d2Gy0dx ) + + dy*( f[i].dGy0dy + dy*f[i].d2Gy0dy ) + + dz*( f[i].dGy0dz + dz*f[i].d2Gy0dz ) ); + uz += dt_2c *( f[i].Gz0 + dx*( f[i].dGz0dx + dx*f[i].d2Gz0dx ) + + dy*( f[i].dGz0dy + dy*f[i].d2Gz0dy ) + + dz*( f[i].dGz0dz + dz*f[i].d2Gz0dz ) ); + #else + // Half advance E + ux += qdt_2mc*( f[i].ex + dx*( f[i].dexdx + dx*f[i].d2exdx ) + + dy*( f[i].dexdy + dy*f[i].d2exdy ) + + dz*( f[i].dexdz + dz*f[i].d2exdz ) ); + uy += qdt_2mc*( f[i].ey + dx*( f[i].deydx + dx*f[i].d2eydx ) + + dy*( f[i].deydy + dy*f[i].d2eydy ) + + dz*( f[i].deydz + dz*f[i].d2eydz ) ); + uz += qdt_2mc*( f[i].ez + dx*( f[i].dezdx + dx*f[i].d2ezdx ) + + dy*( f[i].dezdy + dy*f[i].d2ezdy ) + + dz*( f[i].dezdz + dz*f[i].d2ezdz ) ); + #endif // Boris rotation - Interpolate B field - w5 = f[i].cbx; // + dx*f[i].dcbxdx; - w6 = f[i].cby; // + dy*f[i].dcbydy; - w7 = f[i].cbz; // + dz*f[i].dcbzdz; + w5 = f[i].cbx + dx*( f[i].dcbxdx + dx*f[i].d2cbxdx ) + + dy*( f[i].dcbxdy + dy*f[i].d2cbxdy ) + + dz*( f[i].dcbxdz + dz*f[i].d2cbxdz ); + w6 = f[i].cby + dx*( f[i].dcbydx + dx*f[i].d2cbydx ) + + dy*( f[i].dcbydy + dy*f[i].d2cbydy ) + + dz*( f[i].dcbydz + dz*f[i].d2cbydz ); + w7 = f[i].cbz + dx*( f[i].dcbzdx + dx*f[i].d2cbzdx ) + + dy*( f[i].dcbzdy + dy*f[i].d2cbzdy ) + + dz*( f[i].dcbzdz + dz*f[i].d2cbzdz ); +#endif +#endif // Boris rotation - curl scalars (0.5 in v0 for half rotate) and // kinetic energy computation. Note: gamma-1 = |u|^2 / (gamma+1) @@ -313,10 +378,13 @@ accumulate_hydro_p_kokkos( k_hydro_sv_t k_hydro_sv = Kokkos::Experimental::create_scatter_view(k_hydro); #ifdef VARIABLE_CHARGE - float c, qsp, msp, dt_2mc, dt_4mc, rV; + float c, qsp, msp, dt_2mc, dt_4mc, rV, r12V; #else - float c, qsp, msp, qdt_2mc, qdt_4mc, rV; + float c, qsp, msp, qdt_2mc, qdt_4mc, rV, r12V; #endif + + constexpr float one=1.0, two=2.0, three=3.0; + //int np, stride_10, stride_21, stride_43; //float dx, dy, dz, ux, uy, uz, w, vx, vy, vz, ke_mc; @@ -350,16 +418,22 @@ accumulate_hydro_p_kokkos( #else qdt_2mc = (qsp*sp->g->dt)/(2*msp*c); qdt_4mc = qdt_2mc / 2; +#endif +#ifdef EXTERNAL_FORCE + float dt_2c = (sp->g->dt)/(2*c); #endif rV = 1.0/(sp->g->dx*sp->g->dy*sp->g->dz); + r12V = rV/12.; const int np = sp->np; - const int stride_10 = VOXEL(1,0,0, sp->g->nx,sp->g->ny,sp->g->nz) - - VOXEL(0,0,0, sp->g->nx,sp->g->ny,sp->g->nz); - const int stride_21 = VOXEL(0,1,0, sp->g->nx,sp->g->ny,sp->g->nz) - - VOXEL(1,0,0, sp->g->nx,sp->g->ny,sp->g->nz); - const int stride_43 = VOXEL(0,0,1, sp->g->nx,sp->g->ny,sp->g->nz) - - VOXEL(1,1,0, sp->g->nx,sp->g->ny,sp->g->nz); + //const int stride_10 = VOXEL(1,0,0, sp->g->nx,sp->g->ny,sp->g->nz) - + // VOXEL(0,0,0, sp->g->nx,sp->g->ny,sp->g->nz); + //const int stride_21 = VOXEL(0,1,0, sp->g->nx,sp->g->ny,sp->g->nz) - + // VOXEL(1,0,0, sp->g->nx,sp->g->ny,sp->g->nz); + //const int stride_43 = VOXEL(0,0,1, sp->g->nx,sp->g->ny,sp->g->nz) - + // VOXEL(1,1,0, sp->g->nx,sp->g->ny,sp->g->nz); + const int sy = sp->g->sy; + const int sz = sp->g->sz; //for( n=0; n (0, np), @@ -381,39 +455,165 @@ accumulate_hydro_p_kokkos( #endif int ii = k_particles_i(p_index); - const float cbx = k_interp(ii, interpolator_var::cbx); - const float cby = k_interp(ii, interpolator_var::cby); - const float cbz = k_interp(ii, interpolator_var::cbz); - - const float ex = k_interp(ii, interpolator_var::ex); - const float ey = k_interp(ii, interpolator_var::ey); - const float ez = k_interp(ii, interpolator_var::ez); - - const float dexdy = k_interp(ii, interpolator_var::dexdy); - const float deydz = k_interp(ii, interpolator_var::deydz); - const float dezdx = k_interp(ii, interpolator_var::dezdx); - - const float dexdz = k_interp(ii, interpolator_var::dexdz); - const float deydx = k_interp(ii, interpolator_var::deydx); - const float dezdy = k_interp(ii, interpolator_var::dezdy); - - const float d2exdydz = k_interp(ii, interpolator_var::d2exdydz); - const float d2eydzdx = k_interp(ii, interpolator_var::d2eydzdx); - const float d2ezdxdy = k_interp(ii, interpolator_var::d2ezdxdy); - - const float dcbxdx = k_interp(ii, interpolator_var::dcbxdx); - const float dcbydy = k_interp(ii, interpolator_var::dcbydy); - const float dcbzdz = k_interp(ii, interpolator_var::dcbzdz); - + #define f_ex k_interp(ii, interpolator_var::ex) + #define f_dexdx k_interp(ii, interpolator_var::dexdx) + #define f_dexdy k_interp(ii, interpolator_var::dexdy) + #define f_dexdz k_interp(ii, interpolator_var::dexdz) + #define f_d2exdx k_interp(ii, interpolator_var::d2exdx) + #define f_d2exdy k_interp(ii, interpolator_var::d2exdy) + #define f_d2exdz k_interp(ii, interpolator_var::d2exdz) + #define f_ey k_interp(ii, interpolator_var::ey) + #define f_deydx k_interp(ii, interpolator_var::deydx) + #define f_deydy k_interp(ii, interpolator_var::deydy) + #define f_deydz k_interp(ii, interpolator_var::deydz) + #define f_d2eydx k_interp(ii, interpolator_var::d2eydx) + #define f_d2eydy k_interp(ii, interpolator_var::d2eydy) + #define f_d2eydz k_interp(ii, interpolator_var::d2eydz) + #define f_ez k_interp(ii, interpolator_var::ez) + #define f_dezdx k_interp(ii, interpolator_var::dezdx) + #define f_dezdy k_interp(ii, interpolator_var::dezdy) + #define f_dezdz k_interp(ii, interpolator_var::dezdz) + #define f_d2ezdx k_interp(ii, interpolator_var::d2ezdx) + #define f_d2ezdy k_interp(ii, interpolator_var::d2ezdy) + #define f_d2ezdz k_interp(ii, interpolator_var::d2ezdz) + #define f_cbx k_interp(ii, interpolator_var::cbx) + #define f_dcbxdx k_interp(ii, interpolator_var::dcbxdx) + #define f_dcbxdy k_interp(ii, interpolator_var::dcbxdy) + #define f_dcbxdz k_interp(ii, interpolator_var::dcbxdz) + #define f_d2cbxdx k_interp(ii, interpolator_var::d2cbxdx) + #define f_d2cbxdy k_interp(ii, interpolator_var::d2cbxdy) + #define f_d2cbxdz k_interp(ii, interpolator_var::d2cbxdz) + #define f_cby k_interp(ii, interpolator_var::cby) + #define f_dcbydx k_interp(ii, interpolator_var::dcbydx) + #define f_dcbydy k_interp(ii, interpolator_var::dcbydy) + #define f_dcbydz k_interp(ii, interpolator_var::dcbydz) + #define f_d2cbydx k_interp(ii, interpolator_var::d2cbydx) + #define f_d2cbydy k_interp(ii, interpolator_var::d2cbydy) + #define f_d2cbydz k_interp(ii, interpolator_var::d2cbydz) + #define f_cbz k_interp(ii, interpolator_var::cbz) + #define f_dcbzdx k_interp(ii, interpolator_var::dcbzdx) + #define f_dcbzdy k_interp(ii, interpolator_var::dcbzdy) + #define f_dcbzdz k_interp(ii, interpolator_var::dcbzdz) + #define f_d2cbzdx k_interp(ii, interpolator_var::d2cbzdx) + #define f_d2cbzdy k_interp(ii, interpolator_var::d2cbzdy) + #define f_d2cbzdz k_interp(ii, interpolator_var::d2cbzdz) + + #define f_Ex0 k_interp(ii, interpolator_var::Ex0) + #define f_dEx0dx k_interp(ii, interpolator_var::dEx0dx) + #define f_dEx0dy k_interp(ii, interpolator_var::dEx0dy) + #define f_dEx0dz k_interp(ii, interpolator_var::dEx0dz) + #define f_d2Ex0dx k_interp(ii, interpolator_var::d2Ex0dx) + #define f_d2Ex0dy k_interp(ii, interpolator_var::d2Ex0dy) + #define f_d2Ex0dz k_interp(ii, interpolator_var::d2Ex0dz) + #define f_Ey0 k_interp(ii, interpolator_var::Ey0) + #define f_dEy0dx k_interp(ii, interpolator_var::dEy0dx) + #define f_dEy0dy k_interp(ii, interpolator_var::dEy0dy) + #define f_dEy0dz k_interp(ii, interpolator_var::dEy0dz) + #define f_d2Ey0dx k_interp(ii, interpolator_var::d2Ey0dx) + #define f_d2Ey0dy k_interp(ii, interpolator_var::d2Ey0dy) + #define f_d2Ey0dz k_interp(ii, interpolator_var::d2Ey0dz) + #define f_Ez0 k_interp(ii, interpolator_var::Ez0) + #define f_dEz0dx k_interp(ii, interpolator_var::dEz0dx) + #define f_dEz0dy k_interp(ii, interpolator_var::dEz0dy) + #define f_dEz0dz k_interp(ii, interpolator_var::dEz0dz) + #define f_d2Ez0dx k_interp(ii, interpolator_var::d2Ez0dx) + #define f_d2Ez0dy k_interp(ii, interpolator_var::d2Ez0dy) + #define f_d2Ez0dz k_interp(ii, interpolator_var::d2Ez0dz) + + #define f_Gx0 k_interp(ii, interpolator_var::Gx0) + #define f_dGx0dx k_interp(ii, interpolator_var::dGx0dx) + #define f_dGx0dy k_interp(ii, interpolator_var::dGx0dy) + #define f_dGx0dz k_interp(ii, interpolator_var::dGx0dz) + #define f_d2Gx0dx k_interp(ii, interpolator_var::d2Gx0dx) + #define f_d2Gx0dy k_interp(ii, interpolator_var::d2Gx0dy) + #define f_d2Gx0dz k_interp(ii, interpolator_var::d2Gx0dz) + #define f_Gy0 k_interp(ii, interpolator_var::Gy0) + #define f_dGy0dx k_interp(ii, interpolator_var::dGy0dx) + #define f_dGy0dy k_interp(ii, interpolator_var::dGy0dy) + #define f_dGy0dz k_interp(ii, interpolator_var::dGy0dz) + #define f_d2Gy0dx k_interp(ii, interpolator_var::d2Gy0dx) + #define f_d2Gy0dy k_interp(ii, interpolator_var::d2Gy0dy) + #define f_d2Gy0dz k_interp(ii, interpolator_var::d2Gy0dz) + #define f_Gz0 k_interp(ii, interpolator_var::Gz0) + #define f_dGz0dx k_interp(ii, interpolator_var::dGz0dx) + #define f_dGz0dy k_interp(ii, interpolator_var::dGz0dy) + #define f_dGz0dz k_interp(ii, interpolator_var::dGz0dz) + #define f_d2Gz0dx k_interp(ii, interpolator_var::d2Gz0dx) + #define f_d2Gz0dy k_interp(ii, interpolator_var::d2Gz0dy) + #define f_d2Gz0dz k_interp(ii, interpolator_var::d2Gz0dz) + +#ifdef SHAPE_NGP + #ifdef EXTERNAL_FORCE + // Half advance E, E0, G0 + ux += qdt_2mc * ( f_ex + f_Ex0) + dt_2c * f_Gx0; + uy += qdt_2mc * ( f_ey + f_Ey0) + dt_2c * f_Gy0; + uz += qdt_2mc * ( f_ez + f_Ez0) + dt_2c * f_Gz0; + #else // Half advance E - ux += qdt_2mc*((ex)); //+dy*dexdy) + dz*(dexdz+dy*d2exdydz)); - uy += qdt_2mc*((ey)); //+dz*deydz) + dx*(deydx+dz*d2eydzdx)); - uz += qdt_2mc*((ez)); //+dx*dezdx) + dy*(dezdy+dx*d2ezdxdy)); - + ux += qdt_2mc * f_ex; + uy += qdt_2mc * f_ey; + uz += qdt_2mc * f_ez; + #endif + // Boris rotation - Interpolate B field + float w5 = f_cbx; + float w6 = f_cby; + float w7 = f_cbz; +#else +#ifdef SHAPE_QS + #ifdef EXTERNAL_FORCE + // Half advance E, E0 + ux += qdt_2mc*( f_ex + dx*( f_dexdx + dx*f_d2exdx ) + + dy*( f_dexdy + dy*f_d2exdy ) + + dz*( f_dexdz + dz*f_d2exdz ) + + f_Ex0 + dx*( f_dEx0dx + dx*f_d2Ex0dx ) + + dy*( f_dEx0dy + dy*f_d2Ex0dy ) + + dz*( f_dEx0dz + dz*f_d2Ex0dz ) ); + uy += qdt_2mc*( f_ey + dx*( f_deydx + dx*f_d2eydx ) + + dy*( f_deydy + dy*f_d2eydy ) + + dz*( f_deydz + dz*f_d2eydz ) + + f_Ey0 + dx*( f_dEy0dx + dx*f_d2Ey0dx ) + + dy*( f_dEy0dy + dy*f_d2Ey0dy ) + + dz*( f_dEy0dz + dz*f_d2Ey0dz ) ); + uz += qdt_2mc*( f_ez + dx*( f_dezdx + dx*f_d2ezdx ) + + dy*( f_dezdy + dy*f_d2ezdy ) + + dz*( f_dezdz + dz*f_d2ezdz ) + + f_Ez0 + dx*( f_dEz0dx + dx*f_d2Ez0dx ) + + dy*( f_dEz0dy + dy*f_d2Ez0dy ) + + dz*( f_dEz0dz + dz*f_d2Ez0dz ) ); + // Half advance G0 + ux += dt_2c *( f_Gx0 + dx*( f_dGx0dx + dx*f_d2Gx0dx ) + + dy*( f_dGx0dy + dy*f_d2Gx0dy ) + + dz*( f_dGx0dz + dz*f_d2Gx0dz ) ); + uy += dt_2c *( f_Gy0 + dx*( f_dGy0dx + dx*f_d2Gy0dx ) + + dy*( f_dGy0dy + dy*f_d2Gy0dy ) + + dz*( f_dGy0dz + dz*f_d2Gy0dz ) ); + uz += dt_2c *( f_Gz0 + dx*( f_dGz0dx + dx*f_d2Gz0dx ) + + dy*( f_dGz0dy + dy*f_d2Gz0dy ) + + dz*( f_dGz0dz + dz*f_d2Gz0dz ) ); + #else + // Half advance E + ux += qdt_2mc*( f_ex + dx*( f_dexdx + dx*f_d2exdx ) + + dy*( f_dexdy + dy*f_d2exdy ) + + dz*( f_dexdz + dz*f_d2exdz ) ); + uy += qdt_2mc*( f_ey + dx*( f_deydx + dx*f_d2eydx ) + + dy*( f_deydy + dy*f_d2eydy ) + + dz*( f_deydz + dz*f_d2eydz ) ); + uz += qdt_2mc*( f_ez + dx*( f_dezdx + dx*f_d2ezdx ) + + dy*( f_dezdy + dy*f_d2ezdy ) + + dz*( f_dezdz + dz*f_d2ezdz ) ); + #endif // Boris rotation - Interpolate B field - float w5 = cbx; // + dx*dcbxdx; - float w6 = cby; // + dy*dcbydy; - float w7 = cbz; // + dz*dcbzdz; + float w5 = f_cbx + dx*( f_dcbxdx + dx*f_d2cbxdx ) + + dy*( f_dcbxdy + dy*f_d2cbxdy ) + + dz*( f_dcbxdz + dz*f_d2cbxdz ); + float w6 = f_cby + dx*( f_dcbydx + dx*f_d2cbydx ) + + dy*( f_dcbydy + dy*f_d2cbydy ) + + dz*( f_dcbydz + dz*f_d2cbydz ); + float w7 = f_cbz + dx*( f_dcbzdx + dx*f_d2cbzdx ) + + dy*( f_dcbzdy + dy*f_d2cbzdy ) + + dz*( f_dcbzdz + dz*f_d2cbzdz ); +#endif +#endif // Boris rotation - curl scalars (0.5 in v0 for half rotate) and // kinetic energy computation. Note: gamma-1 = |u|^2 / (gamma+1) @@ -469,8 +669,22 @@ accumulate_hydro_p_kokkos( //w2 *= dz; // w2 = (1/8)(w/V)(1-x)(1+y)(1-z) = (w/V) trilin_6 *Done //w3 *= dz; // w3 = (1/8)(w/V)(1+x)(1+y)(1-z) = (w/V) trilin_7 *Done - // Hybrid-VPIC NGP shape + // Stencil weights omit species' electric charge. + // Charge is accounted for in ACCUM_HYDRO below, in order to distinguish + // electric-charge outputs (j, rho) from mass-charge outputs (p, Tij) +#ifdef SHAPE_NGP w0 = w*rV; +#else +#ifdef SHAPE_QS + w0 = (w*r12V) * two*( three - dx*dx - dy*dy - dz*dz ); + float wx = (w*r12V) * ( dx + one )*( dx + one ); + float wy = (w*r12V) * ( dy + one )*( dy + one ); + float wz = (w*r12V) * ( dz + one )*( dz + one ); + float wmx = (w*r12V) * ( dx - one )*( dx - one ); + float wmy = (w*r12V) * ( dy - one )*( dy - one ); + float wmz = (w*r12V) * ( dz - one )*( dz - one ); +#endif +#endif // TODO: This could easily be a loop? @@ -536,9 +750,9 @@ accumulate_hydro_p_kokkos( // TODO: this serial adding to try and save adds is a bit sad // TODO: This is somehow going out of bounds right now - const int i0 = ii; - ACCUM_HYDRO(w0, i0); // Cell i,j,k - +// const int i0 = ii; +// ACCUM_HYDRO(w0, i0); // Cell i,j,k +// // const int i1 = i0 + stride_10; // ACCUM_HYDRO(w1, i1); // Cell i+1,j,k // @@ -560,6 +774,20 @@ accumulate_hydro_p_kokkos( // const int i7 = i6 + stride_10; // ACCUM_HYDRO(w7, i7); // Cell i+1,j+1,k+1 +#ifdef SHAPE_NGP + ACCUM_HYDRO(w0, ii); // Cell i,j,k +#else +#ifdef SHAPE_QS + ACCUM_HYDRO(w0, ii ); // Cell i,j,k + ACCUM_HYDRO(wx, ii + 1); // Cell i+1,j,k + ACCUM_HYDRO(wy, ii + sy); // Cell i,j+1,k + ACCUM_HYDRO(wz, ii + sz); // Cell i,j,k+1 + ACCUM_HYDRO(wmx, ii - 1); // Cell i-1,j,k + ACCUM_HYDRO(wmy, ii - sy); // Cell i,j-1,k + ACCUM_HYDRO(wmz, ii - sz); // Cell i,j,k-1 +#endif +#endif + # undef ACCUM_HYDRO }); @@ -583,5 +811,92 @@ accumulate_hydro_p_kokkos( Kokkos::Experimental::contribute(k_hydro, k_hydro_sv); Kokkos::fence(); // TODO: Check if I need this to block the contribute + #undef f_ex + #undef f_dexdx + #undef f_dexdy + #undef f_dexdz + #undef f_d2exdx + #undef f_d2exdy + #undef f_d2exdz + #undef f_ey + #undef f_deydx + #undef f_deydy + #undef f_deydz + #undef f_d2eydx + #undef f_d2eydy + #undef f_d2eydz + #undef f_ez + #undef f_dezdx + #undef f_dezdy + #undef f_dezdz + #undef f_d2ezdx + #undef f_d2ezdy + #undef f_d2ezdz + #undef f_cbx + #undef f_dcbxdx + #undef f_dcbxdy + #undef f_dcbxdz + #undef f_d2cbxdx + #undef f_d2cbxdy + #undef f_d2cbxdz + #undef f_cby + #undef f_dcbydx + #undef f_dcbydy + #undef f_dcbydz + #undef f_d2cbydx + #undef f_d2cbydy + #undef f_d2cbydz + #undef f_cbz + #undef f_dcbzdx + #undef f_dcbzdy + #undef f_dcbzdz + #undef f_d2cbzdx + #undef f_d2cbzdy + #undef f_d2cbzdz + + #undef f_Ex0 + #undef f_dEx0dx + #undef f_dEx0dy + #undef f_dEx0dz + #undef f_d2Ex0dx + #undef f_d2Ex0dy + #undef f_d2Ex0dz + #undef f_Ey0 + #undef f_dEy0dx + #undef f_dEy0dy + #undef f_dEy0dz + #undef f_d2Ey0dx + #undef f_d2Ey0dy + #undef f_d2Ey0dz + #undef f_Ez0 + #undef f_dEz0dx + #undef f_dEz0dy + #undef f_dEz0dz + #undef f_d2Ez0dx + #undef f_d2Ez0dy + #undef f_d2Ez0dz + + #undef f_Gx0 + #undef f_dGx0dx + #undef f_dGx0dy + #undef f_dGx0dz + #undef f_d2Gx0dx + #undef f_d2Gx0dy + #undef f_d2Gx0dz + #undef f_Gy0 + #undef f_dGy0dx + #undef f_dGy0dy + #undef f_dGy0dz + #undef f_d2Gy0dx + #undef f_d2Gy0dy + #undef f_d2Gy0dz + #undef f_Gz0 + #undef f_dGz0dx + #undef f_dGz0dy + #undef f_dGz0dz + #undef f_d2Gz0dx + #undef f_d2Gz0dy + #undef f_d2Gz0dz + // Perform debug printing } diff --git a/src/species_advance/standard/rho_p.cc b/src/species_advance/standard/rho_p.cc index b49dfa6c..8863266d 100644 --- a/src/species_advance/standard/rho_p.cc +++ b/src/species_advance/standard/rho_p.cc @@ -32,13 +32,21 @@ accumulate_rho_p( /**/ field_array_t * RESTRICT fa, /**/ field_t * RESTRICT ALIGNED(128) f = fa->f; const particle_t * RESTRICT ALIGNED(128) p = sp->p; - const float q_8V = sp->q*sp->g->r8V, one=1.0; + const float q_8V = sp->q*sp->g->r8V; + const float q_V = sp->q*(sp->g->rdx*sp->g->rdy*sp->g->rdz); + const float q_12V = q_V * 1./12.; const int np = sp->np; const int sy = sp->g->sy; const int sz = sp->g->sz; + constexpr float three = 3.; + constexpr float two = 2.; + constexpr float one = 1.; + # if 1 - float ux, uy, uz, w3, w4, w5, w6, w7, dz; + float ux, uy, uz; //, w3, w4, w5, w6, w7, dz; + float dx, dy, dz; + float q, w0, wx, wy, wz, wmx, wmy, wmz; # else using namespace v4; v4float q, wl, wh, rl, rh; @@ -61,6 +69,7 @@ accumulate_rho_p( /**/ field_array_t * RESTRICT fa, ux = p[n].ux; uy = p[n].uy; uz = p[n].uz; + v = p[n].i; // Disable relativistic gamma for Hybrid PIC //w3 = one/sqrtf(one + (ux*ux+ (uy*uy + uz*uz))); @@ -68,9 +77,6 @@ accumulate_rho_p( /**/ field_array_t * RESTRICT fa, //uy *= w3; //uz *= w3; - v = p[n].i; - w7 = p[n].w*q_8V*8.0; - // Compute the trilinear weights // Though the PPE should have hardware fma/fmaf support, it was // measured to be more efficient _not_ to use it here. (Maybe the @@ -89,10 +95,66 @@ accumulate_rho_p( /**/ field_array_t * RESTRICT fa, // Reduce the particle charge to rhof - f[v ].jfx += w7*ux; - f[v ].jfy += w7*uy; - f[v ].jfz += w7*uz; - f[v ].rhof+= w7; +#ifdef SHAPE_NGP + w0 = p[n].w * q_V; + + f[v ].jfx += w0*ux; + f[v ].jfy += w0*uy; + f[v ].jfz += w0*uz; + f[v ].rhof+= w0; +#else +#ifdef SHAPE_QS + + dx = p[n].dx; + dy = p[n].dy; + dz = p[n].dz; + + q = p[n].w * q_12V; + w0 = q*two*( three - dx*dx - dy*dy - dz*dz ); + wx = q*( dx + one )*( dx + one ); + wy = q*( dy + one )*( dy + one ); + wz = q*( dz + one )*( dz + one ); + wmx = q*( dx - one )*( dx - one ); + wmy = q*( dy - one )*( dy - one ); + wmz = q*( dz - one )*( dz - one ); + + f[v ].jfx += w0*ux; + f[v ].jfy += w0*uy; + f[v ].jfz += w0*uz; + f[v ].rhof += w0; + + f[v+1 ].jfx += wx*ux; + f[v+1 ].jfy += wx*uy; + f[v+1 ].jfz += wx*uz; + f[v+1 ].rhof += wx; + + f[v+sy].jfx += wy*ux; + f[v+sy].jfy += wy*uy; + f[v+sy].jfz += wy*uz; + f[v+sy].rhof += wy; + + f[v+sz].jfx += wz*ux; + f[v+sz].jfy += wz*uy; + f[v+sz].jfz += wz*uz; + f[v+sz].rhof += wz; + + f[v-1 ].jfx += wmx*ux; + f[v-1 ].jfy += wmx*uy; + f[v-1 ].jfz += wmx*uz; + f[v-1 ].rhof += wmx; + + f[v-sy].jfx += wmy*ux; + f[v-sy].jfy += wmy*uy; + f[v-sy].jfz += wmy*uz; + f[v-sy].rhof += wmy; + + f[v-sz].jfx += wmz*ux; + f[v-sz].jfy += wmz*uy; + f[v-sz].jfz += wmz*uz; + f[v-sz].rhof += wmz; + +#endif // defined(SHAPE_QS) +#endif // defined(SHAPE_NGP) // ; f[v +1].rhof += w1; //f[v +sy].rhof += w2; f[v +sy+1].rhof += w3; @@ -480,9 +542,15 @@ k_accumulate_rho_p( /**/ field_array_t * RESTRICT fa, k_particles_i_t kparticles_i = sp->k_p_i_d; const float q_8V = (sp->q)*(sp->g->r8V); + const float q_V = sp->q*(sp->g->rdx*sp->g->rdy*sp->g->rdz); + const float q_12V = q_V * 1./12.; const int np = sp->np; const int sy = sp->g->sy; const int sz = sp->g->sz; + + constexpr float three = 3.; + constexpr float two = 2.; + constexpr float one = 1.; /* float sums[sp->g->nv]; Kokkos::parallel_reduce("accumulate_rho_p", Kokkos::RangePolicy<>(0, np), accum_rho_p_reduce(kfield, kparticles, kparticles_i, sy, sz, q_8V, np, sp->g->nv), sums); @@ -528,10 +596,13 @@ k_accumulate_rho_p( /**/ field_array_t * RESTRICT fa, // Hybrid int ii = kparticles_i(n); + float dx = kparticles(n, particle_var::dx); + float dy = kparticles(n, particle_var::dy); + float dz = kparticles(n, particle_var::dz); float ux = kparticles(n, particle_var::ux); float uy = kparticles(n, particle_var::uy); float uz = kparticles(n, particle_var::uz); - float q_V = q_8V*8.0 * kparticles(n, particle_var::w); + float p_w = kparticles(n, particle_var::w); auto scatter_view_access = scatter_view.access(); @@ -553,11 +624,60 @@ k_accumulate_rho_p( /**/ field_array_t * RESTRICT fa, // Kokkos::atomic_add(&kfield(v+sz+sy, field_var::rhof), w6); // Kokkos::atomic_add(&kfield(v+sz+sy+1, field_var::rhof), w7); +#ifdef SHAPE_NGP // Hybrid, nearest-grid-point shape - scatter_view_access(ii, field_var::jfx) += q_V * ux; - scatter_view_access(ii, field_var::jfy) += q_V * uy; - scatter_view_access(ii, field_var::jfz) += q_V * uz; - scatter_view_access(ii, field_var::rhof) += q_V; + float w0 = q_V * p_w; + + scatter_view_access(ii, field_var::jfx) += w0 * ux; + scatter_view_access(ii, field_var::jfy) += w0 * uy; + scatter_view_access(ii, field_var::jfz) += w0 * uz; + scatter_view_access(ii, field_var::rhof) += w0; +#else +#ifdef SHAPE_QS + float w0 = (q_12V*p_w) * two*( three - dx*dx - dy*dy - dz*dz ); + float wx = (q_12V*p_w) * ( dx + one )*( dx + one ); + float wy = (q_12V*p_w) * ( dy + one )*( dy + one ); + float wz = (q_12V*p_w) * ( dz + one )*( dz + one ); + float wmx = (q_12V*p_w) * ( dx - one )*( dx - one ); + float wmy = (q_12V*p_w) * ( dy - one )*( dy - one ); + float wmz = (q_12V*p_w) * ( dz - one )*( dz - one ); + + scatter_view_access(ii, field_var::jfx) += w0 * ux; + scatter_view_access(ii, field_var::jfy) += w0 * uy; + scatter_view_access(ii, field_var::jfz) += w0 * uz; + scatter_view_access(ii, field_var::rhof) += w0; + + scatter_view_access(ii+1, field_var::jfx) += wx * ux; + scatter_view_access(ii+1, field_var::jfy) += wx * uy; + scatter_view_access(ii+1, field_var::jfz) += wx * uz; + scatter_view_access(ii+1, field_var::rhof) += wx; + + scatter_view_access(ii+sy, field_var::jfx) += wy * ux; + scatter_view_access(ii+sy, field_var::jfy) += wy * uy; + scatter_view_access(ii+sy, field_var::jfz) += wy * uz; + scatter_view_access(ii+sy, field_var::rhof) += wy; + + scatter_view_access(ii+sz, field_var::jfx) += wz * ux; + scatter_view_access(ii+sz, field_var::jfy) += wz * uy; + scatter_view_access(ii+sz, field_var::jfz) += wz * uz; + scatter_view_access(ii+sz, field_var::rhof) += wz; + + scatter_view_access(ii-1, field_var::jfx) += wmx * ux; + scatter_view_access(ii-1, field_var::jfy) += wmx * uy; + scatter_view_access(ii-1, field_var::jfz) += wmx * uz; + scatter_view_access(ii-1, field_var::rhof) += wmx; + + scatter_view_access(ii-sy, field_var::jfx) += wmy * ux; + scatter_view_access(ii-sy, field_var::jfy) += wmy * uy; + scatter_view_access(ii-sy, field_var::jfz) += wmy * uz; + scatter_view_access(ii-sy, field_var::rhof) += wmy; + + scatter_view_access(ii-sz, field_var::jfx) += wmz * ux; + scatter_view_access(ii-sz, field_var::jfy) += wmz * uy; + scatter_view_access(ii-sz, field_var::jfz) += wmz * uz; + scatter_view_access(ii-sz, field_var::rhof) += wmz; +#endif // defined(SHAPE_QS) +#endif // defined(SHAPE_NGP) }); Kokkos::Experimental::contribute(kfield, scatter_view); diff --git a/src/species_advance/standard/spa_private.h b/src/species_advance/standard/spa_private.h index a2a11e67..db0709ed 100644 --- a/src/species_advance/standard/spa_private.h +++ b/src/species_advance/standard/spa_private.h @@ -57,8 +57,12 @@ typedef struct center_p_pipeline_args { MEM_PTR( const interpolator_t, 128 ) f0; // Interpolator array float qdt_2mc; // Particle/field coupling int np; // Number of particles - +#ifdef EXTERNAL_FORCE + float dt_2c; // Particle/field coupling + PAD_STRUCT( 2*SIZEOF_MEM_PTR + 2*sizeof(float) + sizeof(int) ) +#else PAD_STRUCT( 2*SIZEOF_MEM_PTR + sizeof(float) + sizeof(int) ) +#endif } center_p_pipeline_args_t; @@ -76,8 +80,12 @@ typedef struct energy_p_pipeline_args { float qdt_2mc; // Particle/field coupling float msp; // Species particle rest mass int np; // Number of particles - +#ifdef EXTERNAL_FORCE + float dt_2c; // Particle/field coupling + PAD_STRUCT( 3*SIZEOF_MEM_PTR + 3*sizeof(float) + sizeof(int) ) +#else PAD_STRUCT( 3*SIZEOF_MEM_PTR + 2*sizeof(float) + sizeof(int) ) +#endif } energy_p_pipeline_args_t; diff --git a/src/species_advance/standard/uncenter_p.cc b/src/species_advance/standard/uncenter_p.cc index a37d5112..a59d399a 100644 --- a/src/species_advance/standard/uncenter_p.cc +++ b/src/species_advance/standard/uncenter_p.cc @@ -6,7 +6,8 @@ void uncenter_p_kokkos( k_particles_i_t& k_particles_i, k_interpolator_t& k_interp, int np, - float qdt_2mc_c + float qdt_2mc_c, + float dt_2c ) { #ifndef VARIABLE_CHARGE @@ -15,6 +16,7 @@ void uncenter_p_kokkos( #else const float dt_2mc_c = qdt_2mc_c; #endif + const float minus_dt_2c = -dt_2c; // For backwards half advance const float one = 1.; const float one_third = 1./3.; const float two_fifteenths = 2./15.; @@ -32,28 +34,92 @@ void uncenter_p_kokkos( #define pii k_particles_i(p_index) // Interpolator Defines (f->x) - #define f_cbx k_interp(ii, interpolator_var::cbx) - #define f_cby k_interp(ii, interpolator_var::cby) - #define f_cbz k_interp(ii, interpolator_var::cbz) - #define f_ex k_interp(ii, interpolator_var::ex) - #define f_ey k_interp(ii, interpolator_var::ey) - #define f_ez k_interp(ii, interpolator_var::ez) - + #define f_ex k_interp(ii, interpolator_var::ex) + #define f_dexdx k_interp(ii, interpolator_var::dexdx) #define f_dexdy k_interp(ii, interpolator_var::dexdy) #define f_dexdz k_interp(ii, interpolator_var::dexdz) - - #define f_d2exdydz k_interp(ii, interpolator_var::d2exdydz) + #define f_d2exdx k_interp(ii, interpolator_var::d2exdx) + #define f_d2exdy k_interp(ii, interpolator_var::d2exdy) + #define f_d2exdz k_interp(ii, interpolator_var::d2exdz) + #define f_ey k_interp(ii, interpolator_var::ey) #define f_deydx k_interp(ii, interpolator_var::deydx) + #define f_deydy k_interp(ii, interpolator_var::deydy) #define f_deydz k_interp(ii, interpolator_var::deydz) - - #define f_d2eydzdx k_interp(ii, interpolator_var::d2eydzdx) + #define f_d2eydx k_interp(ii, interpolator_var::d2eydx) + #define f_d2eydy k_interp(ii, interpolator_var::d2eydy) + #define f_d2eydz k_interp(ii, interpolator_var::d2eydz) + #define f_ez k_interp(ii, interpolator_var::ez) #define f_dezdx k_interp(ii, interpolator_var::dezdx) #define f_dezdy k_interp(ii, interpolator_var::dezdy) - - #define f_d2ezdxdy k_interp(ii, interpolator_var::d2ezdxdy) + #define f_dezdz k_interp(ii, interpolator_var::dezdz) + #define f_d2ezdx k_interp(ii, interpolator_var::d2ezdx) + #define f_d2ezdy k_interp(ii, interpolator_var::d2ezdy) + #define f_d2ezdz k_interp(ii, interpolator_var::d2ezdz) + #define f_cbx k_interp(ii, interpolator_var::cbx) #define f_dcbxdx k_interp(ii, interpolator_var::dcbxdx) + #define f_dcbxdy k_interp(ii, interpolator_var::dcbxdy) + #define f_dcbxdz k_interp(ii, interpolator_var::dcbxdz) + #define f_d2cbxdx k_interp(ii, interpolator_var::d2cbxdx) + #define f_d2cbxdy k_interp(ii, interpolator_var::d2cbxdy) + #define f_d2cbxdz k_interp(ii, interpolator_var::d2cbxdz) + #define f_cby k_interp(ii, interpolator_var::cby) + #define f_dcbydx k_interp(ii, interpolator_var::dcbydx) #define f_dcbydy k_interp(ii, interpolator_var::dcbydy) + #define f_dcbydz k_interp(ii, interpolator_var::dcbydz) + #define f_d2cbydx k_interp(ii, interpolator_var::d2cbydx) + #define f_d2cbydy k_interp(ii, interpolator_var::d2cbydy) + #define f_d2cbydz k_interp(ii, interpolator_var::d2cbydz) + #define f_cbz k_interp(ii, interpolator_var::cbz) + #define f_dcbzdx k_interp(ii, interpolator_var::dcbzdx) + #define f_dcbzdy k_interp(ii, interpolator_var::dcbzdy) #define f_dcbzdz k_interp(ii, interpolator_var::dcbzdz) + #define f_d2cbzdx k_interp(ii, interpolator_var::d2cbzdx) + #define f_d2cbzdy k_interp(ii, interpolator_var::d2cbzdy) + #define f_d2cbzdz k_interp(ii, interpolator_var::d2cbzdz) + + #define f_Ex0 k_interp(ii, interpolator_var::Ex0) + #define f_dEx0dx k_interp(ii, interpolator_var::dEx0dx) + #define f_dEx0dy k_interp(ii, interpolator_var::dEx0dy) + #define f_dEx0dz k_interp(ii, interpolator_var::dEx0dz) + #define f_d2Ex0dx k_interp(ii, interpolator_var::d2Ex0dx) + #define f_d2Ex0dy k_interp(ii, interpolator_var::d2Ex0dy) + #define f_d2Ex0dz k_interp(ii, interpolator_var::d2Ex0dz) + #define f_Ey0 k_interp(ii, interpolator_var::Ey0) + #define f_dEy0dx k_interp(ii, interpolator_var::dEy0dx) + #define f_dEy0dy k_interp(ii, interpolator_var::dEy0dy) + #define f_dEy0dz k_interp(ii, interpolator_var::dEy0dz) + #define f_d2Ey0dx k_interp(ii, interpolator_var::d2Ey0dx) + #define f_d2Ey0dy k_interp(ii, interpolator_var::d2Ey0dy) + #define f_d2Ey0dz k_interp(ii, interpolator_var::d2Ey0dz) + #define f_Ez0 k_interp(ii, interpolator_var::Ez0) + #define f_dEz0dx k_interp(ii, interpolator_var::dEz0dx) + #define f_dEz0dy k_interp(ii, interpolator_var::dEz0dy) + #define f_dEz0dz k_interp(ii, interpolator_var::dEz0dz) + #define f_d2Ez0dx k_interp(ii, interpolator_var::d2Ez0dx) + #define f_d2Ez0dy k_interp(ii, interpolator_var::d2Ez0dy) + #define f_d2Ez0dz k_interp(ii, interpolator_var::d2Ez0dz) + + #define f_Gx0 k_interp(ii, interpolator_var::Gx0) + #define f_dGx0dx k_interp(ii, interpolator_var::dGx0dx) + #define f_dGx0dy k_interp(ii, interpolator_var::dGx0dy) + #define f_dGx0dz k_interp(ii, interpolator_var::dGx0dz) + #define f_d2Gx0dx k_interp(ii, interpolator_var::d2Gx0dx) + #define f_d2Gx0dy k_interp(ii, interpolator_var::d2Gx0dy) + #define f_d2Gx0dz k_interp(ii, interpolator_var::d2Gx0dz) + #define f_Gy0 k_interp(ii, interpolator_var::Gy0) + #define f_dGy0dx k_interp(ii, interpolator_var::dGy0dx) + #define f_dGy0dy k_interp(ii, interpolator_var::dGy0dy) + #define f_dGy0dz k_interp(ii, interpolator_var::dGy0dz) + #define f_d2Gy0dx k_interp(ii, interpolator_var::d2Gy0dx) + #define f_d2Gy0dy k_interp(ii, interpolator_var::d2Gy0dy) + #define f_d2Gy0dz k_interp(ii, interpolator_var::d2Gy0dz) + #define f_Gz0 k_interp(ii, interpolator_var::Gz0) + #define f_dGz0dx k_interp(ii, interpolator_var::dGz0dx) + #define f_dGz0dy k_interp(ii, interpolator_var::dGz0dy) + #define f_dGz0dz k_interp(ii, interpolator_var::dGz0dz) + #define f_d2Gz0dx k_interp(ii, interpolator_var::d2Gz0dx) + #define f_d2Gz0dy k_interp(ii, interpolator_var::d2Gz0dy) + #define f_d2Gz0dz k_interp(ii, interpolator_var::d2Gz0dz) // this goes to np using p_index @@ -69,12 +135,71 @@ void uncenter_p_kokkos( float qdt_4mc = 0.5*qdt_2mc; // For backwards half rotation #endif - hax = qdt_2mc*( ( f_ex ) ); - hay = qdt_2mc*( ( f_ey ) ); - haz = qdt_2mc*( ( f_ez ) ); +#ifdef SHAPE_NGP + #ifdef EXTERNAL_FORCE + hax = qdt_2mc*( f_ex + f_Ex0 ) + minus_dt_2c * f_Gx0; + hay = qdt_2mc*( f_ey + f_Ey0 ) + minus_dt_2c * f_Gy0; + haz = qdt_2mc*( f_ez + f_Ez0 ) + minus_dt_2c * f_Gz0; + #else + hax = qdt_2mc*( f_ex ); + hay = qdt_2mc*( f_ey ); + haz = qdt_2mc*( f_ez ); + #endif l_cbx = f_cbx;// + p_dx*f_dcbxdx; // Interpolate B l_cby = f_cby;// + p_dy*f_dcbydy; l_cbz = f_cbz;// + p_dz*f_dcbzdz; +#else +#ifdef SHAPE_QS + #ifdef EXTERNAL_FORCE + hax = qdt_2mc*( f_ex + p_dx*( f_dexdx + p_dx*f_d2exdx ) // Interpolate E, E0 + + p_dy*( f_dexdy + p_dy*f_d2exdy ) + + p_dz*( f_dexdz + p_dz*f_d2exdz ) + + f_Ex0 + p_dx*( f_dEx0dx + p_dx*f_d2Ex0dx ) + + p_dy*( f_dEx0dy + p_dy*f_d2Ex0dy ) + + p_dz*( f_dEx0dz + p_dz*f_d2Ex0dz ) ); + hay = qdt_2mc*( f_ey + p_dx*( f_deydx + p_dx*f_d2eydx ) + + p_dy*( f_deydy + p_dy*f_d2eydy ) + + p_dz*( f_deydz + p_dz*f_d2eydz ) + + f_Ey0 + p_dx*( f_dEy0dx + p_dx*f_d2Ey0dx ) + + p_dy*( f_dEy0dy + p_dy*f_d2Ey0dy ) + + p_dz*( f_dEy0dz + p_dz*f_d2Ey0dz ) ); + haz = qdt_2mc*( f_ez + p_dx*( f_dezdx + p_dx*f_d2ezdx ) + + p_dy*( f_dezdy + p_dy*f_d2ezdy ) + + p_dz*( f_dezdz + p_dz*f_d2ezdz ) + + f_Ez0 + p_dx*( f_dEz0dx + p_dx*f_d2Ez0dx ) + + p_dy*( f_dEz0dy + p_dy*f_d2Ez0dy ) + + p_dz*( f_dEz0dz + p_dz*f_d2Ez0dz ) ); + hax += minus_dt_2c *( f_Gx0 + p_dx*( f_dGx0dx + p_dx*f_d2Gx0dx ) // Interpolate G0 + + p_dy*( f_dGx0dy + p_dy*f_d2Gx0dy ) + + p_dz*( f_dGx0dz + p_dz*f_d2Gx0dz ) ); + hay += minus_dt_2c *( f_Gy0 + p_dx*( f_dGy0dx + p_dx*f_d2Gy0dx ) + + p_dy*( f_dGy0dy + p_dy*f_d2Gy0dy ) + + p_dz*( f_dGy0dz + p_dz*f_d2Gy0dz ) ); + haz += minus_dt_2c *( f_Gz0 + p_dx*( f_dGz0dx + p_dx*f_d2Gz0dx ) + + p_dy*( f_dGz0dy + p_dy*f_d2Gz0dy ) + + p_dz*( f_dGz0dz + p_dz*f_d2Gz0dz ) ); + #else + hax = qdt_2mc*( f_ex + p_dx*( f_dexdx + p_dx*f_d2exdx ) // Interpolate E + + p_dy*( f_dexdy + p_dy*f_d2exdy ) + + p_dz*( f_dexdz + p_dz*f_d2exdz ) ); + hay = qdt_2mc*( f_ey + p_dx*( f_deydx + p_dx*f_d2eydx ) + + p_dy*( f_deydy + p_dy*f_d2eydy ) + + p_dz*( f_deydz + p_dz*f_d2eydz ) ); + haz = qdt_2mc*( f_ez + p_dx*( f_dezdx + p_dx*f_d2ezdx ) + + p_dy*( f_dezdy + p_dy*f_d2ezdy ) + + p_dz*( f_dezdz + p_dz*f_d2ezdz ) ); + #endif + l_cbx = f_cbx + p_dx*( f_dcbxdx + p_dx*f_d2cbxdx ) // Interpolate B + + p_dy*( f_dcbxdy + p_dy*f_d2cbxdy ) + + p_dz*( f_dcbxdz + p_dz*f_d2cbxdz ); + l_cby = f_cby + p_dx*( f_dcbydx + p_dx*f_d2cbydx ) + + p_dy*( f_dcbydy + p_dy*f_d2cbydy ) + + p_dz*( f_dcbydz + p_dz*f_d2cbydz ); + l_cbz = f_cbz + p_dx*( f_dcbzdx + p_dx*f_d2cbzdx ) + + p_dy*( f_dcbzdy + p_dy*f_d2cbzdy ) + + p_dz*( f_dcbzdz + p_dz*f_d2cbzdz ); +#endif +#endif v0 = qdt_4mc;///(float)sqrt(one + (p_ux*p_ux + (p_uy*p_uy + p_uz*p_uz))); /**/ // Boris - scalars v1 = l_cbx*l_cbx + (l_cby*l_cby + l_cbz*l_cbz); @@ -93,6 +218,104 @@ void uncenter_p_kokkos( p_uz += haz; }); + #undef p_dx + #undef p_dy + #undef p_dz + #undef p_ux + #undef p_uy + #undef p_uz +#ifdef VARIABLE_CHARGE + #undef p_q +#endif + #undef pii + + #undef f_ex + #undef f_dexdx + #undef f_dexdy + #undef f_dexdz + #undef f_d2exdx + #undef f_d2exdy + #undef f_d2exdz + #undef f_ey + #undef f_deydx + #undef f_deydy + #undef f_deydz + #undef f_d2eydx + #undef f_d2eydy + #undef f_d2eydz + #undef f_ez + #undef f_dezdx + #undef f_dezdy + #undef f_dezdz + #undef f_d2ezdx + #undef f_d2ezdy + #undef f_d2ezdz + #undef f_cbx + #undef f_dcbxdx + #undef f_dcbxdy + #undef f_dcbxdz + #undef f_d2cbxdx + #undef f_d2cbxdy + #undef f_d2cbxdz + #undef f_cby + #undef f_dcbydx + #undef f_dcbydy + #undef f_dcbydz + #undef f_d2cbydx + #undef f_d2cbydy + #undef f_d2cbydz + #undef f_cbz + #undef f_dcbzdx + #undef f_dcbzdy + #undef f_dcbzdz + #undef f_d2cbzdx + #undef f_d2cbzdy + #undef f_d2cbzdz + + #undef f_Ex0 + #undef f_dEx0dx + #undef f_dEx0dy + #undef f_dEx0dz + #undef f_d2Ex0dx + #undef f_d2Ex0dy + #undef f_d2Ex0dz + #undef f_Ey0 + #undef f_dEy0dx + #undef f_dEy0dy + #undef f_dEy0dz + #undef f_d2Ey0dx + #undef f_d2Ey0dy + #undef f_d2Ey0dz + #undef f_Ez0 + #undef f_dEz0dx + #undef f_dEz0dy + #undef f_dEz0dz + #undef f_d2Ez0dx + #undef f_d2Ez0dy + #undef f_d2Ez0dz + + #undef f_Gx0 + #undef f_dGx0dx + #undef f_dGx0dy + #undef f_dGx0dz + #undef f_d2Gx0dx + #undef f_d2Gx0dy + #undef f_d2Gx0dz + #undef f_Gy0 + #undef f_dGy0dx + #undef f_dGy0dy + #undef f_dGy0dz + #undef f_d2Gy0dx + #undef f_d2Gy0dy + #undef f_d2Gy0dz + #undef f_Gz0 + #undef f_dGz0dx + #undef f_dGz0dy + #undef f_dGz0dz + #undef f_d2Gz0dx + #undef f_d2Gz0dy + #undef f_d2Gz0dz + } void @@ -111,5 +334,6 @@ uncenter_p( /**/ species_t * RESTRICT sp, #else const float qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); #endif - uncenter_p_kokkos(k_particles, k_particles_i, k_interp, np, qdt_2mc); + const float dt_2c = (sp->g->dt)/(2*sp->g->cvac); + uncenter_p_kokkos(k_particles, k_particles_i, k_interp, np, qdt_2mc, dt_2c); } diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index e3d03f22..4ab420e8 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -312,17 +312,35 @@ vpic_simulation::dump_fluids( const char *fsp_name, * New dump logic *---------------------------------------------------------------------------*/ -static FieldInfo fieldInfo[12] = { - { "Electric Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, - { "Electric Field Divergence Error", "SCALAR", "1", "FLOATING_POINT", - sizeof(float) }, - { "Magnetic Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, - { "Magnetic Field Divergence Error", "SCALAR", "1", "FLOATING_POINT", - sizeof(float) }, - { "TCA Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, - { "Bound Charge Density", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, - { "Free Current Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, - { "Charge Density", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, +static FieldInfo fieldInfo[total_field_groups] = { + { "Electric Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // ex,ey,ez + { "Electric Field Divergence Error", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // div_e_err + { "Magnetic Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // cbx,cby,cbz + { "Scalar Electron Pressure", "SCALAR", "1", "FLOATING POINT", sizeof(float) }, // pe + { "External Magnetic Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // cbx0,cby0,cbz0 + { "Initial Electron Temperature", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // te0 + { "TCA Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // tcax,tcay,tcaz + { "Bound Charge Density", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // rhob + { "Free Current Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // jfx,jfy,jfz + { "Charge Density", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // rhof + { "Old Free Current Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // jfxold,jfyold,jfzold + { "Old Charge Density", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // rhofold + { "Smoothed E Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // tx,ty,tz + { "Electron Temperature", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // te + { "Smoothed B Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // ox,oy,oz + { "oe", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // oe + { "Electron Pressure", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // pex,pey,pez + { "Magnetic Field Divergence Error", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // div_b_err + { "Electron Velocity", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // ux,uy,uz + { "ue", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // ue + { "Electron Momentum Source", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // sx,sy,sz + { "Electron Energy Source", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // se +#ifdef EXTERNAL_FORCE + { "External Electric Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // Ex0,Ey0,Ez0 + { "_pad1", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // _pad1 + { "External Gravity Field", "VECTOR", "3", "FLOATING_POINT", sizeof(float) }, // Gx0,Gy0,Gz0 + { "_pad2", "SCALAR", "1", "FLOATING_POINT", sizeof(float) }, // _pad2 +#endif { "Edge Material", "VECTOR", "3", "INTEGER", sizeof(material_id) }, { "Node Material", "SCALAR", "1", "INTEGER", sizeof(material_id) }, { "Face Material", "VECTOR", "3", "INTEGER", sizeof(material_id) }, diff --git a/src/vpic/dump_strategy.cc b/src/vpic/dump_strategy.cc index 38e7ef56..145d77ec 100644 --- a/src/vpic/dump_strategy.cc +++ b/src/vpic/dump_strategy.cc @@ -916,8 +916,33 @@ void HDF5Dump::dump_fields( if (field_dump_flag.flags["oz"]) DUMP_FIELD_TO_HDF5("oz", oz, H5T_NATIVE_FLOAT); if (field_dump_flag.flags["oe"]) DUMP_FIELD_TO_HDF5("oe", oe, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["pex"]) DUMP_FIELD_TO_HDF5("pex", pex, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["pey"]) DUMP_FIELD_TO_HDF5("pey", pey, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["pez"]) DUMP_FIELD_TO_HDF5("pez", pez, H5T_NATIVE_FLOAT); if (field_dump_flag.flags["div_b_err"]) DUMP_FIELD_TO_HDF5("div_b_err", div_e_err, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["ux"]) DUMP_FIELD_TO_HDF5("ux", ux, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["uy"]) DUMP_FIELD_TO_HDF5("uy", uy, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["uz"]) DUMP_FIELD_TO_HDF5("uz", uz, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["ue"]) DUMP_FIELD_TO_HDF5("ue", ue, H5T_NATIVE_FLOAT); + + if (field_dump_flag.flags["sx"]) DUMP_FIELD_TO_HDF5("sx", sx, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["sy"]) DUMP_FIELD_TO_HDF5("sy", sy, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["sz"]) DUMP_FIELD_TO_HDF5("sz", sz, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["se"]) DUMP_FIELD_TO_HDF5("se", se, H5T_NATIVE_FLOAT); + +#ifdef EXTERNAL_FORCE + if (field_dump_flag.flags["Ex0"]) DUMP_FIELD_TO_HDF5("Ex0", Ex0, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["Ey0"]) DUMP_FIELD_TO_HDF5("Ey0", Ey0, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["Ez0"]) DUMP_FIELD_TO_HDF5("Ez0", Ez0, H5T_NATIVE_FLOAT); + //if (field_dump_flag.flags["_pad1"]) DUMP_FIELD_TO_HDF5("_pad1", _pad1, H5T_NATIVE_FLOAT); + + if (field_dump_flag.flags["Gx0"]) DUMP_FIELD_TO_HDF5("Gx0", Gx0, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["Gy0"]) DUMP_FIELD_TO_HDF5("Gy0", Gy0, H5T_NATIVE_FLOAT); + if (field_dump_flag.flags["Gz0"]) DUMP_FIELD_TO_HDF5("Gz0", Gz0, H5T_NATIVE_FLOAT); + //if (field_dump_flag.flags["_pad2"]) DUMP_FIELD_TO_HDF5("_pad2", _pad2, H5T_NATIVE_FLOAT); +#endif + el2 = uptime() - el2; if ( rank==0 ) log_printf("TimeHDF5Write: %.2f s\n", el2); diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 69fca667..7ca3ed6b 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -161,23 +161,41 @@ struct field_dump_flag_t std::vector flag_keys = { "ex", "ey", "ez", "div_e_err", "cbx", "cby", "cbz", "pe", + "cbx0", "cby0", "cbz0", "te0", "tcax", "tcay", "tcaz", "rhob", "jfx", "jfy", "jfz", "rhof", "jfxold", "jfyold", "jfzold", "rhofold", - "cbx0", "cby0", "cbz0", "te0", "tx", "ty", "tz", "te", - "ox", "oy", "oz", "oe","div_b_err" + "ox", "oy", "oz", "oe", + "pex", "pey", "pez", "div_b_err", + "ux", "uy", "uz", "ue", + "sx", "sy", "sz", "se", +#ifdef EXTERNAL_FORCE + "Ex0", "Ey0", "Ez0", "_pad1", + "Gx0", "Gy0", "Gz0", "_pad2", +#endif + "ematx", "ematy", "ematz", "nmat", + "fmatx", "fmaty", "fmatz", "cmat", }; std::unordered_map flags = { {"ex", true}, {"ey", true}, {"ez", true}, {"div_e_err", true}, {"cbx", true}, {"cby", true}, {"cbz", true}, {"pe", true}, + {"cbx0", true}, {"cby0", true}, {"cbz0", true}, {"te0", true}, {"tcax", true}, {"tcay", true}, {"tcaz", true}, {"rhob", true}, {"jfx", true}, {"jfy", true}, {"jfz", true}, {"rhof", true}, {"jfxold", true}, {"jfyold", true}, {"jfzold", true}, {"rhofold", true}, - {"cbx0", true}, {"cby0", true}, {"cbz0", true}, {"te0", true}, {"tx", true}, {"ty", true}, {"tz", true}, {"te", true}, - {"ox", true}, {"oy", true}, {"oz", true}, {"oe", true}, {"div_b_err", true} + {"ox", true}, {"oy", true}, {"oz", true}, {"oe", true}, + {"pex", true}, {"pey", true}, {"pez", true}, {"div_b_err", true}, + {"ux", true}, {"uy", true}, {"uz", true}, {"ue", true}, + {"sx", true}, {"sy", true}, {"sz", true}, {"se", true}, +#ifdef EXTERNAL_FORCE + {"Ex0", true}, {"Ey0", true}, {"Ez0", true}, {"_pad1", true}, + {"Gx0", true}, {"Gy0", true}, {"Gz0", true}, {"_pad2", true}, +#endif + {"ematx", true}, {"ematy", true}, {"ematz", true}, {"nmat", true}, + {"fmatx", true}, {"fmaty", true}, {"fmatz", true}, {"cmat", true}, }; void disableE() { @@ -212,6 +230,16 @@ struct field_dump_flag_t flags["ox"] = false, flags["oy"] = false, flags["oz"] = false; } +#ifdef EXTERNAL_FORCE + void disableE0() { + flags["Ex0"] = false, flags["Ey0"] = false, flags["Ez0"] = false; + } + + void disableG0() { + flags["Gx0"] = false, flags["Gy0"] = false, flags["Gz0"] = false; + } +#endif + void disableALL() { for (auto& [key,_] : flags) flags[key] = false; @@ -249,6 +277,16 @@ struct field_dump_flag_t flags["ox"] = true, flags["oy"] = true, flags["oz"] = true; } +#ifdef EXTERNAL_FORCE + void enableE0() { + flags["Ex0"] = true, flags["Ey0"] = true, flags["Ez0"] = true; + } + + void enableG0() { + flags["Gx0"] = true, flags["Gy0"] = true, flags["Gz0"] = true; + } +#endif + void enableALL() { for (auto& [key,_] : flags) flags[key] = true; @@ -285,6 +323,17 @@ struct field_dump_flag_t bool enabledO() { return flags["ox"] && flags["oy"] && flags["oz"]; } + +#ifdef EXTERNAL_FORCE + bool enabledE0() { + return flags["Ex0"] && flags["Ey0"] && flags["Ez0"]; + } + + bool enabledG0() { + return flags["Gx0"] && flags["Gy0"] && flags["Gz0"]; + } +#endif + }; struct hydro_dump_flag_t diff --git a/src/vpic/initialize.cc b/src/vpic/initialize.cc index 30ebf605..40a1aabe 100644 --- a/src/vpic/initialize.cc +++ b/src/vpic/initialize.cc @@ -31,6 +31,22 @@ vpic_simulation::initialize( int argc, grid->eos_den = 1.; grid->kappa = 0; + // need isub=0 initialized for advance_b(...) to + // migrate QS shape currents from ghosts to live cells + grid->isub = 0; + +#ifdef EXTERNAL_FORCE + // Initialize E0, G0 fields to default values + for (int ii = 0; ii < grid->nv; ii++) { + field(ii).Ex0 = 0.; + field(ii).Ey0 = 0.; + field(ii).Ez0 = 0.; + field(ii).Gx0 = 0.; + field(ii).Gy0 = 0.; + field(ii).Gz0 = 0.; + } +#endif + // Call the user initialize the simulation TIC user_initialization( argc, argv ); TOC( user_initialization, 1 ); diff --git a/src/vpic/kokkos_helpers.h b/src/vpic/kokkos_helpers.h index 63696346..be42882b 100644 --- a/src/vpic/kokkos_helpers.h +++ b/src/vpic/kokkos_helpers.h @@ -9,8 +9,13 @@ // This module implements kokkos macros -#define FIELD_VAR_COUNT 44 -#define FIELD_EDGE_COUNT 8 +#ifdef EXTERNAL_FORCE + #define FIELD_VAR_COUNT 52 + #define FIELD_EDGE_COUNT 8 +#else + #define FIELD_VAR_COUNT 44 + #define FIELD_EDGE_COUNT 8 +#endif #ifdef VARIABLE_CHARGE #define PARTICLE_VAR_COUNT 8 @@ -21,7 +26,6 @@ #define PARTICLE_MOVER_VAR_COUNT 3 #define ACCUMULATOR_VAR_COUNT 4 #define ACCUMULATOR_ARRAY_LENGTH 4 -#define INTERPOLATOR_VAR_COUNT 18 #define MATERIAL_COEFFICIENT_VAR_COUNT 13 #ifdef VARIABLE_CHARGE #define HYDRO_VAR_COUNT 16 @@ -31,6 +35,22 @@ #define NUM_J_DIMS 4 #define FLUID_VAR_COUNT 6+4 +#ifdef SHAPE_NGP + #ifdef EXTERNAL_FORCE + #define INTERPOLATOR_VAR_COUNT 12 + #else + #define INTERPOLATOR_VAR_COUNT 6 + #endif +#else +#ifdef SHAPE_QS + #ifdef EXTERNAL_FORCE + #define INTERPOLATOR_VAR_COUNT 84 + #else + #define INTERPOLATOR_VAR_COUNT 42 + #endif +#endif +#endif + #ifdef KOKKOS_ENABLE_CUDA #define KOKKOS_SCATTER_DUPLICATED Kokkos::Experimental::ScatterNonDuplicated #define KOKKOS_SCATTER_ATOMIC Kokkos::Experimental::ScatterAtomic @@ -189,7 +209,15 @@ namespace field_var { sx = 40, sy = 41, sz = 42, - se = 43 + se = 43, + #ifdef EXTERNAL_FORCE + Ex0 = 44, + Ey0 = 45, + Ez0 = 46, + Gx0 = 47, + Gy0 = 48, + Gz0 = 49, + #endif }; }; namespace field_edge_var { \ @@ -207,24 +235,111 @@ namespace field_edge_var { \ namespace interpolator_var { enum i_r { +#ifdef SHAPE_NGP + ex = 0, + ey = 1, + ez = 2, + cbx = 3, + cby = 4, + cbz = 5, + #ifdef EXTERNAL_FORCE + Ex0 = 6, + Ey0 = 7, + Ez0 = 8, + Gx0 = 9, + Gy0 = 10, + Gz0 = 11, + #endif +#else +#ifdef SHAPE_QS ex = 0, - dexdy = 1, - dexdz = 2, - d2exdydz = 3, - ey = 4, - deydz = 5, - deydx = 6, - d2eydzdx = 7, - ez = 8, - dezdx = 9, - dezdy = 10, - d2ezdxdy = 11, - cbx = 12, - dcbxdx = 13, - cby = 14, - dcbydy = 15, - cbz = 16, - dcbzdz = 17, + dexdx = 1, + dexdy = 2, + dexdz = 3, + d2exdx = 4, + d2exdy = 5, + d2exdz = 6, + ey = 7, + deydx = 8, + deydy = 9, + deydz = 10, + d2eydx = 11, + d2eydy = 12, + d2eydz = 13, + ez = 14, + dezdx = 15, + dezdy = 16, + dezdz = 17, + d2ezdx = 18, + d2ezdy = 19, + d2ezdz = 20, + cbx = 21, + dcbxdx = 22, + dcbxdy = 23, + dcbxdz = 24, + d2cbxdx = 25, + d2cbxdy = 26, + d2cbxdz = 27, + cby = 28, + dcbydx = 29, + dcbydy = 30, + dcbydz = 31, + d2cbydx = 32, + d2cbydy = 33, + d2cbydz = 34, + cbz = 35, + dcbzdx = 36, + dcbzdy = 37, + dcbzdz = 38, + d2cbzdx = 39, + d2cbzdy = 40, + d2cbzdz = 41, + #ifdef EXTERNAL_FORCE + Ex0 = 42, + dEx0dx = 43, + dEx0dy = 44, + dEx0dz = 45, + d2Ex0dx = 46, + d2Ex0dy = 47, + d2Ex0dz = 48, + Ey0 = 49, + dEy0dx = 50, + dEy0dy = 51, + dEy0dz = 52, + d2Ey0dx = 53, + d2Ey0dy = 54, + d2Ey0dz = 55, + Ez0 = 56, + dEz0dx = 57, + dEz0dy = 58, + dEz0dz = 59, + d2Ez0dx = 60, + d2Ez0dy = 61, + d2Ez0dz = 62, + Gx0 = 63, + dGx0dx = 64, + dGx0dy = 65, + dGx0dz = 66, + d2Gx0dx = 67, + d2Gx0dy = 68, + d2Gx0dz = 69, + Gy0 = 70, + dGy0dx = 71, + dGy0dy = 72, + dGy0dz = 73, + d2Gy0dx = 74, + d2Gy0dy = 75, + d2Gy0dz = 76, + Gz0 = 77, + dGz0dx = 78, + dGz0dy = 79, + dGz0dz = 80, + d2Gz0dx = 81, + d2Gz0dy = 82, + d2Gz0dz = 83, + #endif +#endif +#endif }; }; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index f01eba3b..0a082293 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -50,30 +50,47 @@ const uint64_t electric (1ULL<<0 | 1ULL<<1 | 1ULL<<2); const uint64_t div_e_err (1ULL<<3); const uint64_t magnetic (1ULL<<4 | 1ULL<<5 | 1ULL<<6); const uint64_t pe (1ULL<<7); -const uint64_t tca (1ULL<<8 | 1ULL<<9 | 1ULL<<10); -const uint64_t rhob (1ULL<<11); -const uint64_t current (1ULL<<12 | 1ULL<<13 | 1ULL<<14); -const uint64_t rhof (1ULL<<15); -const uint64_t currentold (1ULL<<16 | 1ULL<<17 | 1ULL<<18); -const uint64_t rhofold (1ULL<<19); -const uint64_t magnetic0 (1ULL<<20 | 1ULL<<21 | 1ULL<<22); -const uint64_t te0 (1ULL<<23); +const uint64_t magnetic0 (1ULL<<8 | 1ULL<<9 | 1ULL<<10); +const uint64_t te0 (1ULL<<11); +const uint64_t tca (1ULL<<12 | 1ULL<<13 | 1ULL<<14); +const uint64_t rhob (1ULL<<15); +const uint64_t current (1ULL<<16 | 1ULL<<17 | 1ULL<<18); +const uint64_t rhof (1ULL<<19); +const uint64_t currentold (1ULL<<20 | 1ULL<<21 | 1ULL<<22); +const uint64_t rhofold (1ULL<<23); const uint64_t tempt (1ULL<<24 | 1ULL<<25 | 1ULL<<26); const uint64_t te (1ULL<<27); const uint64_t tempo (1ULL<<28 | 1ULL<<29 | 1ULL<<30); const uint64_t oe (1ULL<<31); -//const uint64_t tempp (1ULL<<32 | 1ULL<<33 | 1ULL<<34); -//const uint64_t pe (1ULL<<35); -//const uint64_t emat (1ULL<<36 | 1ULL<<37 | 1ULL<<38); -//const uint64_t nmat (1ULL<<39); -//const uint64_t fmat (1ULL<<40 | 1ULL<<41 | 1ULL<<42); -//const uint64_t cmat (1ULL<<43); +const uint64_t tempp (1ULL<<32 | 1ULL<<33 | 1ULL<<34); +const uint64_t div_b_err (1ULL<<35); +const uint64_t uxyz (1ULL<<36 | 1ULL<<37 | 1ULL<<38); +const uint64_t ue (1ULL<<39); +const uint64_t sxyz (1ULL<<40 | 1ULL<<41 | 1ULL<<42); +const uint64_t se (1ULL<<43); +const uint64_t electric0 (1ULL<<44 | 1ULL<<45 | 1ULL<<46); +//const uint64_t pad1 (1ULL<<47); +const uint64_t gravity0 (1ULL<<48 | 1ULL<<49 | 1ULL<<50); +//const uint64_t pad2 (1ULL<<51); +//const uint64_t emat (1ULL<<52 | 1ULL<<53 | 1ULL<<54); +//const uint64_t nmat (1ULL<<55); +//const uint64_t fmat (1ULL<<56 | 1ULL<<57 | 1ULL<<58); +//const uint64_t cmat (1ULL<<59); // 1ULL = unsigned long long ensures 64bits for bitshift operators -const size_t total_field_variables(32); -const size_t total_field_groups(16); // this counts vectors, tensors etc... +#ifdef EXTERNAL_FORCE +const size_t total_field_variables(60); +const size_t total_field_groups(30); // this counts vectors, tensors etc... // These bits will be tested to determine which variables to output -const size_t field_indeces[22] = { 0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31 }; +const size_t field_indeces[30] = { 0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31, + 32, 35, 36, 39, 40, 43, 44, 47, 48, 51, 52, 55, 56, 59 }; +#else +const size_t total_field_variables(52); +const size_t total_field_groups(26); // this counts vectors, tensors etc... +// These bits will be tested to determine which variables to output +const size_t field_indeces[26] = { 0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31, + 32, 35, 36, 39, 40, 43, 44, 47, 48, 51 }; +#endif struct FieldInfo { char name[128]; diff --git a/test/integrated/to_completion/fluid_species.deck b/test/integrated/to_completion/fluid_species.deck index 9ad09f03..0c93c210 100644 --- a/test/integrated/to_completion/fluid_species.deck +++ b/test/integrated/to_completion/fluid_species.deck @@ -366,7 +366,7 @@ sim_log( "Loading fields" ); * Convenience functions for simlog output *------------------------------------------------------------------------*/ - char varlist[512]; + char varlist[1024]; create_field_list(varlist, global->fdParams); sim_log ( "Fields variable list: " << varlist ); diff --git a/test/integrated/to_completion/pcai.deck b/test/integrated/to_completion/pcai.deck index 0d8c3108..301751aa 100644 --- a/test/integrated/to_completion/pcai.deck +++ b/test/integrated/to_completion/pcai.deck @@ -462,7 +462,7 @@ sim_log( "Loading fields" ); * Convenience functions for simlog output *------------------------------------------------------------------------*/ - char varlist[512]; + char varlist[1024]; create_field_list(varlist, global->fdParams); sim_log ( "Fields variable list: " << varlist ); diff --git a/test/integrated/to_completion/shock.deck b/test/integrated/to_completion/shock.deck index 00661360..5d4f5db9 100755 --- a/test/integrated/to_completion/shock.deck +++ b/test/integrated/to_completion/shock.deck @@ -542,7 +542,7 @@ sim_log( "Loading fields" ); * Convenience functions for simlog output *------------------------------------------------------------------------*/ - char varlist[512]; + char varlist[1024]; create_field_list(varlist, global->fdParams); sim_log ( "Fields variable list: " << varlist );