From c694d90c91a0ad45deede302345afcc16f099b66 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 15 Jul 2013 13:49:05 +0200
Subject: [PATCH 001/118] MIC requires that mpi.h be included before stdio.h
 (issue #12)

---
 PSKOutput3D/PSKhdf5adaptor.cpp      | 1 +
 bc/BcParticles.cpp                  | 1 +
 fields/EMfields3D.cpp               | 1 +
 grids/Grid3DCU.cpp                  | 1 +
 iPic3D.cpp                          | 1 +
 inputoutput/Collective.cpp          | 1 +
 inputoutput/WriteOutputParallel.cpp | 1 +
 inputoutput/phdf5.cpp               | 1 +
 particles/Particles3D.cpp           | 1 +
 particles/Particles3Dcomm.cpp       | 1 +
 solvers/CG.cpp                      | 1 +
 solvers/GMRES.cpp                   | 1 +
 12 files changed, 12 insertions(+)
diff --git a/PSKOutput3D/PSKhdf5adaptor.cpp b/PSKOutput3D/PSKhdf5adaptor.cpp
index bb220cce..33c83115 100644
--- a/PSKOutput3D/PSKhdf5adaptor.cpp
+++ b/PSKOutput3D/PSKhdf5adaptor.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include "PSKhdf5adaptor.h"
 
 using namespace PSK;
diff --git a/bc/BcParticles.cpp b/bc/BcParticles.cpp
index b26dee05..ff3999a3 100644
--- a/bc/BcParticles.cpp
+++ b/bc/BcParticles.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include "BcParticles.h"
 
 /** set the boundary VirtualTopology3D3Dcondition  for particle in 2D
diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 585d8dc9..bca24228 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include "EMfields3D.h"
 
 /*! constructor */
diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index 1fd94891..c2b92797 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include "Grid3DCU.h"
 
 /*! constructor */
diff --git a/iPic3D.cpp b/iPic3D.cpp
index 7fc86b24..1803f2aa 100644
--- a/iPic3D.cpp
+++ b/iPic3D.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include <iomanip>
 #include "iPic3D.h"
 
diff --git a/inputoutput/Collective.cpp b/inputoutput/Collective.cpp
index 1081f4a7..9f11e3de 100644
--- a/inputoutput/Collective.cpp
+++ b/inputoutput/Collective.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include "Collective.h"
 
 /*! Read the input file from text file and put the data in a collective wrapper: if it's a restart read from input file basic sim data and load particles and EM field from restart file */
diff --git a/inputoutput/WriteOutputParallel.cpp b/inputoutput/WriteOutputParallel.cpp
index 2f83af88..6de0d2ea 100644
--- a/inputoutput/WriteOutputParallel.cpp
+++ b/inputoutput/WriteOutputParallel.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include "WriteOutputParallel.h"
 
 void WriteOutputParallel(Grid3DCU *grid, EMfields3D *EMf, CollectiveIO *col, VCtopology3D *vct, int cycle){
diff --git a/inputoutput/phdf5.cpp b/inputoutput/phdf5.cpp
index 9ffc94f3..591f54ca 100644
--- a/inputoutput/phdf5.cpp
+++ b/inputoutput/phdf5.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include "phdf5.h"
 
 PHDF5fileClass::PHDF5fileClass(string filestr, int nd, int *coord, MPI_Comm mpicomm){
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index dc24f0ca..a2a7f05c 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -5,6 +5,7 @@ developers: Stefano Markidis, Giovanni Lapenta
  ********************************************************************************************/
 
 
+#include <mpi.h>
 #include <iostream>
 #include <math.h>
 
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index d7fcef81..db0ec204 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -4,6 +4,7 @@
 developers: Stefano Markidis, Giovanni Lapenta.
  ********************************************************************************************/
 
+#include <mpi.h>
 #include <iostream>
 #include <math.h>
 #include "VirtualTopology3D.h"
diff --git a/solvers/CG.cpp b/solvers/CG.cpp
index 7e9f41ef..8b36d461 100644
--- a/solvers/CG.cpp
+++ b/solvers/CG.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include "CG.h"
 
 /**
diff --git a/solvers/GMRES.cpp b/solvers/GMRES.cpp
index c245c79a..1e3d4b1b 100644
--- a/solvers/GMRES.cpp
+++ b/solvers/GMRES.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include "GMRES.h"
 
 void GMRES(FIELD_IMAGE FunctionImage, double *xkrylov, int xkrylovlen, double *b, int m, int max_iter, double tol, Grid * grid, VirtualTopology3D * vct, Field * field) {

From c4eaee26ea1dadbb0f944cc938812d8e0e5a2515 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 15 Jul 2013 21:20:35 +0200
Subject: [PATCH 002/118] Issue #29: add errors.h for diagnostics: eprintf()
 and invalid_value_error()

---
 include/errors.h   | 25 +++++++++++++++++++++++++
 utility/errors.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 include/errors.h
 create mode 100644 utility/errors.cpp

diff --git a/include/errors.h b/include/errors.h
new file mode 100644
index 00000000..ca80313f
--- /dev/null
+++ b/include/errors.h
@@ -0,0 +1,25 @@
+#ifndef ipic_errors_H
+#define ipic_errors_H
+// Error-reporting helpers.  errmsg_printf_fileLine() prints the message to stdout and aborts.
+void errmsg_printf_fileLine(const char *func, const char *file, int line_number, const char *format, ...);
+void eprintf_fileLine(const char *func, const char *file, int line_number, const char *format, ...);
+void Wprintf_fileLine(const char *func, const char *file, int line_number, const char *format, ...);
+// NOTE(review): eprintf_fileLine/Wprintf_fileLine are only declared; utility/errors.cpp does not define them.
+#define errmsg_printf(args...) \
+  errmsg_printf_fileLine(__func__, __FILE__, __LINE__, ## args);
+#define eprintf(args...) \
+  errmsg_printf_fileLine(__func__, __FILE__, __LINE__, ## args);
+#define Wprintf(args...) \
+  Wprintf_fileLine(__func__, __FILE__, __LINE__, ## args);
+#define declare_invalid_value_error(t1) \
+  void invalid_value_error_fileLine(const char* file, int line, const char* func, \
+    const char* type, const char* expr, t1 val)
+declare_invalid_value_error(double); // the macro omits ';' so each use supplies exactly one
+declare_invalid_value_error(int);
+declare_invalid_value_error(const char*);
+#define unsupported_value_error(val) invalid_value_error_fileLine( \
+  __FILE__, __LINE__, __func__, "unsupported", #val, val);
+#define invalid_value_error(val) invalid_value_error_fileLine( \
+  __FILE__, __LINE__, __func__, "invalid", #val, val);
+
+#endif
diff --git a/utility/errors.cpp b/utility/errors.cpp
new file mode 100644
index 00000000..3572df99
--- /dev/null
+++ b/utility/errors.cpp
@@ -0,0 +1,44 @@
+ 
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include "errors.h"
+//#include "MPIdata.h" // for rank
+
+/** implementation of declarations in errors.h **/
+
+// Print "ERROR in function ..." plus the printf-style message to stdout, then abort().
+void errmsg_printf_fileLine(const char *func, const char *file, int line_number,
+  const char *format, ...)
+{
+  FILE* fptr = stdout;
+  fflush(fptr);
+  va_list args;
+  va_start(args, format);
+  fprintf(fptr, "ERROR in function %s, file %s, line %d: \n\t",
+    func, file, line_number);
+  /* print out remainder of message */
+  vfprintf(fptr, format, args);
+  va_end(args);
+  // append terminating newline so user does not have to do it
+  fprintf(fptr, "\n");
+  fflush(fptr);
+
+  abort();
+}
+
+// Defines one invalid_value_error_fileLine() overload (reports on std::cerr, then aborts).
+#define implement_invalid_value_error(t1) \
+  void invalid_value_error_fileLine(const char* file, int line, const char* func, \
+    const char* type, const char* expr, t1 val) \
+  { \
+    std::cerr<< "ERROR in file " << file << ", line " << line  \
+      << ", function " << func  \
+      <<"\n\t" << type << " value: " << expr << " = " << val << std::endl; \
+      std::abort(); \
+  }
+implement_invalid_value_error(double)
+implement_invalid_value_error(int)
+implement_invalid_value_error(const char*)
+

From 37414e353b3aa458a6b81325a645f1db8faf6b09 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 15 Jul 2013 21:32:35 +0200
Subject: [PATCH 003/118] issue #30: MPIdata should be a singleton

---
 iPic3D.cpp          |  3 +++
 include/MPIdata.h   | 60 ++++++++++++++-------------------------------
 include/PSKOutput.h |  4 +--
 include/iPic3D.h    |  2 +-
 main/iPic3Dlib.cpp  | 15 +++++++++---
 utility/MPIdata.cpp | 59 ++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 95 insertions(+), 48 deletions(-)
 create mode 100644 utility/MPIdata.cpp

diff --git a/iPic3D.cpp b/iPic3D.cpp
index 1803f2aa..5e31c535 100644
--- a/iPic3D.cpp
+++ b/iPic3D.cpp
@@ -2,6 +2,7 @@
 #include <mpi.h>
 #include <iomanip>
 #include "iPic3D.h"
+#include "debug.h"
 
 using namespace iPic3D;
 
@@ -10,6 +11,8 @@ int main(int argc, char **argv) {
   iPic3D::c_Solver KCode;
   bool b_err = false;
 
+  MPIdata::init(&argc, &argv);
+  //dprintf("MPI has been initialized.");
   KCode.Init(argc, argv);
 
   for (int i = KCode.FirstCycle(); i < KCode.LastCycle(); i++) {
diff --git a/include/MPIdata.h b/include/MPIdata.h
index 07efe13b..c4d535ab 100644
--- a/include/MPIdata.h
+++ b/include/MPIdata.h
@@ -11,11 +11,6 @@ email                : markidis@lanl.gov, lapenta@lanl.gov
 #define MPIDATA_H
 
 #include <mpi.h>
-#include <iostream>
-
-using std::cout;
-using std::endl;
-
 /**
  * MPI Data Structure. This class contains:
  *
@@ -29,55 +24,38 @@ using std::endl;
  * (C) 2004 Los Alamos National Laboratory
  * @author Stefano Markidis, Giovanni Lapenta
  * @version 1.0
+ *
+ * I made this class a singleton.  It should only be created once,
+ * since MPI_Init should be called only once. -Alec
  */
 class MPIdata {
 public:
-  /** constructor: setup MPI environment */
-  MPIdata(int *, char ***);
-  /** destructor */
-   ~MPIdata();
-  /** initialize MPIdata */
-  void init(int *, char ***);
+  static MPIdata& instance();
+private:
+  // disable constructor and destructor of this singleton
+  // by making them private.
+  ~MPIdata(){}
+  MPIdata(){}
+public:
+  /** initialize MPI environment */
+  static void init(int *, char ***);
   /** close MPI environment */
   void finalize_mpi();
   /** print MPI data structure */
   void Print(void);
   /** MPI status during the communication */
   MPI_Status status;
+public:
+  static int get_rank(){return instance().rank;}
+  static int get_nprocs(){return instance().nprocs;}
+private:
   /** rank of the process */
-  int rank;
+  static int rank;
   /** number of processes */
-  int nprocs;
+  static int nprocs;
 
+  // evidently unused...
   char *buffer;
   int buffer_size;
 };
-inline MPIdata::MPIdata(int *argc, char ***argv) {
-  /* Initialize the MPI API */
-  MPI_Init(argc, argv);
-
-  /* Set rank */
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
-  /* Set total number of processors */
-  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-
-}
-
-inline MPIdata::~MPIdata() {
-}
-
-inline void MPIdata::finalize_mpi() {
-  MPI_Finalize();
-}
-
-inline void MPIdata::Print(void) {
-  cout << endl;
-  cout << "Number of processes = " << nprocs << endl;
-  cout << "-------------------------" << endl;
-  cout << endl;
-}
-
-// extern MPIdata *mpi; // instantiated in iPIC3D.cpp
-
 #endif
diff --git a/include/PSKOutput.h b/include/PSKOutput.h
index d1e10dc6..604387cf 100644
--- a/include/PSKOutput.h
+++ b/include/PSKOutput.h
@@ -335,7 +335,7 @@ template < class Toa > class myOutputAgent:public PSK::OutputAgent < Toa > {
     stringstream ss;
     stringstream cc;
     stringstream ii;
-    ss << _mpi->rank;
+    ss << _mpi->get_rank();
     cc << cycle;
     const int ns = _col->getNs();
     if (tag.find("last_cycle", 0) != string::npos)
@@ -608,7 +608,7 @@ template < class Toa > class myOutputAgent:public PSK::OutputAgent < Toa > {
   void output(const string & tag, int cycle, int sample) {
     stringstream ss;
     stringstream cc;
-    ss << _mpi->rank;
+    ss << _mpi->get_rank();
     cc << cycle;
     const int ns = _col->getNs();
 
diff --git a/include/iPic3D.h b/include/iPic3D.h
index a1b551f5..440fdf02 100644
--- a/include/iPic3D.h
+++ b/include/iPic3D.h
@@ -46,7 +46,7 @@ namespace iPic3D {
     inline int get_myrank();
 
   private:
-    MPIdata       * mpi;
+    static MPIdata * mpi;
     Collective    *col;
     VCtopology3D  *vct;
     Grid3DCU      *grid;
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 1f15e0dc..54e8495a 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -2,14 +2,20 @@
 #include "iPic3D.h"
 
 using namespace iPic3D;
+MPIdata* iPic3D::c_Solver::mpi=0;
 
 int c_Solver::Init(int argc, char **argv) {
-  // initialize MPI environment
+  // get MPI data
+  //
+  // c_Solver is not a singleton, so the following line was pulled out.
+  //MPIdata::init(&argc, &argv);
+  //
+  // initialized MPI environment
   // nprocs = number of processors
   // myrank = rank of tha process*/
-  mpi = new MPIdata(&argc, &argv);
-  nprocs = mpi->nprocs;
-  myrank = mpi->rank;
+  mpi = &MPIdata::instance();
+  nprocs = MPIdata::get_nprocs();
+  myrank = MPIdata::get_rank();
 
   col = new Collective(argc, argv); // Every proc loads the parameters of simulation from class Collective
   verbose = col->getVerbose();
@@ -353,3 +359,4 @@ void c_Solver::Finalize() {
   // close MPI
   mpi->finalize_mpi();
 }
+
diff --git a/utility/MPIdata.cpp b/utility/MPIdata.cpp
new file mode 100644
index 00000000..6baa3697
--- /dev/null
+++ b/utility/MPIdata.cpp
@@ -0,0 +1,59 @@
+#include <mpi.h>
+#include <iostream>
+#include <assert.h>
+#include "MPIdata.h"
+
+using std::cout;
+using std::endl;
+
+// static data members; they hold -1 until init() sets them
+int MPIdata::rank=-1;
+int MPIdata::nprocs=-1;
+
+// code to check that init() is called before instance() (file scope only)
+static bool MPIdata_is_initialized=false;
+static bool MPIdata_assert_initialized()
+{
+  assert(MPIdata_is_initialized);
+  return true;
+}
+
+MPIdata& MPIdata::instance()
+{
+  // The static initializers below run only on the first call; the first
+  // one asserts (unless NDEBUG is defined) that init() was called before.
+  static bool check = MPIdata_assert_initialized();
+  (void)check; // only needed for its initializer; avoids an unused-variable warning
+  // created once and never deleted in this file (the destructor is private)
+  static MPIdata* instance = new MPIdata;
+  return *instance;
+}
+
+void MPIdata::init(int *argc, char ***argv) {
+  assert(!MPIdata_is_initialized);
+
+  /* Initialize the MPI API */
+  MPI_Init(argc, argv);
+
+  /* Set rank */
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  /* Set total number of processors */
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  MPIdata_is_initialized = true;
+}
+
+void MPIdata::finalize_mpi() {
+  MPI_Finalize();
+}
+
+void MPIdata::Print(void) {
+  cout << endl;
+  cout << "Number of processes = " << get_nprocs() << endl;
+  cout << "-------------------------" << endl;
+  cout << endl;
+}
+
+// extern MPIdata *mpi; // instantiated in iPIC3D.cpp
+

From 984f59954cab1b85287b4cd08f224b2ca7a7cc09 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 15 Jul 2013 21:34:00 +0200
Subject: [PATCH 004/118] debug.h was broken under commit bd0fa30835c.  This
 uses MPIdata singleton to fix it.

---
 include/debug.h   | 2 +-
 utility/debug.cpp | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/debug.h b/include/debug.h
index 04b69172..6532f599 100644
--- a/include/debug.h
+++ b/include/debug.h
@@ -4,7 +4,7 @@
 #ifndef __DEBUG_H__
 #define __DEBUG_H__
 
-#include <stdarg.h>
+#include <cstdarg>
 #include <cstdio>
 
 #include "debug.h"
diff --git a/utility/debug.cpp b/utility/debug.cpp
index e4271d88..306775fb 100644
--- a/utility/debug.cpp
+++ b/utility/debug.cpp
@@ -1,11 +1,12 @@
 
+#include "MPIdata.h" // for get_rank
 #include "debug.h"
 
 #define implement_dprintvar_fileLine(code,type) \
   void dprintvar_fileLine(const char* func, const char* file, int line, \
     const char* name, type val) \
   { \
-    dfprintf_fileLine(stderr,func,file,line, code " == %s",val,name); \
+    dfprintf_fileLine(stdout,func,file,line, code " == %s",val,name); \
   }
 
 implement_dprintvar_fileLine("%s", const char *);
@@ -16,8 +17,10 @@ void dfprintf_fileLine(FILE * fptr, const char *func, const char *file, int line
   fflush(fptr);
   va_list args;
   va_start(args, format);
-  fprintf(fptr, "(%d) DEBUG %s(), %s:%d: ", func, file, // my_basename(file),
-          line_number);
+  fprintf(fptr, "(%d) DEBUG %s(), %s:%d: ",
+    MPIdata::get_rank(),
+    func, file, // my_basename(file),
+    line_number);
   /* print out remainder of message */
   vfprintf(fptr, format, args);
   va_end(args);

From da737d93b64bf3732bd46f033f15361f3fb76ee5 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 15 Jul 2013 21:38:35 +0200
Subject: [PATCH 005/118] issue#31: consistently use stdout rather than stderr

---
 utility/asserts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utility/asserts.cpp b/utility/asserts.cpp
index 1de2301e..c5beed50 100644
--- a/utility/asserts.cpp
+++ b/utility/asserts.cpp
@@ -2,7 +2,7 @@
 #include "asserts.h"
 
 void assert_error(const char *file, int line, const char *func, const char *op, const char *lhs_str, const char *rhs_str, double lhs, double rhs) {
-  fprintf(stderr, "ERROR in file %s, line %d, function %s" "\n\tassertion failed: %s %s %s, i.e., %24.16e %s %24.16e\n", file, line, func, lhs_str, op, rhs_str, lhs, op, rhs);
+  fprintf(stdout, "ERROR in file %s, line %d, function %s" "\n\tassertion failed: %s %s %s, i.e., %24.16e %s %24.16e\n", file, line, func, lhs_str, op, rhs_str, lhs, op, rhs);
   abort();
 }
 

From aeccc32e81466330a89ebd5a1d0cfaced4258fd6 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 15 Jul 2013 21:43:48 +0200
Subject: [PATCH 006/118] issue #32: implemented USING_PARALLEL_HDF5 so that
 users are not forced to compile with parallel hdf5

---
 include/ipicdefs.h    |  7 +++++++
 inputoutput/phdf5.cpp | 12 ++++++++++++
 2 files changed, 19 insertions(+)
 create mode 100644 include/ipicdefs.h

diff --git a/include/ipicdefs.h b/include/ipicdefs.h
new file mode 100644
index 00000000..d9b7720c
--- /dev/null
+++ b/include/ipicdefs.h
@@ -0,0 +1,7 @@
+#ifndef ipicdefs_H
+#define ipicdefs_H
+// Compile-time configuration switches for iPic3D.
+// uncomment the following line to use parallel hdf5; while it is undefined,
+// inputoutput/phdf5.cpp calls eprintf() (which aborts) instead of enabling MPI-IO
+//#define USING_PARALLEL_HDF5
+#endif
diff --git a/inputoutput/phdf5.cpp b/inputoutput/phdf5.cpp
index 591f54ca..3eb39397 100644
--- a/inputoutput/phdf5.cpp
+++ b/inputoutput/phdf5.cpp
@@ -1,6 +1,8 @@
 
 #include <mpi.h>
 #include "phdf5.h"
+#include "ipicdefs.h"
+#include "errors.h"
 
 PHDF5fileClass::PHDF5fileClass(string filestr, int nd, int *coord, MPI_Comm mpicomm){
 
@@ -66,7 +68,12 @@ void PHDF5fileClass::CreatePHDF5file(double *L, int *dglob, int *dlocl, bool bp)
   /* 2- Tell HDF5 that we want to use MPI-IO */
   /* --------------------------------------- */
 
+  #ifdef USING_PARALLEL_HDF5
   H5Pset_fapl_mpio(acc_t, comm, MPI_INFO_NULL);
+  #else
+  eprintf("WriteMethod==Parallel in input file "
+          "requires setting USING_PARALLEL_HDF5 in ipicdefs.h");
+  #endif
 
   /* ------------------------------------------------------- */
   /* 3- Load file identifier and release the access template */
@@ -201,7 +208,12 @@ int PHDF5fileClass::WritePHDF5dataset(string grpname, string datasetname, double
   /* --------------------------------- */
 
   dataset_xfer = H5Pcreate(H5P_DATASET_XFER);
+  #ifdef USING_PARALLEL_HDF5
   H5Pset_dxpl_mpio(dataset_xfer, H5FD_MPIO_COLLECTIVE);
+  #else
+  eprintf("WriteMethod==Parallel in input file "
+          "requires setting USING_PARALLEL_HDF5 in ipicdefs.h");
+  #endif
 
   /* ---------------------------- */
   /* 9- Write data to the dataset */

From 6c90362934902521a9a9a2fcf4e47a0a8df86090 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 16 Jul 2013 07:27:39 +0200
Subject: [PATCH 007/118] Restored implementation of TimeTasks (issue #17). 
 Some communication time for accumulating moments was being reckoned as
 calculation time.

---
 communication/ComNodes3D.cpp |  53 +++++++--------
 fields/EMfields3D.cpp        |   5 +-
 iPic3D.cpp                   |   2 -
 include/ComNodes3D.h         |   4 +-
 include/EMfields3D.h         |   2 +-
 include/Particles3D.h        |   2 +-
 main/iPic3Dlib.cpp           |  23 ++++---
 particles/Particles3D.cpp    |   4 +-
 utility/TimeTasks.cpp        | 124 +++++++++++++++++++++++++++++++++++
 9 files changed, 174 insertions(+), 45 deletions(-)
 create mode 100644 utility/TimeTasks.cpp

diff --git a/communication/ComNodes3D.cpp b/communication/ComNodes3D.cpp
index 575e8248..30cb5ed8 100644
--- a/communication/ComNodes3D.cpp
+++ b/communication/ComNodes3D.cpp
@@ -1,10 +1,11 @@
 
 #include "ComNodes3D.h"
+#include "TimeTasks.h"
 
 /** communicate ghost cells (FOR NODES) */
 void communicateNode(int nx, int ny, int nz, double ***vector, VirtualTopology3D * vct) {
+  timeTasks.start_communicate();
 
-  // timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -101,12 +102,12 @@ void communicateNode(int nx, int ny, int nz, double ***vector, VirtualTopology3D
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-  // timeTasks.addto_communicate();
 
+  timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR NODES) */
 void communicateNodeBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  // timeTasks.start_communicate();
+  timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -206,12 +207,12 @@ void communicateNodeBC(int nx, int ny, int nz, double ***vector, int bcFaceXrigh
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-  // timeTasks.addto_communicate();
 
+  timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR NODES) with particles BC*/
 void communicateNodeBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  // timeTasks.start_communicate();
+  timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -311,14 +312,14 @@ void communicateNodeBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXri
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-  // timeTasks.addto_communicate();
 
+  timeTasks.addto_communicate();
 }
 
 /** SPECIES: communicate ghost cells */
 void communicateNode(int nx, int ny, int nz, double ****vector, int ns, VirtualTopology3D * vct) {
 
-  // timeTasks.start_communicate();
+  timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -414,15 +415,15 @@ void communicateNode(int nx, int ny, int nz, double ****vector, int ns, VirtualT
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-  // timeTasks.addto_communicate();
 
+  timeTasks.addto_communicate();
 }                               // 
 
 // PARTICLES
 /** SPECIES: communicate ghost cells */
 void communicateNode_P(int nx, int ny, int nz, double ****vector, int ns, VirtualTopology3D * vct) {
+  timeTasks.start_communicate();
 
-  // timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -518,15 +519,15 @@ void communicateNode_P(int nx, int ny, int nz, double ****vector, int ns, Virtua
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-  // timeTasks.addto_communicate();
 
+  timeTasks.addto_communicate();
 }
 
 // 
 /** communicate ghost cells (FOR CENTERS) */
 void communicateCenter(int nx, int ny, int nz, double ***vector, VirtualTopology3D * vct) {
 
-  // timeTasks.start_communicate();
+  timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -621,12 +622,12 @@ void communicateCenter(int nx, int ny, int nz, double ***vector, VirtualTopology
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-  // timeTasks.addto_communicate();
 
+  timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
 void communicateCenterBoxStencilBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  // timeTasks.start_communicate();
+  timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -653,12 +654,12 @@ void communicateCenterBoxStencilBC(int nx, int ny, int nz, double ***vector, int
   delete[]ghostYleftFace;
   delete[]ghostZrightFace;
   delete[]ghostZleftFace;
-  // timeTasks.addto_communicate();
+  timeTasks.addto_communicate();
 }
 // particles
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
 void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  // timeTasks.start_communicate();
+  timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -685,14 +686,14 @@ void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, double ***vector, i
   delete[]ghostYleftFace;
   delete[]ghostZrightFace;
   delete[]ghostZleftFace;
-  // timeTasks.addto_communicate();
+  timeTasks.addto_communicate();
 }
 
 // 
 
 
 void communicateNodeBoxStencilBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  // timeTasks.start_communicate();
+  timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -719,11 +720,11 @@ void communicateNodeBoxStencilBC(int nx, int ny, int nz, double ***vector, int b
   delete[]ghostYleftFace;
   delete[]ghostZrightFace;
   delete[]ghostZleftFace;
-  // timeTasks.addto_communicate();
+  timeTasks.addto_communicate();
 }
 
 void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  // timeTasks.start_communicate();
+  timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -750,15 +751,15 @@ void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, double ***vector, int
   delete[]ghostYleftFace;
   delete[]ghostZrightFace;
   delete[]ghostZleftFace;
-  // timeTasks.addto_communicate();
+  timeTasks.addto_communicate();
 }
 
 
 
 /** SPECIES: communicate ghost cells */
 void communicateCenter(int nx, int ny, int nz, double ****vector, int ns, VirtualTopology3D * vct) {
+  timeTasks.start_communicate();
 
-  // timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -852,13 +853,13 @@ void communicateCenter(int nx, int ny, int nz, double ****vector, int ns, Virtua
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-  // timeTasks.addto_communicate();
 
+  timeTasks.addto_communicate();
 }
 // /////////// communication + BC ////////////////////////////
 void communicateCenterBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+  timeTasks.start_communicate();
 
-  // timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -955,13 +956,13 @@ void communicateCenterBC(int nx, int ny, int nz, double ***vector, int bcFaceXri
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-  // timeTasks.addto_communicate();
 
+  timeTasks.addto_communicate();
 }
 // /////////// communication + BC ////////////////////////////
 void communicateCenterBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+  timeTasks.start_communicate();
 
-  // timeTasks.start_communicate();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -1058,6 +1059,6 @@ void communicateCenterBC_P(int nx, int ny, int nz, double ***vector, int bcFaceX
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-  // timeTasks.addto_communicate();
 
+  timeTasks.addto_communicate();
 }
diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index bca24228..28de6676 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -1,6 +1,7 @@
 
 #include <mpi.h>
 #include "EMfields3D.h"
+#include "TimeTasks.h"
 
 /*! constructor */
 EMfields3D::EMfields3D(Collective * col, Grid * grid) {
@@ -1072,6 +1073,8 @@ void EMfields3D::interpDensitiesN2C(VirtualTopology3D * vct, Grid * grid) {
 /*! communicate ghost for grid -> Particles interpolation */
 void EMfields3D::communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, VirtualTopology3D * vct) {
   // interpolate adding common nodes among processors
+  timeTasks.start_communicate();
+
   communicateInterp(nxn, nyn, nzn, ns, rhons, 0, 0, 0, 0, 0, 0, vct);
   communicateInterp(nxn, nyn, nzn, ns, Jxs, 0, 0, 0, 0, 0, 0, vct);
   communicateInterp(nxn, nyn, nzn, ns, Jys, 0, 0, 0, 0, 0, 0, vct);
@@ -1085,6 +1088,7 @@ void EMfields3D::communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft,
   // calculate the correct densities on the boundaries
   adjustNonPeriodicDensities(ns, vct);
   // put the correct values on ghost cells
+  timeTasks.addto_communicate();
 
   communicateNode_P(nxn, nyn, nzn, rhons, ns, vct);
   communicateNode_P(nxn, nyn, nzn, Jxs, ns, vct);
@@ -1096,7 +1100,6 @@ void EMfields3D::communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft,
   communicateNode_P(nxn, nyn, nzn, pYYsn, ns, vct);
   communicateNode_P(nxn, nyn, nzn, pYZsn, ns, vct);
   communicateNode_P(nxn, nyn, nzn, pZZsn, ns, vct);
-
 }
 
 
diff --git a/iPic3D.cpp b/iPic3D.cpp
index 5e31c535..d670264c 100644
--- a/iPic3D.cpp
+++ b/iPic3D.cpp
@@ -2,7 +2,6 @@
 #include <mpi.h>
 #include <iomanip>
 #include "iPic3D.h"
-#include "debug.h"
 
 using namespace iPic3D;
 
@@ -12,7 +11,6 @@ int main(int argc, char **argv) {
   bool b_err = false;
 
   MPIdata::init(&argc, &argv);
-  //dprintf("MPI has been initialized.");
   KCode.Init(argc, argv);
 
   for (int i = KCode.FirstCycle(); i < KCode.LastCycle(); i++) {
diff --git a/include/ComNodes3D.h b/include/ComNodes3D.h
index 3b465c49..8e1636e5 100644
--- a/include/ComNodes3D.h
+++ b/include/ComNodes3D.h
@@ -11,9 +11,9 @@ developers           : Stefano Markidis, Giovanni Lapenta
 #define ComNodes_H
 
 #include "ComBasic3D.h"
-#include "TimeTasks.h"
+//#include "TimeTasks.h"
 
-extern TimeTasks timeTasks;
+//extern TimeTasks timeTasks;
 
 // boundary condition for fields
 #include "BcFields3D.h"
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 23ecdeea..9faca505 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -21,7 +21,7 @@
 #include "Collective.h"
 #include "ComNodes3D.h"
 #include "ComInterpNodes3D.h"
-#include "TimeTasks.h"
+//#include "TimeTasks.h"
 #include "asserts.h"
 #include "BCStructure.h"
 
diff --git a/include/Particles3D.h b/include/Particles3D.h
index 8d701c35..2c178918 100644
--- a/include/Particles3D.h
+++ b/include/Particles3D.h
@@ -8,7 +8,7 @@ developers: Stefano Markidis, Enrico Camporeale, Giovanni Lapenta, David Burgess
 #define Part2D_H
 
 #include "Particles3Dcomm.h"
-#include "TimeTasks.h"
+//#include "TimeTasks.h"
 
 /**
  * 
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 54e8495a..6cd4531e 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -1,5 +1,6 @@
 
 #include "iPic3D.h"
+#include "TimeTasks.h"
 
 using namespace iPic3D;
 MPIdata* iPic3D::c_Solver::mpi=0;
@@ -164,15 +165,17 @@ int c_Solver::Init(int argc, char **argv) {
 
 void c_Solver::CalculateField() {
 
-  // timeTasks.resetCycle();
+  timeTasks.resetCycle();
   // interpolation
-  // timeTasks.start(TimeTasks::MOMENTS);
+  timeTasks.start(TimeTasks::MOMENTS);
 
   EMf->updateInfoFields(grid,vct,col);
   EMf->setZeroDensities();                  // set to zero the densities
 
   for (int i = 0; i < ns; i++)
+  {
     part[i].interpP2G(EMf, grid, vct);      // interpolate Particles to Grid(Nodes)
+  }
 
   EMf->sumOverSpecies(vct);                 // sum all over the species
 
@@ -188,12 +191,12 @@ void c_Solver::CalculateField() {
   EMf->interpDensitiesN2C(vct, grid);       // calculate densities on centers from nodes
   EMf->calculateHatFunctions(grid, vct);    // calculate the hat quantities for the implicit method
   MPI_Barrier(MPI_COMM_WORLD);
-  // timeTasks.end(TimeTasks::MOMENTS);
+  timeTasks.end(TimeTasks::MOMENTS);
 
   // MAXWELL'S SOLVER
-  // timeTasks.start(TimeTasks::FIELDS);
+  timeTasks.start(TimeTasks::FIELDS);
   EMf->calculateE(grid, vct, col);               // calculate the E field
-  // timeTasks.end(TimeTasks::FIELDS);
+  timeTasks.end(TimeTasks::FIELDS);
 
 }
 
@@ -203,13 +206,13 @@ bool c_Solver::ParticlesMover() {
   /*  Particle mover */
   /*  -------------- */
 
-  // timeTasks.start(TimeTasks::PARTICLES);
+  timeTasks.start(TimeTasks::PARTICLES);
   for (int i = 0; i < ns; i++)  // move each species
   {
     // #pragma omp task inout(part[i]) in(grid) target_device(booster)
     mem_avail = part[i].mover_PC(grid, vct, EMf); // use the Predictor Corrector scheme 
   }
-  // timeTasks.end(TimeTasks::PARTICLES);
+  timeTasks.end(TimeTasks::PARTICLES);
 
   if (mem_avail < 0) {          // not enough memory space allocated for particles: stop the simulation
     if (myrank == 0) {
@@ -252,12 +255,12 @@ bool c_Solver::ParticlesMover() {
   /* This step must be taken out of here! */
   /* --------------------- */
 
-  // timeTasks.start(TimeTasks::BFIELD);
+  timeTasks.start(TimeTasks::BFIELD);
   EMf->calculateB(grid, vct, col);   // calculate the B field
-  // timeTasks.end(TimeTasks::BFIELD);
+  timeTasks.end(TimeTasks::BFIELD);
 
   // print out total time for all tasks
-  // timeTasks.print_cycle_times();
+  timeTasks.print_cycle_times();
   return (false);
 
 }
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index a2a7f05c..4dfeb724 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -510,7 +510,7 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
   // ********************//
   // COMMUNICATION 
   // *******************//
-  // timeTasks.start_communicate();
+  timeTasks.start_communicate();
   const int avail = communicate(vct);
   if (avail < 0)
     return (-1);
@@ -523,7 +523,7 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       return (-1);
     MPI_Barrier(MPI_COMM_WORLD);
   }
-  // timeTasks.addto_communicate();
+  timeTasks.addto_communicate();
   return (0);                   // exit succcesfully (hopefully) 
 }
 
diff --git a/utility/TimeTasks.cpp b/utility/TimeTasks.cpp
new file mode 100644
index 00000000..69f56fb7
--- /dev/null
+++ b/utility/TimeTasks.cpp
@@ -0,0 +1,124 @@
+
+#include <mpi.h>
+#include <stdarg.h>
+#include "TimeTasks.h"
+#include "asserts.h"
+#include "MPIdata.h" // for get_rank
+
+/** implementation of declarations in utility/TimeTasks.h **/
+
+TimeTasks timeTasks;
+
+void TimeTasks::resetCycle()
+{
+  for(int e=0;e<LAST;e++)
+  {
+    //compute[e]=0.;
+    start_times[e]=0.;
+    task_duration[e]=0.;
+    communicate[e]=0.;
+  }
+  active_task=NONE;
+  active_mode=COMPUTATION;
+  t_start_communicate = 0.;
+}
+void TimeTasks::start(int taskid)
+{
+  assert_eq(active_task+1,taskid);
+  active_task = taskid;
+  double now = MPI_Wtime();
+  start_times[active_task] = now;
+}
+void TimeTasks::end(int taskid)
+{
+  assert_eq(taskid,active_task);
+  double now = MPI_Wtime();
+  task_duration[active_task] = now - start_times[active_task];
+  compute[active_task] = task_duration[active_task]-communicate[active_task];
+}
+void TimeTasks::start_communicate()
+{
+  if(!active_task) return;
+  assert_eq(active_mode,COMPUTATION);
+  t_start_communicate = MPI_Wtime();
+  active_mode=COMMUNICATION;
+}
+void TimeTasks::addto_communicate()
+{
+  if(!active_task) return;
+  assert_eq(active_mode,COMMUNICATION);
+  assert_ne(t_start_communicate,0.);
+  communicate[active_task] += MPI_Wtime()-t_start_communicate;
+  t_start_communicate = 0.;
+  active_mode=COMPUTATION;
+}
+#define TIMING_PREFIX "| "
+void TimeTasks::print_cycle_times()
+{
+  // we could report average for all processes
+  if(!MPIdata::get_rank())
+  {
+    fflush(stdout);
+    fprintf(stdout,"=== times for cycle for rank %d) === \n",
+      MPIdata::get_rank());
+    fprintf(stdout, TIMING_PREFIX
+      "moms flds pcls Bfld cycl\n");
+    fprintf(stdout, TIMING_PREFIX
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f (total time)\n",
+      get_time(TimeTasks::MOMENTS),
+      get_time(TimeTasks::FIELDS),
+      get_time(TimeTasks::PARTICLES),
+      get_time(TimeTasks::BFIELD),
+      get_time()
+      );
+    fprintf(stdout, TIMING_PREFIX
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f (communication)\n",
+      get_communicate(TimeTasks::MOMENTS),
+      get_communicate(TimeTasks::FIELDS),
+      get_communicate(TimeTasks::PARTICLES),
+      get_communicate(TimeTasks::BFIELD),
+      get_communicate()
+      );
+    fprintf(stdout, TIMING_PREFIX
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f (computation)\n",
+      get_compute(TimeTasks::MOMENTS),
+      get_compute(TimeTasks::FIELDS),
+      get_compute(TimeTasks::PARTICLES),
+      get_compute(TimeTasks::BFIELD),
+      get_compute()
+      );
+    //fprintf(stdout, TIMING_PREFIX
+    //  "MOMS comm  FLDS comm  PCLS comm  CYCL comm\n");
+    //fprintf(stdout, TIMING_PREFIX
+    //  "%4.2f "
+    //  "%4.2f  "
+    //  "%4.2f "
+    //  "%4.2f  "
+    //  "%4.2f "
+    //  "%4.2f  "
+    //  "%4.2f "
+    //  "%4.2f\n",
+    //  get_time(TimeTasks::MOMENTS),
+    //  get_communicate(TimeTasks::MOMENTS),
+    //  get_time(TimeTasks::FIELDS),
+    //  get_communicate(TimeTasks::FIELDS),
+    //  get_time(TimeTasks::PARTICLES),
+    //  get_communicate(TimeTasks::PARTICLES),
+    //  get_time(),
+    //  get_communicate()
+    //  );
+    fflush(stdout);
+  }
+}

From b8a1a585a704f1e93160d5053f030d07bc74dc4b Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 16 Jul 2013 08:00:23 +0200
Subject: [PATCH 008/118] issue #27: rhocs last dimension is allocated
 incorrectly in EMfields3D.cpp

---
 fields/EMfields3D.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 28de6676..1dc44a12 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -119,7 +119,7 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) {
   Jz_ext = newArr3(double,nxn,nyn,nzn);
   // involving species
   rhons = newArr4(double, ns, nxn, nyn, nzn);
-  rhocs = newArr4(double, ns, nxc, nyc, nzn);
+  rhocs = newArr4(double, ns, nxc, nyc, nzc);
   Jxs = newArr4(double, ns, nxn, nyn, nzn);
   Jys = newArr4(double, ns, nxn, nyn, nzn);
   Jzs = newArr4(double, ns, nxn, nyn, nzn);

From c201c8c3b72622f58818336735ec843fba967cd3 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 16 Jul 2013 10:34:25 +0200
Subject: [PATCH 009/118] issue #35: making MPI_Barrier() a no-op

---
 communication/ComInterpNodes3D.cpp | 1 +
 communication/ComNodes3D.cpp       | 1 +
 include/ComParticles3D.h           | 1 +
 include/ipicdefs.h                 | 3 +++
 main/iPic3Dlib.cpp                 | 1 +
 particles/Particles3D.cpp          | 1 +
 performances/Timing.cpp            | 1 +
 7 files changed, 9 insertions(+)

diff --git a/communication/ComInterpNodes3D.cpp b/communication/ComInterpNodes3D.cpp
index b03b308b..1e24dd73 100644
--- a/communication/ComInterpNodes3D.cpp
+++ b/communication/ComInterpNodes3D.cpp
@@ -1,5 +1,6 @@
 
 #include "ComInterpNodes3D.h"
+#include "ipicdefs.h"
 
 /** communicate ghost cells and sum the contribution with a index indicating the number of species*/
 void communicateInterp(int nx, int ny, int nz, int ns, double ****vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
diff --git a/communication/ComNodes3D.cpp b/communication/ComNodes3D.cpp
index 30cb5ed8..977494f4 100644
--- a/communication/ComNodes3D.cpp
+++ b/communication/ComNodes3D.cpp
@@ -1,6 +1,7 @@
 
 #include "ComNodes3D.h"
 #include "TimeTasks.h"
+#include "ipicdefs.h"
 
 /** communicate ghost cells (FOR NODES) */
 void communicateNode(int nx, int ny, int nz, double ***vector, VirtualTopology3D * vct) {
diff --git a/include/ComParticles3D.h b/include/ComParticles3D.h
index da7c3470..76090d45 100644
--- a/include/ComParticles3D.h
+++ b/include/ComParticles3D.h
@@ -10,6 +10,7 @@ developers           : Stefano Markidis, Giovanni Lapenta
 #define ComParticles3D_H
 
 #include "MPIdata.h"
+#include "ipicdefs.h"
 #include "ComBasic3D.h"
 
 /** comunicate particles and receive particles to and from 6 processors */
diff --git a/include/ipicdefs.h b/include/ipicdefs.h
index d9b7720c..dfa44969 100644
--- a/include/ipicdefs.h
+++ b/include/ipicdefs.h
@@ -4,4 +4,7 @@
 // uncomment the following line to use parallel hdf5
 //#define USING_PARALLEL_HDF5
 
+// use precprocessor to remove MPI_Barrier() calls.
+#define MPI_Barrier(args...)
+
 #endif
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 6cd4531e..418fc8ce 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -1,6 +1,7 @@
 
 #include "iPic3D.h"
 #include "TimeTasks.h"
+#include "ipicdefs.h"
 
 using namespace iPic3D;
 MPIdata* iPic3D::c_Solver::mpi=0;
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 4dfeb724..357f113a 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -19,6 +19,7 @@ developers: Stefano Markidis, Giovanni Lapenta
 #include "Grid3DCU.h"
 #include "Field.h"
 #include "MPIdata.h"
+#include "ipicdefs.h"
 #include "TimeTasks.h"
 
 #include "Particles3D.h"
diff --git a/performances/Timing.cpp b/performances/Timing.cpp
index 6b3eec48..e639fd88 100644
--- a/performances/Timing.cpp
+++ b/performances/Timing.cpp
@@ -1,5 +1,6 @@
 
 #include "Timing.h"
+#include "ipicdefs.h"
 
 /**
  * 

From 7e4d4d1b6bc04aae6bb92c594443d6b37a22daeb Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 16 Jul 2013 11:45:40 +0200
Subject: [PATCH 010/118] issue #36: use int rather than long long in critical
 particle loops

---
 particles/Particles3D.cpp     | 5 ++++-
 particles/Particles3Dcomm.cpp | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 357f113a..397a1c62 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -8,6 +8,8 @@ developers: Stefano Markidis, Giovanni Lapenta
 #include <mpi.h>
 #include <iostream>
 #include <math.h>
+#include <limits.h>
+#include "asserts.h"
 
 #include "VirtualTopology3D.h"
 #include "VCtopology3D.h"
@@ -329,12 +331,13 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
 
   const double dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
   const double inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
+  assert_le(nop,INT_MAX); // else would need to use long long
   // don't bother trying to push any particles simultaneously;
   // MIC already does vectorization automatically, and trying
   // to do it by hand only hurts performance.
 #pragma omp parallel for
 #pragma simd                    // this just slows things down (why?)
-  for (long long rest = 0; rest < nop; rest++) {
+  for (int rest = 0; rest < nop; rest++) {
     // copy the particle
     double xp = x[rest];
     double yp = y[rest];
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index db0ec204..1a799dd7 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -7,6 +7,8 @@ developers: Stefano Markidis, Giovanni Lapenta.
 #include <mpi.h>
 #include <iostream>
 #include <math.h>
+#include <limits.h>
+#include "asserts.h"
 #include "VirtualTopology3D.h"
 #include "VCtopology3D.h"
 #include "CollectiveIO.h"
@@ -305,7 +307,8 @@ void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vc
     //Moments speciesMoments(nxn,nyn,nzn,invVOL);
     //speciesMoments.set_to_zero();
     //#pragma omp for
-    for (register long long i = 0; i < nop; i++)
+    assert_le(nop,INT_MAX); // else would need to use long long
+    for (int i = 0; i < nop; i++)
     {
       const int ix = 2 + int (floor((x[i] - xstart) * inv_dx));
       const int iy = 2 + int (floor((y[i] - ystart) * inv_dy));

From 23f510cba5e2b162285efe7ffb51293135b8b9bd Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 16 Jul 2013 19:51:43 +0200
Subject: [PATCH 011/118] fixed typo in TimeTask output introduced in commit
 6c903629

---
 utility/TimeTasks.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utility/TimeTasks.cpp b/utility/TimeTasks.cpp
index 69f56fb7..b48bb070 100644
--- a/utility/TimeTasks.cpp
+++ b/utility/TimeTasks.cpp
@@ -59,7 +59,7 @@ void TimeTasks::print_cycle_times()
   if(!MPIdata::get_rank())
   {
     fflush(stdout);
-    fprintf(stdout,"=== times for cycle for rank %d) === \n",
+    fprintf(stdout,"=== times for cycle for rank %d === \n",
       MPIdata::get_rank());
     fprintf(stdout, TIMING_PREFIX
       "moms flds pcls Bfld cycl\n");

From 3687a02412be2a660aab226a059a86f6457cfb2e Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 16 Jul 2013 19:39:25 +0200
Subject: [PATCH 012/118] Local omp.h was conflicting with system omp.h

---
 include/ompdefs.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 include/ompdefs.h

diff --git a/include/ompdefs.h b/include/ompdefs.h
new file mode 100644
index 00000000..5c5fab00
--- /dev/null
+++ b/include/ompdefs.h
@@ -0,0 +1,25 @@
+#ifndef ompdefs_H
+#define ompdefs_H
+
+#include <stdio.h>
+#include "asserts.h"
+// the compiler sets _OPENMP if the -openmp flag is used
+#ifdef _OPENMP
+#include <omp.h>
+#else
+inline int omp_get_thread_num() {
+    return 0;
+}
+#endif
+
+inline int omp_thread_count() {
+    int n = 0;
+    #pragma omp parallel reduction(+:n)
+    n += 1;
+    #ifndef _OPENMP // USING_OMP
+    assert_eq(n,1);
+    #endif
+    return n;
+}
+
+#endif

From bb6ec5f19aa1ce565e8939500b268d0eb029452d Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 16 Jul 2013 19:36:55 +0200
Subject: [PATCH 013/118] issue #33: making grid accessors inline again

---
 grids/Grid3DCU.cpp | 114 ---------------------------------------------
 include/Grid3DCU.h |  71 ++++++++++------------------
 2 files changed, 24 insertions(+), 161 deletions(-)

diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index c2b92797..973b20f8 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -413,117 +413,3 @@ void Grid3DCU::interpN2C(double ****vecFieldC, int ns, double ****vecFieldN) {
 }
 
 
-/** get nxc */
-int Grid3DCU::getNXC() {
-  return (nxc);
-}
-
-/** get nxn */
-int Grid3DCU::getNXN() {
-  return (nxn);
-}
-
-/** get nyc */
-int Grid3DCU::getNYC() {
-  return (nyc);
-}
-
-/** get nyn */
-int Grid3DCU::getNYN() {
-  return (nyn);
-}
-
-/** get nzc */
-int Grid3DCU::getNZC() {
-  return (nzc);
-}
-
-/** get nzn */
-int Grid3DCU::getNZN() {
-  return (nzn);
-}
-
-/** get dx */
-double Grid3DCU::getDX() {
-  return (dx);
-}
-
-/** get dy */
-double Grid3DCU::getDY() {
-  return (dy);
-}
-
-/** get dz */
-double Grid3DCU::getDZ() {
-  return (dz);
-}
-
-/** get xn[][][] */
-double &Grid3DCU::getXN(int indexX, int indexY, int indexZ) {
-  return (node_coordinate[indexX][indexY][indexZ][0]);
-}
-
-/** get yn[][][] */
-double &Grid3DCU::getYN(int indexX, int indexY, int indexZ) {
-  return (node_coordinate[indexX][indexY][indexZ][1]);
-}
-
-/** get zn[][][] */
-double &Grid3DCU::getZN(int indexX, int indexY, int indexZ) {
-  return (node_coordinate[indexX][indexY][indexZ][2]);
-}
-
-/** get xc[][][] */
-double &Grid3DCU::getXC(int indexX, int indexY, int indexZ) {
-  return (center_coordinate[indexX][indexY][indexZ][0]);
-}
-
-/** get yc[][][] */
-double &Grid3DCU::getYC(int indexX, int indexY, int indexZ) {
-  return (center_coordinate[indexX][indexY][indexZ][1]);
-}
-
-/** get zc[][][] */
-double &Grid3DCU::getZC(int indexX, int indexY, int indexZ) {
-  return (center_coordinate[indexX][indexY][indexZ][2]);
-}
-
-/** get the whole vector of nodes*/
-double ****Grid3DCU::getN() {
-  return node_coordinate;
-}
-
-/** get Xstart */
-double Grid3DCU::getXstart() {
-  return (xStart);
-}
-
-/** get Xend */
-double Grid3DCU::getXend() {
-  return (xEnd);
-}
-
-/** get Ystart */
-double Grid3DCU::getYstart() {
-  return (yStart);
-}
-
-/** get Yend */
-double Grid3DCU::getYend() {
-  return (yEnd);
-}
-
-/** get Zstart */
-double Grid3DCU::getZstart() {
-  return (zStart);
-}
-
-/** get Zend */
-double Grid3DCU::getZend() {
-  return (zEnd);
-}
-
-/** get the inverse of volume */
-double Grid3DCU::getInvVOL() {
-  return (invVOL);
-}
diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index 819c041e..f085ea34 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -81,53 +81,6 @@ class Grid3DCU                  // :public Grid
   /** interpolate on central points from nodes */
   void interpN2C(double ****vecFieldC, int ns, double ****vecFieldN);
 
-  /** return nxc */
-  int getNXC();
-  /** return nxn */
-  int getNXN();
-  /** return nyc */
-  int getNYC();
-  /** return nyn */
-  int getNYN();
-  /** return nzc */
-  int getNZC();
-  /** return nzn */
-  int getNZN();
-  /** return dx */
-  double getDX();
-  /** return dy */
-  double getDY();
-  /** return dz */
-  double getDZ();
-  /** get xn(X,Y,Z) */
-  double &getXN(int indexX, int indexY, int indexZ);
-  /** get yn(X,Y,Z) */
-  double &getYN(int indexX, int indexY, int indexZ);
-  /** get zn(X,Y,Z) */
-  double &getZN(int indexX, int indexY, int indexZ);
-  /** get the whole vector of nodes*/
-  double ****getN();
-  /** get xc(X,Y,Z) */
-  double &getXC(int indexX, int indexY, int indexZ);
-  /** get yc(X,Y,Z) */
-  double &getYC(int indexX, int indexY, int indexZ);
-  /** get zc(X,Y,Z) */
-  double &getZC(int indexX, int indexY, int indexZ);
-  /** get Xstart */
-  double getXstart();
-  /** get Xend */
-  double getXend();
-  /** get Ystart */
-  double getYstart();
-  /** get Yend */
-  double getYend();
-  /** get Zstart */
-  double getZstart();
-  /** get Zend */
-  double getZend();
-  /** get the inverse of volume */
-  double getInvVOL();
-
   // /////////// PRIVATE VARIABLES //////////////
 private:
   /** number of cells - X direction, including + 2 (guard cells) */
@@ -163,6 +116,30 @@ class Grid3DCU                  // :public Grid
   /** local grid boundaries coordinate  */
   double xStart, xEnd, yStart, yEnd, zStart, zEnd;
 
+public: // accessors (inline)
+  int getNXC() { return (nxc); }
+  int getNXN() { return (nxn); }
+  int getNYC() { return (nyc); }
+  int getNYN() { return (nyn); }
+  int getNZC() { return (nzc); }
+  int getNZN() { return (nzn); }
+  double getDX() { return (dx); }
+  double getDY() { return (dy); }
+  double getDZ() { return (dz); }
+  double &getXN(int X, int Y, int Z) { return (node_coordinate[X][Y][Z][0]); }
+  double &getYN(int X, int Y, int Z) { return (node_coordinate[X][Y][Z][1]); }
+  double &getZN(int X, int Y, int Z) { return (node_coordinate[X][Y][Z][2]); }
+  double &getXC(int X, int Y, int Z) { return (center_coordinate[X][Y][Z][0]); }
+  double &getYC(int X, int Y, int Z) { return (center_coordinate[X][Y][Z][1]); }
+  double &getZC(int X, int Y, int Z) { return (center_coordinate[X][Y][Z][2]); }
+  double ****getN() { return node_coordinate; }
+  double getXstart() { return (xStart); }
+  double getXend() { return (xEnd); }
+  double getYstart() { return (yStart); }
+  double getYend() { return (yEnd); } 
+  double getZstart() { return (zStart); }
+  double getZend() { return (zEnd); }
+  double getInvVOL() { return (invVOL); }
 };
 
 typedef Grid3DCU Grid;

From 2bcd7506d85a4374abe068152133ba409e1d813b Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 16 Jul 2013 16:03:24 +0200
Subject: [PATCH 014/118] issues #23 and issue #33: using one preallocated
 Moments instance per thread

---
 fields/EMfields3D.cpp         | 180 +++-------------------------
 fields/Moments.cpp            |  53 +++++++++
 include/EMfields3D.h          | 216 +++++++++++++---------------------
 include/Moments.h             | 182 ++++++++++++++++++++++++++++
 include/ipicdefs.h            |   3 +
 particles/Particles3Dcomm.cpp |  36 +++---
 6 files changed, 355 insertions(+), 315 deletions(-)
 create mode 100644 fields/Moments.cpp
 create mode 100644 include/Moments.h

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 1dc44a12..c6c5936d 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -2,6 +2,8 @@
 #include <mpi.h>
 #include "EMfields3D.h"
 #include "TimeTasks.h"
+#include "Moments.h"
+#include "ompdefs.h"
 
 /*! constructor */
 EMfields3D::EMfields3D(Collective * col, Grid * grid) {
@@ -162,6 +164,13 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) {
   vectY = newArr3(double, nxn, nyn, nzn);
   vectZ = newArr3(double, nxn, nyn, nzn);
   divC = newArr3(double, nxc, nyc, nzc);
+  sizeMomentsArray = omp_thread_count();
+  momentsArray = new Moments*[sizeMomentsArray];
+  for(int i=0;i<sizeMomentsArray;i++)
+  {
+    momentsArray[i] = new Moments;
+    momentsArray[i]->init(nxn,nyn,nzn,invVOL);
+  }
 }
 
 /*! Calculate Electric field with the implicit solver: the Maxwell solver method is called here */
@@ -1102,98 +1111,7 @@ void EMfields3D::communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft,
   communicateNode_P(nxn, nyn, nzn, pZZsn, ns, vct);
 }
 
-
-/** add an amount of charge density to charge density field at node X,Y */
-void Moments::addRho(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        rho[X - i][Y - j][Z - k] += temp;
-      }
-}
-/** add an amount of charge density to current density - direction X to current density field on the node*/
-void Moments::addJx(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        Jx[X - i][Y - j][Z - k] += temp;
-      }
-}
-/** add an amount of current density - direction Y to current density field on the node */
-void Moments::addJy(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        Jy[X - i][Y - j][Z - k] += temp;
-      }
-}
-/** add an amount of current density - direction Z to current density field on the node */
-void Moments::addJz(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        Jz[X - i][Y - j][Z - k] += temp;
-      }
-}
-/** add an amount of pressure density - direction XX to current density field on the node */
-void Moments::addPxx(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        pXX[X - i][Y - j][Z - k] += temp;
-      }
-}
-/** add an amount of pressure density - direction XY to current density field on the node*/
-void Moments::addPxy(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        pXY[X - i][Y - j][Z - k] += temp;
-      }
-}
-/** add an amount of pressure density - direction XZ to current density field on the node */
-void Moments::addPxz(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        pXZ[X - i][Y - j][Z - k] += temp;
-      }
-}
-/** add an amount of pressure density - direction YY to current density field on the node*/
-void Moments::addPyy(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        pYY[X - i][Y - j][Z - k] += temp;
-      }
-}
-/** add an amount of pressure density - direction YZ to current density field on the node */
-void Moments::addPyz(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        pYZ[X - i][Y - j][Z - k] += temp;
-      }
-}
-/** add an amount of pressure density - direction ZZ to current density field on the node */
-void Moments::addPzz(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        pZZ[X - i][Y - j][Z - k] += temp;
-      }
-}
-
+/* add moments (e.g. from an OpenMP thread) to the accumulated moments */
 void EMfields3D::addToSpeciesMoments(const Moments & in, int is) {
   assert_eq(in.get_nx(), nxn);
   assert_eq(in.get_ny(), nyn);
@@ -1215,79 +1133,6 @@ void EMfields3D::addToSpeciesMoments(const Moments & in, int is) {
   }
 }
 
-/*! add an amount of charge density to charge density field at node X,Y */
-void EMfields3D::addRho(double weight[][2][2], int X, int Y, int Z, int is) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        rhons[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
-}
-/*! add an amount of charge density to current density - direction X to current density field on the node */
-void EMfields3D::addJx(double weight[][2][2], int X, int Y, int Z, int is) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        Jxs[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
-}
-/*! add an amount of current density - direction Y to current density field on the node */
-void EMfields3D::addJy(double weight[][2][2], int X, int Y, int Z, int is) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        Jys[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
-}
-/*! add an amount of current density - direction Z to current density field on the node */
-void EMfields3D::addJz(double weight[][2][2], int X, int Y, int Z, int is) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        Jzs[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
-}
-/*! add an amount of pressure density - direction XX to current density field on the node */
-void EMfields3D::addPxx(double weight[][2][2], int X, int Y, int Z, int is) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        pXXsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
-}
-/*! add an amount of pressure density - direction XY to current density field on the node */
-void EMfields3D::addPxy(double weight[][2][2], int X, int Y, int Z, int is) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        pXYsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
-}
-/*! add an amount of pressure density - direction XZ to current density field on the node */
-void EMfields3D::addPxz(double weight[][2][2], int X, int Y, int Z, int is) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        pXZsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
-}
-/*! add an amount of pressure density - direction YY to current density field on the node */
-void EMfields3D::addPyy(double weight[][2][2], int X, int Y, int Z, int is) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        pYYsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
-}
-/*! add an amount of pressure density - direction YZ to current density field on the node */
-void EMfields3D::addPyz(double weight[][2][2], int X, int Y, int Z, int is) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        pYZsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
-}
-/*! add an amount of pressure density - direction ZZ to current density field on the node */
-void EMfields3D::addPzz(double weight[][2][2], int X, int Y, int Z, int is) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        pZZsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
-}
-
-
-
 /*! set to 0 all the densities fields */
 void EMfields3D::setZeroDensities() {
   for (register int i = 0; i < nxn; i++)
@@ -3362,4 +3207,9 @@ EMfields3D::~EMfields3D() {
   delArr3(vectY, nxn, nyn);
   delArr3(vectZ, nxn, nyn);
   delArr3(divC, nxc, nyc);
+  for(int i=0;i<sizeMomentsArray;i++)
+  {
+    delete momentsArray[i];
+  }
+  delete [] momentsArray;
 }
diff --git a/fields/Moments.cpp b/fields/Moments.cpp
new file mode 100644
index 00000000..846ed9e6
--- /dev/null
+++ b/fields/Moments.cpp
@@ -0,0 +1,53 @@
+#include "Moments.h"
+#include "Alloc.h"
+
+// allocate the moment arrays for an nx_ x ny_ x nz_ grid (contents are not zeroed)
+void Moments::init(int nx_, int ny_, int nz_, double invVOL_) {
+  nx = nx_;
+  ny = ny_;
+  nz = nz_;
+  invVOL = invVOL_;
+  rho = newArr3(double, nx, ny, nz);
+  Jx = newArr3(double, nx, ny, nz);
+  Jy = newArr3(double, nx, ny, nz);
+  Jz = newArr3(double, nx, ny, nz);
+  pXX = newArr3(double, nx, ny, nz);
+  pXY = newArr3(double, nx, ny, nz);
+  pXZ = newArr3(double, nx, ny, nz);
+  pYY = newArr3(double, nx, ny, nz);
+  pYZ = newArr3(double, nx, ny, nz);
+  pZZ = newArr3(double, nx, ny, nz);
+}
+
+Moments::~Moments() {
+  // nodes and species
+  delArr3(rho, nx, ny);
+  delArr3(Jx, nx, ny);
+  delArr3(Jy, nx, ny);
+  delArr3(Jz, nx, ny);
+  delArr3(pXX, nx, ny);
+  delArr3(pXY, nx, ny);
+  delArr3(pXZ, nx, ny);
+  delArr3(pYY, nx, ny);
+  delArr3(pYZ, nx, ny);
+  delArr3(pZZ, nx, ny);
+}
+
+void Moments::set_to_zero() {
+  // #pragma omp parallel for collapse(1)
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++) {
+        rho[i][j][k] = 0.0;
+        Jx[i][j][k] = 0.0;
+        Jy[i][j][k] = 0.0;
+        Jz[i][j][k] = 0.0;
+        pXX[i][j][k] = 0.0;
+        pXY[i][j][k] = 0.0;
+        pXZ[i][j][k] = 0.0;
+        pYY[i][j][k] = 0.0;
+        pYZ[i][j][k] = 0.0;
+        pZZ[i][j][k] = 0.0;
+      }
+}
+
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 9faca505..2e2d9bd6 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -3,7 +3,6 @@
 #ifndef EMfields3D_H
 #define EMfields3D_H
 
-
 #include <iostream>
 #include <sstream>
 
@@ -31,140 +30,7 @@ using std::endl;
 
 /*! Electromagnetic fields and sources defined for each local grid, and for an implicit maxwell's solver @date May 2008 @par Copyright: (C) 2008 KUL @author Stefano Markidis, Giovanni Lapenta. @version 3.0 */
 
-// class to accumulate node-centered species moments
-// 
-class Moments {
-  private:
-    double invVOL;
-    double ***rho;
-
-    /** current density, defined on nodes */
-    double ***Jx;
-    double ***Jy;
-    double ***Jz;
-
-    /** pressure tensor components, defined on nodes */
-    double ***pXX;
-    double ***pXY;
-    double ***pXZ;
-    double ***pYY;
-    double ***pYZ;
-    double ***pZZ;
-    int nx;
-    int ny;
-    int nz;
-  public:
-    int get_nx() const {
-      return nx;
-    }
-    int get_ny() const {
-      return ny;
-    }
-    int get_nz() const {
-      return nz;
-    }
-    double get_invVOL() const {
-      return invVOL;
-    }
-    double get_rho(int i, int j, int k) const {
-      return rho[i][j][k];
-    }
-    double get_Jx(int i, int j, int k) const {
-      return Jx[i][j][k];
-    }
-    double get_Jy(int i, int j, int k) const {
-      return Jy[i][j][k];
-    }
-    double get_Jz(int i, int j, int k) const {
-      return Jz[i][j][k];
-    }
-    double get_pXX(int i, int j, int k) const {
-      return pXX[i][j][k];
-    }
-    double get_pXY(int i, int j, int k) const {
-      return pXY[i][j][k];
-    }
-    double get_pXZ(int i, int j, int k) const {
-      return pXZ[i][j][k];
-    }
-    double get_pYY(int i, int j, int k) const {
-      return pYY[i][j][k];
-    }
-    double get_pYZ(int i, int j, int k) const {
-      return pYZ[i][j][k];
-    }
-    double get_pZZ(int i, int j, int k) const {
-      return pZZ[i][j][k];
-    }
-  public:
-    Moments() {
-    };
-    Moments(int nx_, int ny_, int nz_, double invVOL_);
-    ~Moments();
-    void set_to_zero();
-    void addRho(double weight[][2][2], int X, int Y, int Z);
-    void addJx(double weight[][2][2], int X, int Y, int Z);
-    void addJy(double weight[][2][2], int X, int Y, int Z);
-    void addJz(double weight[][2][2], int X, int Y, int Z);
-
-    void addPxx(double weight[][2][2], int X, int Y, int Z);
-    void addPxy(double weight[][2][2], int X, int Y, int Z);
-    void addPxz(double weight[][2][2], int X, int Y, int Z);
-    void addPyy(double weight[][2][2], int X, int Y, int Z);
-    void addPyz(double weight[][2][2], int X, int Y, int Z);
-    void addPzz(double weight[][2][2], int X, int Y, int Z);
-};
-
-// construct empty instance (not zeroed)
-inline Moments::Moments(int nx_, int ny_, int nz_, double invVOL_) {
-  nx = nx_;
-  ny = ny_;
-  nz = nz_;
-  invVOL = invVOL_;
-  rho = newArr3(double, nx, ny, nz);
-  Jx = newArr3(double, nx, ny, nz);
-  Jy = newArr3(double, nx, ny, nz);
-  Jz = newArr3(double, nx, ny, nz);
-  pXX = newArr3(double, nx, ny, nz);
-  pXY = newArr3(double, nx, ny, nz);
-  pXZ = newArr3(double, nx, ny, nz);
-  pYY = newArr3(double, nx, ny, nz);
-  pYZ = newArr3(double, nx, ny, nz);
-  pZZ = newArr3(double, nx, ny, nz);
-}
-
-inline Moments::~Moments() {
-  // nodes and species
-  delArr3(rho, nx, ny);
-  delArr3(Jx, nx, ny);
-  delArr3(Jy, nx, ny);
-  delArr3(Jz, nx, ny);
-  delArr3(pXX, nx, ny);
-  delArr3(pXY, nx, ny);
-  delArr3(pXZ, nx, ny);
-  delArr3(pYY, nx, ny);
-  delArr3(pYZ, nx, ny);
-  delArr3(pZZ, nx, ny);
-}
-
-inline void Moments::set_to_zero() {
-  // #pragma omp parallel for collapse(1)
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++) {
-        rho[i][j][k] = 0.0;
-        Jx[i][j][k] = 0.0;
-        Jy[i][j][k] = 0.0;
-        Jz[i][j][k] = 0.0;
-        pXX[i][j][k] = 0.0;
-        pXY[i][j][k] = 0.0;
-        pXZ[i][j][k] = 0.0;
-        pYY[i][j][k] = 0.0;
-        pYZ[i][j][k] = 0.0;
-        pZZ[i][j][k] = 0.0;
-      }
-}
-
+class Moments;
 class EMfields3D                // :public Field
 {
   public:
@@ -410,6 +276,12 @@ class EMfields3D                // :public Field
     /*! get the magnetic field energy */
     double getBenergy();
 
+  /*! fetch the Moments accumulator reserved for thread i; valid i is 0..sizeMomentsArray-1 */
+  Moments& fetch_momentsArray(int i){
+    assert_le(0,i);
+    assert_le(i,sizeMomentsArray-1); // i==sizeMomentsArray would index past the end of momentsArray
+    return *momentsArray[i];
+  }
 
     /*! print electromagnetic fields info */
     void print(void) const;
@@ -531,6 +403,9 @@ class EMfields3D                // :public Field
     double ***vectY;
     double ***vectZ;
     double ***divC;
+    /* temporary arrays for summing moments */
+    int sizeMomentsArray;
+    Moments **momentsArray;
 
 
     // *******************************************************************************
@@ -658,6 +533,77 @@ class EMfields3D                // :public Field
 
 };
 
+inline void EMfields3D::addRho(double weight[][2][2], int X, int Y, int Z, int is) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++)
+        rhons[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
+}
+/*! add an amount of current density - direction X to current density field on the node */
+inline void EMfields3D::addJx(double weight[][2][2], int X, int Y, int Z, int is) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++)
+        Jxs[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
+}
+/*! add an amount of current density - direction Y to current density field on the node */
+inline void EMfields3D::addJy(double weight[][2][2], int X, int Y, int Z, int is) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++)
+        Jys[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
+}
+/*! add an amount of current density - direction Z to current density field on the node */
+inline void EMfields3D::addJz(double weight[][2][2], int X, int Y, int Z, int is) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++)
+        Jzs[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
+}
+/*! add an amount of pressure - component XX to pressure tensor field on the node */
+inline void EMfields3D::addPxx(double weight[][2][2], int X, int Y, int Z, int is) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++)
+        pXXsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
+}
+/*! add an amount of pressure - component XY to pressure tensor field on the node */
+inline void EMfields3D::addPxy(double weight[][2][2], int X, int Y, int Z, int is) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++)
+        pXYsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
+}
+/*! add an amount of pressure - component XZ to pressure tensor field on the node */
+inline void EMfields3D::addPxz(double weight[][2][2], int X, int Y, int Z, int is) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++)
+        pXZsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
+}
+/*! add an amount of pressure - component YY to pressure tensor field on the node */
+inline void EMfields3D::addPyy(double weight[][2][2], int X, int Y, int Z, int is) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++)
+        pYYsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
+}
+/*! add an amount of pressure - component YZ to pressure tensor field on the node */
+inline void EMfields3D::addPyz(double weight[][2][2], int X, int Y, int Z, int is) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++)
+        pYZsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
+}
+/*! add an amount of pressure - component ZZ to pressure tensor field on the node */
+inline void EMfields3D::addPzz(double weight[][2][2], int X, int Y, int Z, int is) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++)
+        pZZsn[is][X - i][Y - j][Z - k] += weight[i][j][k] * invVOL;
+}
+
+
 typedef EMfields3D Field;
 
 #endif
diff --git a/include/Moments.h b/include/Moments.h
new file mode 100644
index 00000000..d4349ecc
--- /dev/null
+++ b/include/Moments.h
@@ -0,0 +1,182 @@
+#ifndef Moments_H
+#define Moments_H
+
+// class to accumulate node-centered species moments
+// 
+class Moments {
+  private:
+    double invVOL;
+    double ***rho;
+
+    /** current density, defined on nodes */
+    double ***Jx;
+    double ***Jy;
+    double ***Jz;
+
+    /** pressure tensor components, defined on nodes */
+    double ***pXX;
+    double ***pXY;
+    double ***pXZ;
+    double ***pYY;
+    double ***pYZ;
+    double ***pZZ;
+    int nx;
+    int ny;
+    int nz;
+  public:
+    int get_nx() const {
+      return nx;
+    }
+    int get_ny() const {
+      return ny;
+    }
+    int get_nz() const {
+      return nz;
+    }
+    double get_invVOL() const {
+      return invVOL;
+    }
+    double get_rho(int i, int j, int k) const {
+      return rho[i][j][k];
+    }
+    double get_Jx(int i, int j, int k) const {
+      return Jx[i][j][k];
+    }
+    double get_Jy(int i, int j, int k) const {
+      return Jy[i][j][k];
+    }
+    double get_Jz(int i, int j, int k) const {
+      return Jz[i][j][k];
+    }
+    double get_pXX(int i, int j, int k) const {
+      return pXX[i][j][k];
+    }
+    double get_pXY(int i, int j, int k) const {
+      return pXY[i][j][k];
+    }
+    double get_pXZ(int i, int j, int k) const {
+      return pXZ[i][j][k];
+    }
+    double get_pYY(int i, int j, int k) const {
+      return pYY[i][j][k];
+    }
+    double get_pYZ(int i, int j, int k) const {
+      return pYZ[i][j][k];
+    }
+    double get_pZZ(int i, int j, int k) const {
+      return pZZ[i][j][k];
+    }
+  public:
+    Moments() {
+    };
+    Moments(int nx_, int ny_, int nz_, double invVOL_){
+      init(nx_,ny_,nz_,invVOL_);
+    }
+    void init(int nx_, int ny_, int nz_, double invVOL_);
+    ~Moments();
+    void set_to_zero();
+    void addRho(double weight[][2][2], int X, int Y, int Z);
+    void addJx(double weight[][2][2], int X, int Y, int Z);
+    void addJy(double weight[][2][2], int X, int Y, int Z);
+    void addJz(double weight[][2][2], int X, int Y, int Z);
+
+    void addPxx(double weight[][2][2], int X, int Y, int Z);
+    void addPxy(double weight[][2][2], int X, int Y, int Z);
+    void addPxz(double weight[][2][2], int X, int Y, int Z);
+    void addPyy(double weight[][2][2], int X, int Y, int Z);
+    void addPyz(double weight[][2][2], int X, int Y, int Z);
+    void addPzz(double weight[][2][2], int X, int Y, int Z);
+};
+
+/** add an amount of charge density to charge density field at node X,Y,Z */
+inline void Moments::addRho(double weight[][2][2], int X, int Y, int Z) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++) {
+        const double temp = weight[i][j][k] * invVOL;
+        rho[X - i][Y - j][Z - k] += temp;
+      }
+}
+/** add an amount of current density - direction X to current density field on the node */
+inline void Moments::addJx(double weight[][2][2], int X, int Y, int Z) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++){
+        const double temp = weight[i][j][k] * invVOL;
+        Jx[X - i][Y - j][Z - k] += temp;
+      }
+}
+/** add an amount of current density - direction Y to current density field on the node */
+inline void Moments::addJy(double weight[][2][2], int X, int Y, int Z) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++){
+        const double temp = weight[i][j][k] * invVOL;
+        Jy[X - i][Y - j][Z - k] += temp;
+      }
+}
+/** add an amount of current density - direction Z to current density field on the node */
+inline void Moments::addJz(double weight[][2][2], int X, int Y, int Z) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++){
+        const double temp = weight[i][j][k] * invVOL;
+        Jz[X - i][Y - j][Z - k] += temp;
+      }
+}
+/** add an amount of pressure - component XX to pressure tensor field on the node */
+inline void Moments::addPxx(double weight[][2][2], int X, int Y, int Z) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++){
+        const double temp = weight[i][j][k] * invVOL;
+        pXX[X - i][Y - j][Z - k] += temp;
+      }
+}
+/** add an amount of pressure - component XY to pressure tensor field on the node */
+inline void Moments::addPxy(double weight[][2][2], int X, int Y, int Z) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++){
+        const double temp = weight[i][j][k] * invVOL;
+        pXY[X - i][Y - j][Z - k] += temp;
+      }
+}
+/** add an amount of pressure - component XZ to pressure tensor field on the node */
+inline void Moments::addPxz(double weight[][2][2], int X, int Y, int Z) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++){
+        const double temp = weight[i][j][k] * invVOL;
+        pXZ[X - i][Y - j][Z - k] += temp;
+      }
+}
+/** add an amount of pressure - component YY to pressure tensor field on the node */
+inline void Moments::addPyy(double weight[][2][2], int X, int Y, int Z) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++){
+        const double temp = weight[i][j][k] * invVOL;
+        pYY[X - i][Y - j][Z - k] += temp;
+      }
+}
+/** add an amount of pressure - component YZ to pressure tensor field on the node */
+inline void Moments::addPyz(double weight[][2][2], int X, int Y, int Z) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++){
+        const double temp = weight[i][j][k] * invVOL;
+        pYZ[X - i][Y - j][Z - k] += temp;
+      }
+}
+/** add an amount of pressure - component ZZ to pressure tensor field on the node */
+inline void Moments::addPzz(double weight[][2][2], int X, int Y, int Z) {
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++)
+      for (int k = 0; k < 2; k++){
+        const double temp = weight[i][j][k] * invVOL;
+        pZZ[X - i][Y - j][Z - k] += temp;
+      }
+}
+
+#endif
diff --git a/include/ipicdefs.h b/include/ipicdefs.h
index dfa44969..ef86aaf1 100644
--- a/include/ipicdefs.h
+++ b/include/ipicdefs.h
@@ -1,6 +1,9 @@
 #ifndef __IPIC_DEFS_H__
 #define __IPIC_DEFS_H__
 
+// comment this out if OpenMP is not installed on your system.
+#define USING_OMP
+
 // uncomment the following line to use parallel hdf5
 //#define USING_PARALLEL_HDF5
 
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 1a799dd7..5d1be951 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -20,13 +20,16 @@ developers: Stefano Markidis, Giovanni Lapenta.
 #include "Grid.h"
 #include "Grid3DCU.h"
 #include "Field.h"
+#include "Moments.h"
 #include "MPIdata.h"
+#include "ompdefs.h"
 
 #include "Particles3Dcomm.h"
 
 #include "hdf5.h"
 #include <vector>
 #include <complex>
+#include "debug.h"
 
 using std::cout;
 using std::cerr;
@@ -302,11 +305,14 @@ void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vc
   const double nxn = grid->getNXN();
   const double nyn = grid->getNYN();
   const double nzn = grid->getNZN();
-  //#pragma omp parallel
+  #pragma omp parallel
   {
+    dprint(omp_get_thread_num());
+    Moments& speciesMoments = EMf->fetch_momentsArray(omp_get_thread_num());
     //Moments speciesMoments(nxn,nyn,nzn,invVOL);
-    //speciesMoments.set_to_zero();
-    //#pragma omp for
+    //Field& speciesMoments = *EMf;
+    speciesMoments.set_to_zero();
+    #pragma omp for
     assert_le(nop,INT_MAX); // else would need to use long long
     for (int i = 0; i < nop; i++)
     {
@@ -336,65 +342,65 @@ void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vc
       //weight[1][1][0] = q[i] * xi[1] * eta[1] * zeta[0] * invVOL;
       //weight[1][1][1] = q[i] * xi[1] * eta[1] * zeta[1] * invVOL;
       // add charge density
-      EMf->addRho(weight, ix, iy, iz, ns);
+      speciesMoments.addRho(weight, ix, iy, iz);
       // add current density - X
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = u[i] * weight[ii][jj][kk];
-      EMf->addJx(temp, ix, iy, iz, ns);
+      speciesMoments.addJx(temp, ix, iy, iz);
       // add current density - Y
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = v[i] * weight[ii][jj][kk];
-      EMf->addJy(temp, ix, iy, iz, ns);
+      speciesMoments.addJy(temp, ix, iy, iz);
       // add current density - Z
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = w[i] * weight[ii][jj][kk];
-      EMf->addJz(temp, ix, iy, iz, ns);
+      speciesMoments.addJz(temp, ix, iy, iz);
       // Pxx - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = u[i] * u[i] * weight[ii][jj][kk];
-      EMf->addPxx(temp, ix, iy, iz, ns);
+      speciesMoments.addPxx(temp, ix, iy, iz);
       // Pxy - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = u[i] * v[i] * weight[ii][jj][kk];
-      EMf->addPxy(temp, ix, iy, iz, ns);
+      speciesMoments.addPxy(temp, ix, iy, iz);
       // Pxz - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = u[i] * w[i] * weight[ii][jj][kk];
-      EMf->addPxz(temp, ix, iy, iz, ns);
+      speciesMoments.addPxz(temp, ix, iy, iz);
       // Pyy - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = v[i] * v[i] * weight[ii][jj][kk];
-      EMf->addPyy(temp, ix, iy, iz, ns);
+      speciesMoments.addPyy(temp, ix, iy, iz);
       // Pyz - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = v[i] * w[i] * weight[ii][jj][kk];
-      EMf->addPyz(temp, ix, iy, iz, ns);
+      speciesMoments.addPyz(temp, ix, iy, iz);
       // Pzz - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = w[i] * w[i] * weight[ii][jj][kk];
-      EMf->addPzz(temp, ix, iy, iz, ns);
+      speciesMoments.addPzz(temp, ix, iy, iz);
     }
     // change this to allow more parallelization after implementing array class
-    //#pragma omp critical
-    //EMf->addToSpeciesMoments(speciesMoments,ns);
+    #pragma omp critical
+    EMf->addToSpeciesMoments(speciesMoments,ns);
   }
   // communicate contribution from ghost cells 
   EMf->communicateGhostP2G(ns, 0, 0, 0, 0, vct);

From 4cbe8208a61a3797ac57266c670d40c535803a44 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 16 Jul 2013 20:07:04 +0200
Subject: [PATCH 015/118] issue #23 and #33: accumulating Moments using
 multiple instances for OpenMP

  Using a grid-sized instance of Moments for every OpenMP thread
  will consume too much memory if a very large number of OpenMP
  threads are used and so is something of a stop-gap.  But the
  current serial code would be unacceptably slow in that case,
  so this shouldn't really break anything.  EMfields should be
  redesigned to contain a set of Moments instances to be passed
  from the particle solver to the field solver.
---
 particles/Particles3Dcomm.cpp | 93 +++++++++++++++++++++++++++++++++--
 1 file changed, 89 insertions(+), 4 deletions(-)

diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 5d1be951..d339894d 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -305,15 +305,19 @@ void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vc
   const double nxn = grid->getNXN();
   const double nyn = grid->getNYN();
   const double nzn = grid->getNZN();
+  assert_le(nop,INT_MAX); // else would need to use long long
+  // to make memory use scale to a large number of threads we
+  // could first apply an efficient parallel sorting algorithm
+  // to the particles and then accumulate moments in smaller
+  // subarrays.
+  //#ifdef _OPENMP
   #pragma omp parallel
   {
-    dprint(omp_get_thread_num());
-    Moments& speciesMoments = EMf->fetch_momentsArray(omp_get_thread_num());
+    int thread_num = omp_get_thread_num();
+    Moments& speciesMoments = EMf->fetch_momentsArray(thread_num);
     //Moments speciesMoments(nxn,nyn,nzn,invVOL);
-    //Field& speciesMoments = *EMf;
     speciesMoments.set_to_zero();
     #pragma omp for
-    assert_le(nop,INT_MAX); // else would need to use long long
     for (int i = 0; i < nop; i++)
     {
       const int ix = 2 + int (floor((x[i] - xstart) * inv_dx));
@@ -402,6 +406,87 @@ void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vc
     #pragma omp critical
     EMf->addToSpeciesMoments(speciesMoments,ns);
   }
+  //#else
+  //{
+  //  assert_le(nop,INT_MAX); // else would need to use long long
+  //  for (int i = 0; i < nop; i++)
+  //  {
+  //    const int ix = 2 + int (floor((x[i] - xstart) * inv_dx));
+  //    const int iy = 2 + int (floor((y[i] - ystart) * inv_dy));
+  //    const int iz = 2 + int (floor((z[i] - zstart) * inv_dz));
+  //    double temp[2][2][2];
+  //    double xi[2], eta[2], zeta[2];
+  //    xi[0] = x[i] - grid->getXN(ix - 1, iy, iz);
+  //    eta[0] = y[i] - grid->getYN(ix, iy - 1, iz);
+  //    zeta[0] = z[i] - grid->getZN(ix, iy, iz - 1);
+  //    xi[1] = grid->getXN(ix, iy, iz) - x[i];
+  //    eta[1] = grid->getYN(ix, iy, iz) - y[i];
+  //    zeta[1] = grid->getZN(ix, iy, iz) - z[i];
+  //    double weight[2][2][2];
+  //    for (int ii = 0; ii < 2; ii++)
+  //      for (int jj = 0; jj < 2; jj++)
+  //        for (int kk = 0; kk < 2; kk++) {
+  //          weight[ii][jj][kk] = q[i] * xi[ii] * eta[jj] * zeta[kk] * invVOL;
+  //        }
+  //    // add charge density
+  //    EMf->addRho(weight, ix, iy, iz, ns);
+  //    // add current density - X
+  //    for (int ii = 0; ii < 2; ii++)
+  //      for (int jj = 0; jj < 2; jj++)
+  //        for (int kk = 0; kk < 2; kk++)
+  //          temp[ii][jj][kk] = u[i] * weight[ii][jj][kk];
+  //    EMf->addJx(temp, ix, iy, iz, ns);
+  //    // add current density - Y
+  //    for (int ii = 0; ii < 2; ii++)
+  //      for (int jj = 0; jj < 2; jj++)
+  //        for (int kk = 0; kk < 2; kk++)
+  //          temp[ii][jj][kk] = v[i] * weight[ii][jj][kk];
+  //    EMf->addJy(temp, ix, iy, iz, ns);
+  //    // add current density - Z
+  //    for (int ii = 0; ii < 2; ii++)
+  //      for (int jj = 0; jj < 2; jj++)
+  //        for (int kk = 0; kk < 2; kk++)
+  //          temp[ii][jj][kk] = w[i] * weight[ii][jj][kk];
+  //    EMf->addJz(temp, ix, iy, iz, ns);
+  //    // Pxx - add pressure tensor
+  //    for (int ii = 0; ii < 2; ii++)
+  //      for (int jj = 0; jj < 2; jj++)
+  //        for (int kk = 0; kk < 2; kk++)
+  //          temp[ii][jj][kk] = u[i] * u[i] * weight[ii][jj][kk];
+  //    EMf->addPxx(temp, ix, iy, iz, ns);
+  //    // Pxy - add pressure tensor
+  //    for (int ii = 0; ii < 2; ii++)
+  //      for (int jj = 0; jj < 2; jj++)
+  //        for (int kk = 0; kk < 2; kk++)
+  //          temp[ii][jj][kk] = u[i] * v[i] * weight[ii][jj][kk];
+  //    EMf->addPxy(temp, ix, iy, iz, ns);
+  //    // Pxz - add pressure tensor
+  //    for (int ii = 0; ii < 2; ii++)
+  //      for (int jj = 0; jj < 2; jj++)
+  //        for (int kk = 0; kk < 2; kk++)
+  //          temp[ii][jj][kk] = u[i] * w[i] * weight[ii][jj][kk];
+  //    EMf->addPxz(temp, ix, iy, iz, ns);
+  //    // Pyy - add pressure tensor
+  //    for (int ii = 0; ii < 2; ii++)
+  //      for (int jj = 0; jj < 2; jj++)
+  //        for (int kk = 0; kk < 2; kk++)
+  //          temp[ii][jj][kk] = v[i] * v[i] * weight[ii][jj][kk];
+  //    EMf->addPyy(temp, ix, iy, iz, ns);
+  //    // Pyz - add pressure tensor
+  //    for (int ii = 0; ii < 2; ii++)
+  //      for (int jj = 0; jj < 2; jj++)
+  //        for (int kk = 0; kk < 2; kk++)
+  //          temp[ii][jj][kk] = v[i] * w[i] * weight[ii][jj][kk];
+  //    EMf->addPyz(temp, ix, iy, iz, ns);
+  //    // Pzz - add pressure tensor
+  //    for (int ii = 0; ii < 2; ii++)
+  //      for (int jj = 0; jj < 2; jj++)
+  //        for (int kk = 0; kk < 2; kk++)
+  //          temp[ii][jj][kk] = w[i] * w[i] * weight[ii][jj][kk];
+  //    EMf->addPzz(temp, ix, iy, iz, ns);
+  //  }
+  //}
+  //#endif
   // communicate contribution from ghost cells 
   EMf->communicateGhostP2G(ns, 0, 0, 0, 0, vct);
 }

From 499c8bfddb71c3e8e18e2971281907edd1727bd7 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 16 Jul 2013 19:48:13 +0200
Subject: [PATCH 016/118] issue #33: eliminating iostream header from asserts.h

---
 include/asserts.h   | 1 -
 utility/asserts.cpp | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/asserts.h b/include/asserts.h
index 962095e7..a3cd4584 100644
--- a/include/asserts.h
+++ b/include/asserts.h
@@ -1,7 +1,6 @@
 #ifndef __ASSERTS_H__
 #define __ASSERTS_H__
 
-#include <iostream>
 #include <cstdlib>
 #include <cstdio>
 
diff --git a/utility/asserts.cpp b/utility/asserts.cpp
index c5beed50..576944f4 100644
--- a/utility/asserts.cpp
+++ b/utility/asserts.cpp
@@ -1,4 +1,5 @@
 
+#include <iostream>
 #include "asserts.h"
 
 void assert_error(const char *file, int line, const char *func, const char *op, const char *lhs_str, const char *rhs_str, double lhs, double rhs) {

From cdc05b344bbc02865ddee8e7a42aff6e6cbba709 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 22 Jul 2013 11:33:02 +0200
Subject: [PATCH 017/118] issue #39: removed asgArr3 and asgArr4

---
 include/Alloc.h           | 73 ---------------------------------------
 particles/Particles3D.cpp | 24 ++++++-------
 2 files changed, 12 insertions(+), 85 deletions(-)

diff --git a/include/Alloc.h b/include/Alloc.h
index 3a0c7173..a57b9b70 100644
--- a/include/Alloc.h
+++ b/include/Alloc.h
@@ -32,34 +32,6 @@ template < class type > type **** _new_4_array(int sz1, int sz2, int sz3, int sz
   return result;
 }
 
-/*! The assigment for 4D array */
-template < class type > type **** _assign_4_array(int sz1, int sz2, int sz3, int sz4, type **** org) {
-
-  type ****all_x;
-  type ***all_y;
-  type **all_z;
-  type *all_r;
-
-  all_x = org;
-  all_y = org[0];
-  all_z = org[0][0];
-  all_r = org[0][0][0];
-
-  type ****result = all_x;
-
-  for (int i = 0; i < sz1; i++, all_y += sz2) {
-    result[i] = all_y;
-    for (int j = 0; j < sz2; j++, all_z += sz3) {
-      result[i][j] = all_z;
-      for (int k = 0; k < sz3; k++, all_r += sz4) {
-        result[i][j][k] = all_r;
-      }
-    }
-  }
-
-  return result;
-}
-
 /*! Deallocator for 4D arrays */
 template < class type > void delArr4(type **** arr, int dummyx, int dummyy, int dummyz) {
   delete[]arr[0][0][0];
@@ -92,29 +64,6 @@ template < class type > type *** _new_3_array(int sz1, int sz2, int sz3) {
 
 }
 
-/*! The assignment for 3D array */
-template < class type > type *** _assign_3_array(int sz1, int sz2, int sz3, type *** org) {
-
-  type ***all_x;
-  type **all_y;
-  type *all_z;
-
-  all_x = org;
-  all_y = org[0];
-  all_z = org[0][0];
-
-  type ***result = org;
-
-  for (int i = 0; i < sz1; i++, all_y += sz2) {
-    result[i] = all_y;
-    for (int j = 0; j < sz2; j++, all_z += sz3) {
-      result[i][j] = all_z;
-    }
-  }
-
-  return result;
-}
-
 /*! Deallocator for 3D arrays */
 template < class type > void delArr3(type *** arr, int dummyx, int dummyy) {
   delete[]arr[0][0];
@@ -141,24 +90,6 @@ template < class type > type ** _new_2_array(int sz1, int sz2) {
 
 }
 
-/*! The assignment for 2D array */
-template < class type > type ** _assign_2_array(int sz1, int sz2, type ** org) {
-
-  type **all_x;
-  type *all_y;
-
-  all_x = org;
-  all_y = org[0];
-
-  type **result = org;
-
-  for (int i = 0; i < sz1; i++, all_y += sz2) {
-    result[i] = all_y;
-  }
-
-  return result;
-}
-
 /*! Deallocator for 2D arrays */
 template < class type > void delArr2(type ** arr, int dummyx) {
   delete[]arr[0];
@@ -169,8 +100,4 @@ template < class type > void delArr2(type ** arr, int dummyx) {
 #define newArr3(type,sz1,sz2,sz3) _new_3_array<type>((sz1),(sz2),(sz3))
 #define newArr2(type,sz1,sz2) _new_2_array<type>((sz1),(sz2))
 
-#define asgArr2(type,sz1,sz2,org) _assign_2_array<type>((sz1),(sz2),(org))
-#define asgArr3(type,sz1,sz2,sz3,org) _assign_3_array<type>((sz1),(sz2),(sz3),(org))
-#define asgArr4(type,sz1,sz2,sz3,sz4,org) _assign_4_array<type>((sz1),(sz2),(sz3),(sz4),(org))
-
 #endif
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 397a1c62..9b475115 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -316,18 +316,18 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
     cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
   }
   double start_mover_PC = MPI_Wtime();
-  double ***Ex = asgArr3(double, grid->getNXN(), grid->getNYN(), grid->getNZN(), EMf->getEx());
-  double ***Ey = asgArr3(double, grid->getNXN(), grid->getNYN(), grid->getNZN(), EMf->getEy());
-  double ***Ez = asgArr3(double, grid->getNXN(), grid->getNYN(), grid->getNZN(), EMf->getEz());
-  double ***Bx = asgArr3(double, grid->getNXN(), grid->getNYN(), grid->getNZN(), EMf->getBx());
-  double ***By = asgArr3(double, grid->getNXN(), grid->getNYN(), grid->getNZN(), EMf->getBy());
-  double ***Bz = asgArr3(double, grid->getNXN(), grid->getNYN(), grid->getNZN(), EMf->getBz());
-
-  double ***Bx_ext = asgArr3(double, grid->getNXN(), grid->getNYN(), grid->getNZN(), EMf->getBx_ext());
-  double ***By_ext = asgArr3(double, grid->getNXN(), grid->getNYN(), grid->getNZN(), EMf->getBy_ext());
-  double ***Bz_ext = asgArr3(double, grid->getNXN(), grid->getNYN(), grid->getNZN(), EMf->getBz_ext());
-
-  double ****node_coordinate = asgArr4(double, grid->getNXN(), grid->getNYN(), grid->getNZN(), 3, grid->getN());
+  double ***Ex = EMf->getEx();
+  double ***Ey = EMf->getEy();
+  double ***Ez = EMf->getEz();
+  double ***Bx = EMf->getBx();
+  double ***By = EMf->getBy();
+  double ***Bz = EMf->getBz();
+
+  double ***Bx_ext = EMf->getBx_ext();
+  double ***By_ext = EMf->getBy_ext();
+  double ***Bz_ext = EMf->getBz_ext();
+
+  double ****node_coordinate = grid->getN();
 
   const double dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
   const double inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;

From e8982ee1c62b10eb559d8bd8ac30576cc6f71752 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 23 Jul 2013 07:10:13 +0200
Subject: [PATCH 018/118] issue#31: use stdout rather than stderr in debug.h

---
 include/debug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/debug.h b/include/debug.h
index 6532f599..402d3819 100644
--- a/include/debug.h
+++ b/include/debug.h
@@ -11,7 +11,7 @@
 
 void dfprintf_fileLine(FILE * fptr, const char *func, const char *file, int line_number, const char *format, ...);
 
-#define dprintf(args...) dfprintf_fileLine(stderr, __func__, __FILE__, __LINE__,## args)
+#define dprintf(args...) dfprintf_fileLine(stdout, __func__, __FILE__, __LINE__,## args)
 #define dprint(var) dprintvar_fileLine(__func__,__FILE__,__LINE__,#var,var);
 #define dprint0(var) dprint(var)
 #define declare_dprintvar_fileLine(type) \

From ed7accfb30a9ce4447490d2ac996d028b3836afa Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 23 Jul 2013 07:59:41 +0200
Subject: [PATCH 019/118] issue #40: replacing node_coordinate and
 center_coordinate arrays

---
 grids/Grid3DCU.cpp        | 46 ++++++++++++++++++---------------------
 include/Grid3DCU.h        | 40 ++++++++++++++++++++++++++--------
 particles/Particles3D.cpp | 14 +++++-------
 3 files changed, 58 insertions(+), 42 deletions(-)

diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index 973b20f8..b222e8d5 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -51,33 +51,29 @@ Grid3DCU::Grid3DCU(CollectiveIO * col, VirtualTopology3D * vct) {
   zEnd = zStart + (col->getLz() / (double) vct->getZLEN());
 
   // arrays allocation: nodes ---> the first node has index 1, the last has index nxn-2!
-  node_coordinate = newArr4(double, nxn, nyn, nzn, 3);  // 0 -> X, 1 -> Y, 2-> Z
-  for (int i = 0; i < nxn; i++) {
-    for (int j = 0; j < nyn; j++) {
-      for (int k = 0; k < nzn; k++) {
-        node_coordinate[i][j][k][0] = xStart + (i - 1) * dx;
-        node_coordinate[i][j][k][1] = yStart + (j - 1) * dy;
-        node_coordinate[i][j][k][2] = zStart + (k - 1) * dz;
-      }
-    }
-  }
+  node_xcoord = new double[nxn];
+  node_ycoord = new double[nyn];
+  node_zcoord = new double[nzn];
+  for (int i=0; i<nxn; i++) node_xcoord[i] = xStart + (i - 1) * dx;
+  for (int j=0; j<nyn; j++) node_ycoord[j] = yStart + (j - 1) * dy;
+  for (int k=0; k<nzn; k++) node_zcoord[k] = zStart + (k - 1) * dz;
   // arrays allocation: cells ---> the first cell has index 1, the last has index ncn-2!
-  center_coordinate = newArr4(double, nxc, nyc, nzc, 3);
-  for (int i = 0; i < nxc; i++) {
-    for (int j = 0; j < nyc; j++) {
-      for (int k = 0; k < nzc; k++) {
-        center_coordinate[i][j][k][0] = .5 * (node_coordinate[i][j][k][0] + node_coordinate[i + 1][j][k][0]);
-        center_coordinate[i][j][k][1] = .5 * (node_coordinate[i][j][k][1] + node_coordinate[i][j + 1][k][1]);
-        center_coordinate[i][j][k][2] = .5 * (node_coordinate[i][j][k][2] + node_coordinate[i][j][k + 1][2]);
-      }
-    }
-  }
+  center_xcoord = new double[nxc];
+  center_ycoord = new double[nyc];
+  center_zcoord = new double[nzc];
+  for(int i=0; i<nxc; i++) center_xcoord[i] = .5*(node_xcoord[i]+node_xcoord[i+1]);
+  for(int j=0; j<nyc; j++) center_ycoord[j] = .5*(node_ycoord[j]+node_ycoord[j+1]);
+  for(int k=0; k<nzc; k++) center_zcoord[k] = .5*(node_zcoord[k]+node_zcoord[k+1]);
 }
 
 /** deallocate the local grid */
 Grid3DCU::~Grid3DCU() {
-  delArr4(node_coordinate, nxn, nyn, nzn);
-  delArr4(center_coordinate, nxc, nyc, nzc);
+  delete [] node_xcoord;
+  delete [] node_ycoord;
+  delete [] node_zcoord;
+  delete [] center_xcoord;
+  delete [] center_ycoord;
+  delete [] center_zcoord;
 }
 
 /** print the local grid info */
@@ -85,9 +81,9 @@ void Grid3DCU::print(VirtualTopology3D * ptVCT) {
   cout << endl;
   cout << "Subgrid (" << ptVCT->getCoordinates(0) << "," << ptVCT->getCoordinates(1) << "," << ptVCT->getCoordinates(2) << ")" << endl;
   cout << "Number of cell: -X=" << nxc - 2 << " -Y=" << nyc - 2 << " -Z=" << nzc - 2 << endl;
-  cout << "Xin = " << node_coordinate[1][1][1][0] << "; Xfin = " << node_coordinate[nxn - 2][1][1][0] << endl;
-  cout << "Yin = " << node_coordinate[1][1][1][1] << "; Yfin = " << node_coordinate[1][nyn - 2][1][1] << endl;
-  cout << "Zin = " << node_coordinate[1][1][1][2] << "; Zfin = " << node_coordinate[1][1][nzn - 2][2] << endl;
+  cout << "Xin = " << node_xcoord[1] << "; Xfin = " << node_xcoord[nxn - 2] << endl;
+  cout << "Yin = " << node_ycoord[1] << "; Yfin = " << node_ycoord[nyn - 2] << endl;
+  cout << "Zin = " << node_zcoord[1] << "; Zfin = " << node_zcoord[nzn - 2] << endl;
   cout << endl;
 }
 
diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index f085ea34..9d2e2f01 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -110,9 +110,13 @@ class Grid3DCU                  // :public Grid
   /** invol = inverse of volume*/
   double invVOL;
   /** node coordinate */
-  double ****node_coordinate;
+  double *node_xcoord;
+  double *node_ycoord;
+  double *node_zcoord;
   /** center coordinate */
-  double ****center_coordinate;
+  double *center_xcoord;
+  double *center_ycoord;
+  double *center_zcoord;
   /** local grid boundaries coordinate  */
   double xStart, xEnd, yStart, yEnd, zStart, zEnd;
 
@@ -126,13 +130,31 @@ class Grid3DCU                  // :public Grid
   double getDX() { return (dx); }
   double getDY() { return (dy); }
   double getDZ() { return (dz); }
-  double &getXN(int X, int Y, int Z) { return (node_coordinate[X][Y][Z][0]); }
-  double &getYN(int X, int Y, int Z) { return (node_coordinate[X][Y][Z][1]); }
-  double &getZN(int X, int Y, int Z) { return (node_coordinate[X][Y][Z][2]); }
-  double &getXC(int X, int Y, int Z) { return (center_coordinate[X][Y][Z][0]); }
-  double &getYC(int X, int Y, int Z) { return (center_coordinate[X][Y][Z][1]); }
-  double &getZC(int X, int Y, int Z) { return (center_coordinate[X][Y][Z][2]); }
-  double ****getN() { return node_coordinate; }
+  //
+  // coordinate accessors
+  //
+  // calculated equivalents (preferred for accelerator?):
+  //const double &calcXN(int X) { return xStart+(X-1)*dx;}
+  //const double &calcYN(int Y) { return yStart+(Y-1)*dy;}
+  //const double &calcZN(int Z) { return zStart+(Z-1)*dz;}
+  const double &getXN(int X) { return node_xcoord[X];}
+  const double &getYN(int Y) { return node_ycoord[Y];}
+  const double &getZN(int Z) { return node_zcoord[Z];}
+  const double &getXC(int X) { return center_xcoord[X];}
+  const double &getYC(int Y) { return center_ycoord[Y];}
+  const double &getZC(int Z) { return center_zcoord[Z];}
+  //
+  // The following could be eliminated in favor of the previous
+  // unless we truly anticipate generalizing to a deformed
+  // logically cartesian mesh.  See issue #40.
+  //
+  const double &getXN(int X, int Y, int Z) { return node_xcoord[X];}
+  const double &getYN(int X, int Y, int Z) { return node_ycoord[Y];}
+  const double &getZN(int X, int Y, int Z) { return node_zcoord[Z];}
+  const double &getXC(int X, int Y, int Z) { return center_xcoord[X];}
+  const double &getYC(int X, int Y, int Z) { return center_ycoord[Y];}
+  const double &getZC(int X, int Y, int Z) { return center_zcoord[Z];}
+  //
   double getXstart() { return (xStart); }
   double getXend() { return (xEnd); }
   double getYstart() { return (yStart); }
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 9b475115..2be55cfb 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -327,8 +327,6 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
   double ***By_ext = EMf->getBy_ext();
   double ***Bz_ext = EMf->getBz_ext();
 
-  double ****node_coordinate = grid->getN();
-
   const double dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
   const double inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
   assert_le(nop,INT_MAX); // else would need to use long long
@@ -376,12 +374,12 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       double xi[2];
       double eta[2];
       double zeta[2];
-      xi[0] = xp - node_coordinate[ix - 1][iy][iz][0];
-      eta[0] = yp - node_coordinate[ix][iy - 1][iz][1];
-      zeta[0] = zp - node_coordinate[ix][iy][iz - 1][2];
-      xi[1] = node_coordinate[ix][iy][iz][0] - xp;
-      eta[1] = node_coordinate[ix][iy][iz][1] - yp;
-      zeta[1] = node_coordinate[ix][iy][iz][2] - zp;
+      xi[0]   = xp - grid->getXN(ix-1);
+      eta[0]  = yp - grid->getYN(iy-1);
+      zeta[0] = zp - grid->getZN(iz-1);
+      xi[1]   = grid->getXN(ix) - xp;
+      eta[1]  = grid->getYN(iy) - yp;
+      zeta[1] = grid->getZN(iz) - zp;
 
       double Exl = 0.0;
       double Eyl = 0.0;

From 1c2ff247c80370659bc97c0e9b56544552477fac Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 23 Jul 2013 10:11:56 +0200
Subject: [PATCH 020/118] issue#41: interpP2G reimplemented as sumMoments in
 EMfields class

---
 fields/EMfields3D.cpp         | 279 ++++++++++++++++++++++++++++++++--
 fields/Moments.cpp            |   4 +-
 include/EMfields3D.h          |   2 +
 include/Moments.h             | 104 +++++--------
 include/Particles3D.h         |   4 +-
 include/Particles3Dcomm.h     |  10 +-
 main/iPic3Dlib.cpp            |   4 +-
 particles/Particles3D.cpp     | 142 ++++++++---------
 particles/Particles3Dcomm.cpp |  29 ++--
 9 files changed, 407 insertions(+), 171 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index c6c5936d..7be94a18 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -1,6 +1,7 @@
 
 #include <mpi.h>
 #include "EMfields3D.h"
+#include "Particles3Dcomm.h"
 #include "TimeTasks.h"
 #include "Moments.h"
 #include "ompdefs.h"
@@ -169,8 +170,262 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) {
   for(int i=0;i<sizeMomentsArray;i++)
   {
     momentsArray[i] = new Moments;
-    momentsArray[i]->init(nxn,nyn,nzn,invVOL);
-  }
+    momentsArray[i]->init(nxn,nyn,nzn);
+  }
+}
+
+// This was Particles3Dcomm::interpP2G()
+void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct)
+{
+  const double inv_dx = 1.0 / dx;
+  const double inv_dy = 1.0 / dy;
+  const double inv_dz = 1.0 / dz;
+  const int nxn = grid->getNXN();
+  const int nyn = grid->getNYN();
+  const int nzn = grid->getNZN();
+  const double xstart = grid->getXstart();
+  const double ystart = grid->getYstart();
+  const double zstart = grid->getZstart();
+  double const*const x = pcls.getXall();
+  double const*const y = pcls.getYall();
+  double const*const z = pcls.getZall();
+  double const*const u = pcls.getUall();
+  double const*const v = pcls.getVall();
+  double const*const w = pcls.getWall();
+  double const*const q = pcls.getQall();
+  //
+  const int is = pcls.get_ns();
+  double* rhons1d = rhons[is][0][0];
+  double* Jxs1d   = Jxs  [is][0][0];
+  double* Jys1d   = Jys  [is][0][0];
+  double* Jzs1d   = Jzs  [is][0][0];
+  double* pXXsn1d = pXXsn[is][0][0];
+  double* pXYsn1d = pXYsn[is][0][0];
+  double* pXZsn1d = pXZsn[is][0][0];
+  double* pYYsn1d = pYYsn[is][0][0];
+  double* pYZsn1d = pYZsn[is][0][0];
+  double* pZZsn1d = pZZsn[is][0][0];
+  //
+  const long long nop_ll = pcls.getNOP();
+  const int nop = pcls.getNOP();
+  assert_le(nop_ll,INT_MAX); // else would need to use long long
+  // To make memory use scale to a large number of threads, we
+  // could first apply an efficient parallel sorting algorithm
+  // to the particles and then accumulate moments in smaller
+  // subarrays.
+  //#ifdef _OPENMP
+  #pragma omp parallel
+  {
+    int thread_num = omp_get_thread_num();
+    Moments& speciesMoments = fetch_momentsArray(thread_num);
+    speciesMoments.set_to_zero();
+    //
+    double*** rho = speciesMoments.fetch_rho();
+    double*** Jx  = speciesMoments.fetch_Jx();
+    double*** Jy  = speciesMoments.fetch_Jy();
+    double*** Jz  = speciesMoments.fetch_Jz();
+    double*** Pxx = speciesMoments.fetch_Pxx();
+    double*** Pxy = speciesMoments.fetch_Pxy();
+    double*** Pxz = speciesMoments.fetch_Pxz();
+    double*** Pyy = speciesMoments.fetch_Pyy();
+    double*** Pyz = speciesMoments.fetch_Pyz();
+    double*** Pzz = speciesMoments.fetch_Pzz();
+    // The following loop is expensive, so it is wise to assume that the
+    // compiler is stupid.  Therefore we should on the one hand
+    // expand things out and on the other hand avoid repeating computations.
+    #pragma omp for
+    for (int i = 0; i < nop; i++)
+    {
+      // compute the quadratic moments of velocity
+      //
+      const double ui=u[i];
+      const double vi=v[i];
+      const double wi=w[i];
+      const double uui=ui*ui;
+      const double uvi=ui*vi;
+      const double uwi=ui*wi;
+      const double vvi=vi*vi;
+      const double vwi=vi*wi;
+      const double wwi=wi*wi;
+      //
+      // compute the weights to distribute the moments
+      //
+      const int ix = 2 + int (floor((x[i] - xstart) * inv_dx));
+      const int iy = 2 + int (floor((y[i] - ystart) * inv_dy));
+      const int iz = 2 + int (floor((z[i] - zstart) * inv_dz));
+      //const double xi0   = x[i] - grid->getXN(ix - 1, iy, iz);
+      //const double eta0  = y[i] - grid->getYN(ix, iy - 1, iz);
+      //const double zeta0 = z[i] - grid->getZN(ix, iy, iz - 1);
+      //const double xi1   = grid->getXN(ix, iy, iz) - x[i];
+      //const double eta1  = grid->getYN(ix, iy, iz) - y[i];
+      //const double zeta1 = grid->getZN(ix, iy, iz) - z[i];
+      const double xi0   = x[i] - grid->getXN(ix-1);
+      const double eta0  = y[i] - grid->getYN(iy-1);
+      const double zeta0 = z[i] - grid->getZN(iz-1);
+      const double xi1   = grid->getXN(ix) - x[i];
+      const double eta1  = grid->getYN(iy) - y[i];
+      const double zeta1 = grid->getZN(iz) - z[i];
+      const double qi = q[i];
+      const double weight000 = qi * xi0 * eta0 * zeta0 * invVOL;
+      const double weight001 = qi * xi0 * eta0 * zeta1 * invVOL;
+      const double weight010 = qi * xi0 * eta1 * zeta0 * invVOL;
+      const double weight011 = qi * xi0 * eta1 * zeta1 * invVOL;
+      const double weight100 = qi * xi1 * eta0 * zeta0 * invVOL;
+      const double weight101 = qi * xi1 * eta0 * zeta1 * invVOL;
+      const double weight110 = qi * xi1 * eta1 * zeta0 * invVOL;
+      const double weight111 = qi * xi1 * eta1 * zeta1 * invVOL;
+      //
+      // use the weight to distribute the moments
+      //
+      // add charge density
+      //speciesMoments.addRho(weight, ix, iy, iz);
+      rho[ix  ][iy  ][iz  ] += weight000;
+      rho[ix  ][iy  ][iz-1] += weight001;
+      rho[ix  ][iy-1][iz  ] += weight010;
+      rho[ix  ][iy-1][iz-1] += weight011;
+      rho[ix-1][iy  ][iz  ] += weight100;
+      rho[ix-1][iy  ][iz-1] += weight101;
+      rho[ix-1][iy-1][iz  ] += weight110;
+      rho[ix-1][iy-1][iz-1] += weight111;
+      // add current density - X
+      //speciesMoments.addJx(temp, ix, iy, iz);
+      Jx[ix  ][iy  ][iz  ] += ui*weight000;
+      Jx[ix  ][iy  ][iz-1] += ui*weight001;
+      Jx[ix  ][iy-1][iz  ] += ui*weight010;
+      Jx[ix  ][iy-1][iz-1] += ui*weight011;
+      Jx[ix-1][iy  ][iz  ] += ui*weight100;
+      Jx[ix-1][iy  ][iz-1] += ui*weight101;
+      Jx[ix-1][iy-1][iz  ] += ui*weight110;
+      Jx[ix-1][iy-1][iz-1] += ui*weight111;
+      // add current density - Y
+      //speciesMoments.addJy(temp, ix, iy, iz);
+      Jy[ix  ][iy  ][iz  ] += vi*weight000;
+      Jy[ix  ][iy  ][iz-1] += vi*weight001;
+      Jy[ix  ][iy-1][iz  ] += vi*weight010;
+      Jy[ix  ][iy-1][iz-1] += vi*weight011;
+      Jy[ix-1][iy  ][iz  ] += vi*weight100;
+      Jy[ix-1][iy  ][iz-1] += vi*weight101;
+      Jy[ix-1][iy-1][iz  ] += vi*weight110;
+      Jy[ix-1][iy-1][iz-1] += vi*weight111;
+      // add current density - Z
+      //speciesMoments.addJz(temp, ix, iy, iz);
+      Jz[ix  ][iy  ][iz  ] += wi*weight000;
+      Jz[ix  ][iy  ][iz-1] += wi*weight001;
+      Jz[ix  ][iy-1][iz  ] += wi*weight010;
+      Jz[ix  ][iy-1][iz-1] += wi*weight011;
+      Jz[ix-1][iy  ][iz  ] += wi*weight100;
+      Jz[ix-1][iy  ][iz-1] += wi*weight101;
+      Jz[ix-1][iy-1][iz  ] += wi*weight110;
+      Jz[ix-1][iy-1][iz-1] += wi*weight111;
+      // Pxx - add pressure tensor
+      //speciesMoments.addPxx(temp, ix, iy, iz);
+      Pxx[ix  ][iy  ][iz  ] += uui*weight000;
+      Pxx[ix  ][iy  ][iz-1] += uui*weight001;
+      Pxx[ix  ][iy-1][iz  ] += uui*weight010;
+      Pxx[ix  ][iy-1][iz-1] += uui*weight011;
+      Pxx[ix-1][iy  ][iz  ] += uui*weight100;
+      Pxx[ix-1][iy  ][iz-1] += uui*weight101;
+      Pxx[ix-1][iy-1][iz  ] += uui*weight110;
+      Pxx[ix-1][iy-1][iz-1] += uui*weight111;
+      // Pxy - add pressure tensor
+      //speciesMoments.addPxy(temp, ix, iy, iz);
+      Pxy[ix  ][iy  ][iz  ] += uvi*weight000;
+      Pxy[ix  ][iy  ][iz-1] += uvi*weight001;
+      Pxy[ix  ][iy-1][iz  ] += uvi*weight010;
+      Pxy[ix  ][iy-1][iz-1] += uvi*weight011;
+      Pxy[ix-1][iy  ][iz  ] += uvi*weight100;
+      Pxy[ix-1][iy  ][iz-1] += uvi*weight101;
+      Pxy[ix-1][iy-1][iz  ] += uvi*weight110;
+      Pxy[ix-1][iy-1][iz-1] += uvi*weight111;
+      // Pxz - add pressure tensor
+      //speciesMoments.addPxz(temp, ix, iy, iz);
+      Pxz[ix  ][iy  ][iz  ] += uwi*weight000;
+      Pxz[ix  ][iy  ][iz-1] += uwi*weight001;
+      Pxz[ix  ][iy-1][iz  ] += uwi*weight010;
+      Pxz[ix  ][iy-1][iz-1] += uwi*weight011;
+      Pxz[ix-1][iy  ][iz  ] += uwi*weight100;
+      Pxz[ix-1][iy  ][iz-1] += uwi*weight101;
+      Pxz[ix-1][iy-1][iz  ] += uwi*weight110;
+      Pxz[ix-1][iy-1][iz-1] += uwi*weight111;
+      // Pyy - add pressure tensor
+      //speciesMoments.addPyy(temp, ix, iy, iz);
+      Pyy[ix  ][iy  ][iz  ] += vvi*weight000;
+      Pyy[ix  ][iy  ][iz-1] += vvi*weight001;
+      Pyy[ix  ][iy-1][iz  ] += vvi*weight010;
+      Pyy[ix  ][iy-1][iz-1] += vvi*weight011;
+      Pyy[ix-1][iy  ][iz  ] += vvi*weight100;
+      Pyy[ix-1][iy  ][iz-1] += vvi*weight101;
+      Pyy[ix-1][iy-1][iz  ] += vvi*weight110;
+      Pyy[ix-1][iy-1][iz-1] += vvi*weight111;
+      // Pyz - add pressure tensor
+      //speciesMoments.addPyz(temp, ix, iy, iz);
+      Pyz[ix  ][iy  ][iz  ] += vwi*weight000;
+      Pyz[ix  ][iy  ][iz-1] += vwi*weight001;
+      Pyz[ix  ][iy-1][iz  ] += vwi*weight010;
+      Pyz[ix  ][iy-1][iz-1] += vwi*weight011;
+      Pyz[ix-1][iy  ][iz  ] += vwi*weight100;
+      Pyz[ix-1][iy  ][iz-1] += vwi*weight101;
+      Pyz[ix-1][iy-1][iz  ] += vwi*weight110;
+      Pyz[ix-1][iy-1][iz-1] += vwi*weight111;
+      // Pzz - add pressure tensor
+      //speciesMoments.addPzz(temp, ix, iy, iz);
+      Pzz[ix  ][iy  ][iz  ] += wwi*weight000;
+      Pzz[ix  ][iy  ][iz-1] += wwi*weight001;
+      Pzz[ix  ][iy-1][iz  ] += wwi*weight010;
+      Pzz[ix  ][iy-1][iz-1] += wwi*weight011;
+      Pzz[ix-1][iy  ][iz  ] += wwi*weight100;
+      Pzz[ix-1][iy  ][iz-1] += wwi*weight101;
+      Pzz[ix-1][iy-1][iz  ] += wwi*weight110;
+      Pzz[ix-1][iy-1][iz-1] += wwi*weight111;
+    }
+    // The following way is an equivalent reduction but less
+    // efficient for a large number of threads.
+    //
+    //#pragma omp critical
+    //addToSpeciesMoments(speciesMoments,is);
+    //
+    // Instead we split up the reduction tasks.
+    //
+    // One-dimensional array access is presumably
+    // more efficient on poor compilers.
+    //
+    const double*const rho1d = rho[0][0];
+    const double*const Jx1d  = Jx [0][0];
+    const double*const Jy1d  = Jy [0][0];
+    const double*const Jz1d  = Jz [0][0];
+    const double*const Pxx1d = Pxx[0][0];
+    const double*const Pxy1d = Pxy[0][0];
+    const double*const Pxz1d = Pxz[0][0];
+    const double*const Pyy1d = Pyy[0][0];
+    const double*const Pyz1d = Pyz[0][0];
+    const double*const Pzz1d = Pzz[0][0];
+    //
+    assert_eq(speciesMoments.get_nx(), nxn);
+    assert_eq(speciesMoments.get_ny(), nyn);
+    assert_eq(speciesMoments.get_nz(), nzn);
+    const int numel = nxn*nyn*nzn;
+    #pragma omp critical
+    for(int i=0;i<numel;i++) rhons1d[i] += invVOL*rho1d[i];
+    #pragma omp critical
+    for(int i=0;i<numel;i++) Jxs1d  [i] += invVOL*Jx1d [i];
+    #pragma omp critical
+    for(int i=0;i<numel;i++) Jys1d  [i] += invVOL*Jy1d [i];
+    #pragma omp critical
+    for(int i=0;i<numel;i++) Jzs1d  [i] += invVOL*Jz1d [i];
+    #pragma omp critical
+    for(int i=0;i<numel;i++) pXXsn1d[i] += invVOL*Pxx1d[i];
+    #pragma omp critical
+    for(int i=0;i<numel;i++) pXYsn1d[i] += invVOL*Pxy1d[i];
+    #pragma omp critical
+    for(int i=0;i<numel;i++) pXZsn1d[i] += invVOL*Pxz1d[i];
+    #pragma omp critical
+    for(int i=0;i<numel;i++) pYYsn1d[i] += invVOL*Pyy1d[i];
+    #pragma omp critical
+    for(int i=0;i<numel;i++) pYZsn1d[i] += invVOL*Pyz1d[i];
+    #pragma omp critical
+    for(int i=0;i<numel;i++) pZZsn1d[i] += invVOL*Pzz1d[i];
+  }
+  communicateGhostP2G(is, 0, 0, 0, 0, vct);
 }
 
 /*! Calculate Electric field with the implicit solver: the Maxwell solver method is called here */
@@ -1119,16 +1374,16 @@ void EMfields3D::addToSpeciesMoments(const Moments & in, int is) {
   for (register int i = 0; i < nxn; i++) {
     for (register int j = 0; j < nyn; j++)
       for (register int k = 0; k < nzn; k++) {
-        rhons[is][i][j][k] += in.get_rho(i, j, k);
-        Jxs[is][i][j][k] += in.get_Jx(i, j, k);
-        Jys[is][i][j][k] += in.get_Jy(i, j, k);
-        Jzs[is][i][j][k] += in.get_Jz(i, j, k);
-        pXXsn[is][i][j][k] += in.get_pXX(i, j, k);
-        pXYsn[is][i][j][k] += in.get_pXY(i, j, k);
-        pXZsn[is][i][j][k] += in.get_pXZ(i, j, k);
-        pYYsn[is][i][j][k] += in.get_pYY(i, j, k);
-        pYZsn[is][i][j][k] += in.get_pYZ(i, j, k);
-        pZZsn[is][i][j][k] += in.get_pZZ(i, j, k);
+        rhons[is][i][j][k] += invVOL*in.get_rho(i, j, k);
+        Jxs  [is][i][j][k] += invVOL*in.get_Jx(i, j, k);
+        Jys  [is][i][j][k] += invVOL*in.get_Jy(i, j, k);
+        Jzs  [is][i][j][k] += invVOL*in.get_Jz(i, j, k);
+        pXXsn[is][i][j][k] += invVOL*in.get_pXX(i, j, k);
+        pXYsn[is][i][j][k] += invVOL*in.get_pXY(i, j, k);
+        pXZsn[is][i][j][k] += invVOL*in.get_pXZ(i, j, k);
+        pYYsn[is][i][j][k] += invVOL*in.get_pYY(i, j, k);
+        pYZsn[is][i][j][k] += invVOL*in.get_pYZ(i, j, k);
+        pZZsn[is][i][j][k] += invVOL*in.get_pZZ(i, j, k);
       }
   }
 }
diff --git a/fields/Moments.cpp b/fields/Moments.cpp
index 846ed9e6..5565929b 100644
--- a/fields/Moments.cpp
+++ b/fields/Moments.cpp
@@ -2,11 +2,11 @@
 #include "Alloc.h"
 
 // construct empty instance (not zeroed)
-void Moments::init(int nx_, int ny_, int nz_, double invVOL_) {
+void Moments::init(int nx_, int ny_, int nz_)
+{
   nx = nx_;
   ny = ny_;
   nz = nz_;
-  invVOL = invVOL_;
   rho = newArr3(double, nx, ny, nz);
   Jx = newArr3(double, nx, ny, nz);
   Jy = newArr3(double, nx, ny, nz);
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 2e2d9bd6..3d09049c 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -30,6 +30,7 @@ using std::endl;
 
 /*! Electromagnetic fields and sources defined for each local grid, and for an implicit maxwell's solver @date May 2008 @par Copyright: (C) 2008 KUL @author Stefano Markidis, Giovanni Lapenta. @version 3.0 */
 
+class Particles3Dcomm;
 class Moments;
 class EMfields3D                // :public Field
 {
@@ -114,6 +115,7 @@ class EMfields3D                // :public Field
 
     /*! communicate ghost for grid -> Particles interpolation */
     void communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, VirtualTopology3D * vct);
+    void sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
     /*! add accumulated moments to the moments for a given species */
     void addToSpeciesMoments(const Moments & in, int is);
     /*! add an amount of charge density to charge density field at node X,Y,Z */
diff --git a/include/Moments.h b/include/Moments.h
index d4349ecc..e90e01db 100644
--- a/include/Moments.h
+++ b/include/Moments.h
@@ -5,7 +5,6 @@
 // 
 class Moments {
   private:
-    double invVOL;
     double ***rho;
 
     /** current density, defined on nodes */
@@ -24,55 +23,38 @@ class Moments {
     int ny;
     int nz;
   public:
-    int get_nx() const {
-      return nx;
-    }
-    int get_ny() const {
-      return ny;
-    }
-    int get_nz() const {
-      return nz;
-    }
-    double get_invVOL() const {
-      return invVOL;
-    }
-    double get_rho(int i, int j, int k) const {
-      return rho[i][j][k];
-    }
-    double get_Jx(int i, int j, int k) const {
-      return Jx[i][j][k];
-    }
-    double get_Jy(int i, int j, int k) const {
-      return Jy[i][j][k];
-    }
-    double get_Jz(int i, int j, int k) const {
-      return Jz[i][j][k];
-    }
-    double get_pXX(int i, int j, int k) const {
-      return pXX[i][j][k];
-    }
-    double get_pXY(int i, int j, int k) const {
-      return pXY[i][j][k];
-    }
-    double get_pXZ(int i, int j, int k) const {
-      return pXZ[i][j][k];
-    }
-    double get_pYY(int i, int j, int k) const {
-      return pYY[i][j][k];
-    }
-    double get_pYZ(int i, int j, int k) const {
-      return pYZ[i][j][k];
-    }
-    double get_pZZ(int i, int j, int k) const {
-      return pZZ[i][j][k];
-    }
+    // get accessors (read access)
+    int get_nx() const { return nx; }
+    int get_ny() const { return ny; }
+    int get_nz() const { return nz; }
+    double get_rho(int i, int j, int k) const { return rho[i][j][k]; }
+    double get_Jx (int i, int j, int k) const { return Jx [i][j][k]; }
+    double get_Jy (int i, int j, int k) const { return Jy [i][j][k]; }
+    double get_Jz (int i, int j, int k) const { return Jz [i][j][k]; }
+    double get_pXX(int i, int j, int k) const { return pXX[i][j][k]; }
+    double get_pXY(int i, int j, int k) const { return pXY[i][j][k]; }
+    double get_pXZ(int i, int j, int k) const { return pXZ[i][j][k]; }
+    double get_pYY(int i, int j, int k) const { return pYY[i][j][k]; }
+    double get_pYZ(int i, int j, int k) const { return pYZ[i][j][k]; }
+    double get_pZZ(int i, int j, int k) const { return pZZ[i][j][k]; }
+    // fetch accessors (write access)
+    double*** fetch_rho() { return rho; }
+    double*** fetch_Jx () { return Jx ; }
+    double*** fetch_Jy () { return Jy ; }
+    double*** fetch_Jz () { return Jz ; }
+    double*** fetch_Pxx() { return pXX; }
+    double*** fetch_Pxy() { return pXY; }
+    double*** fetch_Pxz() { return pXZ; }
+    double*** fetch_Pyy() { return pYY; }
+    double*** fetch_Pyz() { return pYZ; }
+    double*** fetch_Pzz() { return pZZ; }
   public:
     Moments() {
     };
-    Moments(int nx_, int ny_, int nz_, double invVOL_){
-      init(nx_,ny_,nz_,invVOL_);
+    Moments(int nx_, int ny_, int nz_){
+      init(nx_,ny_,nz_);
     }
-    void init(int nx_, int ny_, int nz_, double invVOL_);
+    void init(int nx_, int ny_, int nz_);
     ~Moments();
     void set_to_zero();
     void addRho(double weight[][2][2], int X, int Y, int Z);
@@ -93,8 +75,7 @@ inline void Moments::addRho(double weight[][2][2], int X, int Y, int Z) {
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++)
       for (int k = 0; k < 2; k++) {
-        const double temp = weight[i][j][k] * invVOL;
-        rho[X - i][Y - j][Z - k] += temp;
+        rho[X - i][Y - j][Z - k] += weight[i][j][k];
       }
 }
 /** add an amount of charge density to current density - direction X to current density field on the node*/
@@ -102,8 +83,7 @@ inline void Moments::addJx(double weight[][2][2], int X, int Y, int Z) {
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++)
       for (int k = 0; k < 2; k++){
-        const double temp = weight[i][j][k] * invVOL;
-        Jx[X - i][Y - j][Z - k] += temp;
+        Jx[X - i][Y - j][Z - k] += weight[i][j][k];
       }
 }
 /** add an amount of current density - direction Y to current density field on the node */
@@ -111,8 +91,7 @@ inline void Moments::addJy(double weight[][2][2], int X, int Y, int Z) {
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++)
       for (int k = 0; k < 2; k++){
-        const double temp = weight[i][j][k] * invVOL;
-        Jy[X - i][Y - j][Z - k] += temp;
+        Jy[X - i][Y - j][Z - k] += weight[i][j][k];
       }
 }
 /** add an amount of current density - direction Z to current density field on the node */
@@ -120,8 +99,7 @@ inline void Moments::addJz(double weight[][2][2], int X, int Y, int Z) {
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++)
       for (int k = 0; k < 2; k++){
-        const double temp = weight[i][j][k] * invVOL;
-        Jz[X - i][Y - j][Z - k] += temp;
+        Jz[X - i][Y - j][Z - k] += weight[i][j][k];
       }
 }
 /** add an amount of pressure density - direction XX to current density field on the node */
@@ -129,8 +107,7 @@ inline void Moments::addPxx(double weight[][2][2], int X, int Y, int Z) {
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++)
       for (int k = 0; k < 2; k++){
-        const double temp = weight[i][j][k] * invVOL;
-        pXX[X - i][Y - j][Z - k] += temp;
+        pXX[X - i][Y - j][Z - k] += weight[i][j][k];
       }
 }
 /** add an amount of pressure density - direction XY to current density field on the node*/
@@ -138,8 +115,7 @@ inline void Moments::addPxy(double weight[][2][2], int X, int Y, int Z) {
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++)
       for (int k = 0; k < 2; k++){
-        const double temp = weight[i][j][k] * invVOL;
-        pXY[X - i][Y - j][Z - k] += temp;
+        pXY[X - i][Y - j][Z - k] += weight[i][j][k];
       }
 }
 /** add an amount of pressure density - direction XZ to current density field on the node */
@@ -147,8 +123,7 @@ inline void Moments::addPxz(double weight[][2][2], int X, int Y, int Z) {
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++)
       for (int k = 0; k < 2; k++){
-        const double temp = weight[i][j][k] * invVOL;
-        pXZ[X - i][Y - j][Z - k] += temp;
+        pXZ[X - i][Y - j][Z - k] += weight[i][j][k];
       }
 }
 /** add an amount of pressure density - direction YY to current density field on the node*/
@@ -156,8 +131,7 @@ inline void Moments::addPyy(double weight[][2][2], int X, int Y, int Z) {
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++)
       for (int k = 0; k < 2; k++){
-        const double temp = weight[i][j][k] * invVOL;
-        pYY[X - i][Y - j][Z - k] += temp;
+        pYY[X - i][Y - j][Z - k] += weight[i][j][k];
       }
 }
 /** add an amount of pressure density - direction YZ to current density field on the node */
@@ -165,8 +139,7 @@ inline void Moments::addPyz(double weight[][2][2], int X, int Y, int Z) {
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++)
       for (int k = 0; k < 2; k++){
-        const double temp = weight[i][j][k] * invVOL;
-        pYZ[X - i][Y - j][Z - k] += temp;
+        pYZ[X - i][Y - j][Z - k] += weight[i][j][k];
       }
 }
 /** add an amount of pressure density - direction ZZ to current density field on the node */
@@ -174,8 +147,7 @@ inline void Moments::addPzz(double weight[][2][2], int X, int Y, int Z) {
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++)
       for (int k = 0; k < 2; k++){
-        const double temp = weight[i][j][k] * invVOL;
-        pZZ[X - i][Y - j][Z - k] += temp;
+        pZZ[X - i][Y - j][Z - k] += weight[i][j][k];
       }
 }
 
diff --git a/include/Particles3D.h b/include/Particles3D.h
index 2c178918..fd89edd0 100644
--- a/include/Particles3D.h
+++ b/include/Particles3D.h
@@ -63,9 +63,9 @@ class Particles3D:public Particles3Dcomm {
     /** particle repopulator */
     int particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field* EMf);
     /** interpolation Particle->Grid only charge density, current */
-    void interpP2G_notP(Field * EMf, Grid * grid, VirtualTopology3D * vct);
+    //void interpP2G_notP(Field * EMf, Grid * grid, VirtualTopology3D * vct);
     /** interpolation Particle->Grid only for pressure tensor */
-    void interpP2G_onlyP(Field * EMf, Grid * grid, VirtualTopology3D * vct);
+    //void interpP2G_onlyP(Field * EMf, Grid * grid, VirtualTopology3D * vct);
     /*! Delete the particles inside the sphere with radius R and center x_center y_center and return the total charge removed */
     double deleteParticlesInsideSphere(double R, double x_center, double y_center, double z_center);
 
diff --git a/include/Particles3Dcomm.h b/include/Particles3Dcomm.h
index d69cf069..29600957 100644
--- a/include/Particles3Dcomm.h
+++ b/include/Particles3Dcomm.h
@@ -10,7 +10,7 @@ developers: Stefano Markidis, Giovanni Lapenta
 #include "Particles.h"
 /**
  * 
- * Abstract class for particles of the same species, in a 2D space and 3component velocity with communications methods
+ * class for particles of the same species with communications methods
  * @date Fri Jun 4 2007
  * @author Stefano Markidis, Giovanni Lapenta
  * @version 2.0
@@ -26,7 +26,7 @@ class Particles3Dcomm:public Particles {
   void allocate(int species, CollectiveIO * col, VirtualTopology3D * vct, Grid * grid);
 
   /** calculate the weights given the position of particles */
-  void calculateWeights(double weight[][2][2], double xp, double yp, double zp, int ix, int iy, int iz, Grid * grid);
+  //void calculateWeights(double weight[][2][2], double xp, double yp, double zp, int ix, int iy, int iz, Grid * grid);
   /** interpolation method GRID->PARTICLE order 1: CIC */
   void interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vct);
   /** method for communicating exiting particles to X-RIGHT, X-LEFT, Y-RIGHT, Y-LEFT, Z-RIGHT, Z-LEFT processes */
@@ -104,8 +104,12 @@ class Particles3Dcomm:public Particles {
   /** Print the number of particles of this subdomain */
   void PrintNp(VirtualTopology3D * ptVCT) const;
 
+public:
+  // accessors
+  int get_ns()const{return ns;}
+
 protected:
-  /** number of species */
+  /** number of this species */
   int ns;
   /** maximum number of particles of this species on this domain. used for memory allocation */
   long long npmax;
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 418fc8ce..852817b5 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -175,7 +175,9 @@ void c_Solver::CalculateField() {
 
   for (int i = 0; i < ns; i++)
   {
-    part[i].interpP2G(EMf, grid, vct);      // interpolate Particles to Grid(Nodes)
+    // interpolate particles to grid nodes
+    EMf->sumMoments(part[i], grid, vct);
+    //part[i].interpP2G(EMf, grid, vct);
   }
 
   EMf->sumOverSpecies(vct);                 // sum all over the species
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 2be55cfb..6b55f301 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -907,78 +907,78 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
 }
 
 /** interpolation Particle->Grid only for pressure tensor */
-void Particles3D::interpP2G_onlyP(Field * EMf, Grid * grid, VirtualTopology3D * vct) {
-  double weight[2][2][2];
-  double temp[2][2][2];
-  int ix, iy, iz, temp1, temp2, temp3;
-  for (register long long i = 0; i < nop; i++) {
-    ix = 2 + int (floor((x[i] - grid->getXstart()) / grid->getDX()));
-    iy = 2 + int (floor((y[i] - grid->getYstart()) / grid->getDY()));
-    iz = 2 + int (floor((z[i] - grid->getZstart()) / grid->getDZ()));
-    calculateWeights(weight, x[i], y[i], z[i], ix, iy, iz, grid);
-    scale(weight, q[i], 2, 2, 2);
-    // Pxx
-    eqValue(0.0, temp, 2, 2, 2);
-    addscale(u[i] * u[i], temp, weight, 2, 2, 2);
-    EMf->addPxx(temp, ix, iy, iz, ns);
-    // Pxy
-    eqValue(0.0, temp, 2, 2, 2);
-    addscale(u[i] * v[i], temp, weight, 2, 2, 2);
-    EMf->addPxy(temp, ix, iy, iz, ns);
-    // Pxz
-    eqValue(0.0, temp, 2, 2, 2);
-    addscale(u[i] * w[i], temp, weight, 2, 2, 2);
-    EMf->addPxz(temp, ix, iy, iz, ns);
-    // Pyy
-    eqValue(0.0, temp, 2, 2, 2);
-    addscale(v[i] * v[i], temp, weight, 2, 2, 2);
-    EMf->addPyy(temp, ix, iy, iz, ns);
-    // Pyz
-    eqValue(0.0, temp, 2, 2, 2);
-    addscale(v[i] * w[i], temp, weight, 2, 2, 2);
-    EMf->addPyz(temp, ix, iy, iz, ns);
-    // Pzz
-    eqValue(0.0, temp, 2, 2, 2);
-    addscale(w[i] * w[i], temp, weight, 2, 2, 2);
-    EMf->addPzz(temp, ix, iy, iz, ns);
-  }
-}
+//void Particles3D::interpP2G_onlyP(Field * EMf, Grid * grid, VirtualTopology3D * vct) {
+//  double weight[2][2][2];
+//  double temp[2][2][2];
+//  int ix, iy, iz, temp1, temp2, temp3;
+//  for (register long long i = 0; i < nop; i++) {
+//    ix = 2 + int (floor((x[i] - grid->getXstart()) / grid->getDX()));
+//    iy = 2 + int (floor((y[i] - grid->getYstart()) / grid->getDY()));
+//    iz = 2 + int (floor((z[i] - grid->getZstart()) / grid->getDZ()));
+//    calculateWeights(weight, x[i], y[i], z[i], ix, iy, iz, grid);
+//    scale(weight, q[i], 2, 2, 2);
+//    // Pxx
+//    eqValue(0.0, temp, 2, 2, 2);
+//    addscale(u[i] * u[i], temp, weight, 2, 2, 2);
+//    EMf->addPxx(temp, ix, iy, iz, ns);
+//    // Pxy
+//    eqValue(0.0, temp, 2, 2, 2);
+//    addscale(u[i] * v[i], temp, weight, 2, 2, 2);
+//    EMf->addPxy(temp, ix, iy, iz, ns);
+//    // Pxz
+//    eqValue(0.0, temp, 2, 2, 2);
+//    addscale(u[i] * w[i], temp, weight, 2, 2, 2);
+//    EMf->addPxz(temp, ix, iy, iz, ns);
+//    // Pyy
+//    eqValue(0.0, temp, 2, 2, 2);
+//    addscale(v[i] * v[i], temp, weight, 2, 2, 2);
+//    EMf->addPyy(temp, ix, iy, iz, ns);
+//    // Pyz
+//    eqValue(0.0, temp, 2, 2, 2);
+//    addscale(v[i] * w[i], temp, weight, 2, 2, 2);
+//    EMf->addPyz(temp, ix, iy, iz, ns);
+//    // Pzz
+//    eqValue(0.0, temp, 2, 2, 2);
+//    addscale(w[i] * w[i], temp, weight, 2, 2, 2);
+//    EMf->addPzz(temp, ix, iy, iz, ns);
+//  }
+//}
 /** interpolation Particle->Grid only charge density, current */
-void Particles3D::interpP2G_notP(Field * EMf, Grid * grid, VirtualTopology3D * vct) {
-  double weight[2][2][2];
-  double temp[2][2][2];
-  int ix, iy, iz, temp2, temp1, temp3;
-  for (register long long i = 0; i < nop; i++) {
-    ix = 2 + int (floor((x[i] - grid->getXstart()) / grid->getDX()));
-    iy = 2 + int (floor((y[i] - grid->getYstart()) / grid->getDY()));
-    iz = 2 + int (floor((z[i] - grid->getZstart()) / grid->getDZ()));
-    temp1 = (int) min(ix, nxn - 2);
-    temp2 = (int) min(iy, nyn - 2);
-    temp3 = (int) min(iz, nzn - 2);
-    ix = (int) max(temp1, 2);
-    iy = (int) max(temp2, 2);
-    iz = (int) max(temp3, 2);
-    calculateWeights(weight, x[i], y[i], z[i], ix, iy, iz, grid);
-    scale(weight, q[i], 2, 2, 2);
-    // rho
-    EMf->addRho(weight, ix, iy, iz, ns);
-    // Jx
-    eqValue(0.0, temp, 2, 2, 2);
-    addscale(u[i], temp, weight, 2, 2, 2);
-    EMf->addJx(temp, ix, iy, iz, ns);
-    // Jy
-    eqValue(0.0, temp, 2, 2, 2);
-    addscale(v[i], temp, weight, 2, 2, 2);
-    EMf->addJy(temp, ix, iy, iz, ns);
-    // Jz
-    eqValue(0.0, temp, 2, 2, 2);
-    addscale(w[i], temp, weight, 2, 2, 2);
-    EMf->addJz(temp, ix, iy, iz, ns);
-
-  }
-  // communicate contribution from ghost cells 
-  EMf->communicateGhostP2G(ns, 0, 0, 0, 0, vct);
-}
+//void Particles3D::interpP2G_notP(Field * EMf, Grid * grid, VirtualTopology3D * vct) {
+//  double weight[2][2][2];
+//  double temp[2][2][2];
+//  int ix, iy, iz, temp2, temp1, temp3;
+//  for (register long long i = 0; i < nop; i++) {
+//    ix = 2 + int (floor((x[i] - grid->getXstart()) / grid->getDX()));
+//    iy = 2 + int (floor((y[i] - grid->getYstart()) / grid->getDY()));
+//    iz = 2 + int (floor((z[i] - grid->getZstart()) / grid->getDZ()));
+//    temp1 = (int) min(ix, nxn - 2);
+//    temp2 = (int) min(iy, nyn - 2);
+//    temp3 = (int) min(iz, nzn - 2);
+//    ix = (int) max(temp1, 2);
+//    iy = (int) max(temp2, 2);
+//    iz = (int) max(temp3, 2);
+//    calculateWeights(weight, x[i], y[i], z[i], ix, iy, iz, grid);
+//    scale(weight, q[i], 2, 2, 2);
+//    // rho
+//    EMf->addRho(weight, ix, iy, iz, ns);
+//    // Jx
+//    eqValue(0.0, temp, 2, 2, 2);
+//    addscale(u[i], temp, weight, 2, 2, 2);
+//    EMf->addJx(temp, ix, iy, iz, ns);
+//    // Jy
+//    eqValue(0.0, temp, 2, 2, 2);
+//    addscale(v[i], temp, weight, 2, 2, 2);
+//    EMf->addJy(temp, ix, iy, iz, ns);
+//    // Jz
+//    eqValue(0.0, temp, 2, 2, 2);
+//    addscale(w[i], temp, weight, 2, 2, 2);
+//    EMf->addJz(temp, ix, iy, iz, ns);
+//
+//  }
+//  // communicate contribution from ghost cells 
+//  EMf->communicateGhostP2G(ns, 0, 0, 0, 0, vct);
+//}
 /** apply a linear perturbation to particle distribution */
 void Particles3D::linear_perturbation(double deltaBoB, double kx, double ky, double angle, double omega_r, double omega_i, double Ex_mod, double Ex_phase, double Ey_mod, double Ey_phase, double Ez_mod, double Ez_phase, double Bx_mod, double Bx_phase, double By_mod, double By_phase, double Bz_mod, double Bz_phase, Grid * grid, Field * EMf, VirtualTopology3D * vct) {
 
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index d339894d..82ad7b23 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -282,22 +282,23 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
 
 }
 /** calculate the weights given the position of particles 0,0,0 is the left,left, left node */
-void Particles3Dcomm::calculateWeights(double weight[][2][2], double xp, double yp, double zp, int ix, int iy, int iz, Grid * grid) {
-  double xi[2], eta[2], zeta[2];
-  xi[0] = xp - grid->getXN(ix - 1, iy, iz);
-  eta[0] = yp - grid->getYN(ix, iy - 1, iz);
-  zeta[0] = zp - grid->getZN(ix, iy, iz - 1);
-  xi[1] = grid->getXN(ix, iy, iz) - xp;
-  eta[1] = grid->getYN(ix, iy, iz) - yp;
-  zeta[1] = grid->getZN(ix, iy, iz) - zp;
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++)
-        weight[i][j][k] = xi[i] * eta[j] * zeta[k] * invVOL;
-}
+//void Particles3Dcomm::calculateWeights(double weight[][2][2], double xp, double yp, double zp, int ix, int iy, int iz, Grid * grid) {
+//  double xi[2], eta[2], zeta[2];
+//  xi[0] = xp - grid->getXN(ix - 1, iy, iz);
+//  eta[0] = yp - grid->getYN(ix, iy - 1, iz);
+//  zeta[0] = zp - grid->getZN(ix, iy, iz - 1);
+//  xi[1] = grid->getXN(ix, iy, iz) - xp;
+//  eta[1] = grid->getYN(ix, iy, iz) - yp;
+//  zeta[1] = grid->getZN(ix, iy, iz) - zp;
+//  for (int i = 0; i < 2; i++)
+//    for (int j = 0; j < 2; j++)
+//      for (int k = 0; k < 2; k++)
+//        weight[i][j][k] = xi[i] * eta[j] * zeta[k] * invVOL;
+//}
 
 
-/** Interpolation Particle --> Grid */
+// move this to EMfields3D class
+//
 void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vct) {
   const double inv_dx = 1.0 / dx;
   const double inv_dy = 1.0 / dy;

From 8187fbdbc45bf57ffc396d17980c103a04823874 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 23 Jul 2013 10:20:43 +0200
Subject: [PATCH 021/118] remove code commented in previous commit (iss #41)

---
 particles/Particles3D.cpp     | 73 -----------------------------------
 particles/Particles3Dcomm.cpp | 22 -----------
 2 files changed, 95 deletions(-)

diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 6b55f301..cc83f3f0 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -906,79 +906,6 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
   return(0); // exit succcesfully (hopefully)
 }
 
-/** interpolation Particle->Grid only for pressure tensor */
-//void Particles3D::interpP2G_onlyP(Field * EMf, Grid * grid, VirtualTopology3D * vct) {
-//  double weight[2][2][2];
-//  double temp[2][2][2];
-//  int ix, iy, iz, temp1, temp2, temp3;
-//  for (register long long i = 0; i < nop; i++) {
-//    ix = 2 + int (floor((x[i] - grid->getXstart()) / grid->getDX()));
-//    iy = 2 + int (floor((y[i] - grid->getYstart()) / grid->getDY()));
-//    iz = 2 + int (floor((z[i] - grid->getZstart()) / grid->getDZ()));
-//    calculateWeights(weight, x[i], y[i], z[i], ix, iy, iz, grid);
-//    scale(weight, q[i], 2, 2, 2);
-//    // Pxx
-//    eqValue(0.0, temp, 2, 2, 2);
-//    addscale(u[i] * u[i], temp, weight, 2, 2, 2);
-//    EMf->addPxx(temp, ix, iy, iz, ns);
-//    // Pxy
-//    eqValue(0.0, temp, 2, 2, 2);
-//    addscale(u[i] * v[i], temp, weight, 2, 2, 2);
-//    EMf->addPxy(temp, ix, iy, iz, ns);
-//    // Pxz
-//    eqValue(0.0, temp, 2, 2, 2);
-//    addscale(u[i] * w[i], temp, weight, 2, 2, 2);
-//    EMf->addPxz(temp, ix, iy, iz, ns);
-//    // Pyy
-//    eqValue(0.0, temp, 2, 2, 2);
-//    addscale(v[i] * v[i], temp, weight, 2, 2, 2);
-//    EMf->addPyy(temp, ix, iy, iz, ns);
-//    // Pyz
-//    eqValue(0.0, temp, 2, 2, 2);
-//    addscale(v[i] * w[i], temp, weight, 2, 2, 2);
-//    EMf->addPyz(temp, ix, iy, iz, ns);
-//    // Pzz
-//    eqValue(0.0, temp, 2, 2, 2);
-//    addscale(w[i] * w[i], temp, weight, 2, 2, 2);
-//    EMf->addPzz(temp, ix, iy, iz, ns);
-//  }
-//}
-/** interpolation Particle->Grid only charge density, current */
-//void Particles3D::interpP2G_notP(Field * EMf, Grid * grid, VirtualTopology3D * vct) {
-//  double weight[2][2][2];
-//  double temp[2][2][2];
-//  int ix, iy, iz, temp2, temp1, temp3;
-//  for (register long long i = 0; i < nop; i++) {
-//    ix = 2 + int (floor((x[i] - grid->getXstart()) / grid->getDX()));
-//    iy = 2 + int (floor((y[i] - grid->getYstart()) / grid->getDY()));
-//    iz = 2 + int (floor((z[i] - grid->getZstart()) / grid->getDZ()));
-//    temp1 = (int) min(ix, nxn - 2);
-//    temp2 = (int) min(iy, nyn - 2);
-//    temp3 = (int) min(iz, nzn - 2);
-//    ix = (int) max(temp1, 2);
-//    iy = (int) max(temp2, 2);
-//    iz = (int) max(temp3, 2);
-//    calculateWeights(weight, x[i], y[i], z[i], ix, iy, iz, grid);
-//    scale(weight, q[i], 2, 2, 2);
-//    // rho
-//    EMf->addRho(weight, ix, iy, iz, ns);
-//    // Jx
-//    eqValue(0.0, temp, 2, 2, 2);
-//    addscale(u[i], temp, weight, 2, 2, 2);
-//    EMf->addJx(temp, ix, iy, iz, ns);
-//    // Jy
-//    eqValue(0.0, temp, 2, 2, 2);
-//    addscale(v[i], temp, weight, 2, 2, 2);
-//    EMf->addJy(temp, ix, iy, iz, ns);
-//    // Jz
-//    eqValue(0.0, temp, 2, 2, 2);
-//    addscale(w[i], temp, weight, 2, 2, 2);
-//    EMf->addJz(temp, ix, iy, iz, ns);
-//
-//  }
-//  // communicate contribution from ghost cells 
-//  EMf->communicateGhostP2G(ns, 0, 0, 0, 0, vct);
-//}
 /** apply a linear perturbation to particle distribution */
 void Particles3D::linear_perturbation(double deltaBoB, double kx, double ky, double angle, double omega_r, double omega_i, double Ex_mod, double Ex_phase, double Ey_mod, double Ey_phase, double Ez_mod, double Ez_phase, double Bx_mod, double Bx_phase, double By_mod, double By_phase, double Bz_mod, double Bz_phase, Grid * grid, Field * EMf, VirtualTopology3D * vct) {
 
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 82ad7b23..2867068f 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -281,20 +281,6 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   }
 
 }
-/** calculate the weights given the position of particles 0,0,0 is the left,left, left node */
-//void Particles3Dcomm::calculateWeights(double weight[][2][2], double xp, double yp, double zp, int ix, int iy, int iz, Grid * grid) {
-//  double xi[2], eta[2], zeta[2];
-//  xi[0] = xp - grid->getXN(ix - 1, iy, iz);
-//  eta[0] = yp - grid->getYN(ix, iy - 1, iz);
-//  zeta[0] = zp - grid->getZN(ix, iy, iz - 1);
-//  xi[1] = grid->getXN(ix, iy, iz) - xp;
-//  eta[1] = grid->getYN(ix, iy, iz) - yp;
-//  zeta[1] = grid->getZN(ix, iy, iz) - zp;
-//  for (int i = 0; i < 2; i++)
-//    for (int j = 0; j < 2; j++)
-//      for (int k = 0; k < 2; k++)
-//        weight[i][j][k] = xi[i] * eta[j] * zeta[k] * invVOL;
-//}
 
 
 // move this to EMfields3D class
@@ -338,14 +324,6 @@ void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vc
           for (int kk = 0; kk < 2; kk++) {
             weight[ii][jj][kk] = q[i] * xi[ii] * eta[jj] * zeta[kk] * invVOL;
           }
-      //weight[0][0][0] = q[i] * xi[0] * eta[0] * zeta[0] * invVOL;
-      //weight[0][0][1] = q[i] * xi[0] * eta[0] * zeta[1] * invVOL;
-      //weight[0][1][0] = q[i] * xi[0] * eta[1] * zeta[0] * invVOL;
-      //weight[0][1][1] = q[i] * xi[0] * eta[1] * zeta[1] * invVOL;
-      //weight[1][0][0] = q[i] * xi[1] * eta[0] * zeta[0] * invVOL;
-      //weight[1][0][1] = q[i] * xi[1] * eta[0] * zeta[1] * invVOL;
-      //weight[1][1][0] = q[i] * xi[1] * eta[1] * zeta[0] * invVOL;
-      //weight[1][1][1] = q[i] * xi[1] * eta[1] * zeta[1] * invVOL;
       // add charge density
       speciesMoments.addRho(weight, ix, iy, iz);
       // add current density - X

From c3b88b1c27d892e58970ca3ae0115083f0ddf178 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 23 Jul 2013 10:27:07 +0200
Subject: [PATCH 022/118] removed unused code obsoleted under iss#41

---
 include/Moments.h             |  92 ---------------------------
 main/iPic3Dlib.cpp            |   2 +-
 particles/Particles3Dcomm.cpp | 114 ++++------------------------------
 3 files changed, 13 insertions(+), 195 deletions(-)

diff --git a/include/Moments.h b/include/Moments.h
index e90e01db..cb0018aa 100644
--- a/include/Moments.h
+++ b/include/Moments.h
@@ -57,98 +57,6 @@ class Moments {
     void init(int nx_, int ny_, int nz_);
     ~Moments();
     void set_to_zero();
-    void addRho(double weight[][2][2], int X, int Y, int Z);
-    void addJx(double weight[][2][2], int X, int Y, int Z);
-    void addJy(double weight[][2][2], int X, int Y, int Z);
-    void addJz(double weight[][2][2], int X, int Y, int Z);
-
-    void addPxx(double weight[][2][2], int X, int Y, int Z);
-    void addPxy(double weight[][2][2], int X, int Y, int Z);
-    void addPxz(double weight[][2][2], int X, int Y, int Z);
-    void addPyy(double weight[][2][2], int X, int Y, int Z);
-    void addPyz(double weight[][2][2], int X, int Y, int Z);
-    void addPzz(double weight[][2][2], int X, int Y, int Z);
 };
 
-/** add an amount of charge density to charge density field at node X,Y */
-inline void Moments::addRho(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++) {
-        rho[X - i][Y - j][Z - k] += weight[i][j][k];
-      }
-}
-/** add an amount of charge density to current density - direction X to current density field on the node*/
-inline void Moments::addJx(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++){
-        Jx[X - i][Y - j][Z - k] += weight[i][j][k];
-      }
-}
-/** add an amount of current density - direction Y to current density field on the node */
-inline void Moments::addJy(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++){
-        Jy[X - i][Y - j][Z - k] += weight[i][j][k];
-      }
-}
-/** add an amount of current density - direction Z to current density field on the node */
-inline void Moments::addJz(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++){
-        Jz[X - i][Y - j][Z - k] += weight[i][j][k];
-      }
-}
-/** add an amount of pressure density - direction XX to current density field on the node */
-inline void Moments::addPxx(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++){
-        pXX[X - i][Y - j][Z - k] += weight[i][j][k];
-      }
-}
-/** add an amount of pressure density - direction XY to current density field on the node*/
-inline void Moments::addPxy(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++){
-        pXY[X - i][Y - j][Z - k] += weight[i][j][k];
-      }
-}
-/** add an amount of pressure density - direction XZ to current density field on the node */
-inline void Moments::addPxz(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++){
-        pXZ[X - i][Y - j][Z - k] += weight[i][j][k];
-      }
-}
-/** add an amount of pressure density - direction YY to current density field on the node*/
-inline void Moments::addPyy(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++){
-        pYY[X - i][Y - j][Z - k] += weight[i][j][k];
-      }
-}
-/** add an amount of pressure density - direction YZ to current density field on the node */
-inline void Moments::addPyz(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++){
-        pYZ[X - i][Y - j][Z - k] += weight[i][j][k];
-      }
-}
-/** add an amount of pressure density - direction ZZ to current density field on the node */
-inline void Moments::addPzz(double weight[][2][2], int X, int Y, int Z) {
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++)
-      for (int k = 0; k < 2; k++){
-        pZZ[X - i][Y - j][Z - k] += weight[i][j][k];
-      }
-}
-
 #endif
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 852817b5..b05ad584 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -177,7 +177,7 @@ void c_Solver::CalculateField() {
   {
     // interpolate particles to grid nodes
     EMf->sumMoments(part[i], grid, vct);
-    //part[i].interpP2G(EMf, grid, vct);
+    //part[i].interpP2G(EMf, grid, vct); // the old, slow way.
   }
 
   EMf->sumOverSpecies(vct);                 // sum all over the species
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 2867068f..3e5c1ede 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -283,7 +283,7 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
 }
 
 
-// move this to EMfields3D class
+// A much faster version of this is at EMfields3D::sumMoments
 //
 void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vct) {
   const double inv_dx = 1.0 / dx;
@@ -297,14 +297,8 @@ void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vc
   // could first apply an efficient parallel sorting algorithm
   // to the particles and then accumulate moments in smaller
   // subarrays.
-  //#ifdef _OPENMP
-  #pragma omp parallel
   {
-    int thread_num = omp_get_thread_num();
-    Moments& speciesMoments = EMf->fetch_momentsArray(thread_num);
-    //Moments speciesMoments(nxn,nyn,nzn,invVOL);
-    speciesMoments.set_to_zero();
-    #pragma omp for
+    assert_le(nop,INT_MAX); // else would need to use long long
     for (int i = 0; i < nop; i++)
     {
       const int ix = 2 + int (floor((x[i] - xstart) * inv_dx));
@@ -325,147 +319,63 @@ void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vc
             weight[ii][jj][kk] = q[i] * xi[ii] * eta[jj] * zeta[kk] * invVOL;
           }
       // add charge density
-      speciesMoments.addRho(weight, ix, iy, iz);
+      EMf->addRho(weight, ix, iy, iz, ns);
       // add current density - X
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = u[i] * weight[ii][jj][kk];
-      speciesMoments.addJx(temp, ix, iy, iz);
+      EMf->addJx(temp, ix, iy, iz, ns);
       // add current density - Y
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = v[i] * weight[ii][jj][kk];
-      speciesMoments.addJy(temp, ix, iy, iz);
+      EMf->addJy(temp, ix, iy, iz, ns);
       // add current density - Z
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = w[i] * weight[ii][jj][kk];
-      speciesMoments.addJz(temp, ix, iy, iz);
+      EMf->addJz(temp, ix, iy, iz, ns);
       // Pxx - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = u[i] * u[i] * weight[ii][jj][kk];
-      speciesMoments.addPxx(temp, ix, iy, iz);
+      EMf->addPxx(temp, ix, iy, iz, ns);
       // Pxy - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = u[i] * v[i] * weight[ii][jj][kk];
-      speciesMoments.addPxy(temp, ix, iy, iz);
+      EMf->addPxy(temp, ix, iy, iz, ns);
       // Pxz - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = u[i] * w[i] * weight[ii][jj][kk];
-      speciesMoments.addPxz(temp, ix, iy, iz);
+      EMf->addPxz(temp, ix, iy, iz, ns);
       // Pyy - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = v[i] * v[i] * weight[ii][jj][kk];
-      speciesMoments.addPyy(temp, ix, iy, iz);
+      EMf->addPyy(temp, ix, iy, iz, ns);
       // Pyz - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = v[i] * w[i] * weight[ii][jj][kk];
-      speciesMoments.addPyz(temp, ix, iy, iz);
+      EMf->addPyz(temp, ix, iy, iz, ns);
       // Pzz - add pressure tensor
       for (int ii = 0; ii < 2; ii++)
         for (int jj = 0; jj < 2; jj++)
           for (int kk = 0; kk < 2; kk++)
             temp[ii][jj][kk] = w[i] * w[i] * weight[ii][jj][kk];
-      speciesMoments.addPzz(temp, ix, iy, iz);
+      EMf->addPzz(temp, ix, iy, iz, ns);
     }
-    // change this to allow more parallelization after implementing array class
-    #pragma omp critical
-    EMf->addToSpeciesMoments(speciesMoments,ns);
   }
-  //#else
-  //{
-  //  assert_le(nop,INT_MAX); // else would need to use long long
-  //  for (int i = 0; i < nop; i++)
-  //  {
-  //    const int ix = 2 + int (floor((x[i] - xstart) * inv_dx));
-  //    const int iy = 2 + int (floor((y[i] - ystart) * inv_dy));
-  //    const int iz = 2 + int (floor((z[i] - zstart) * inv_dz));
-  //    double temp[2][2][2];
-  //    double xi[2], eta[2], zeta[2];
-  //    xi[0] = x[i] - grid->getXN(ix - 1, iy, iz);
-  //    eta[0] = y[i] - grid->getYN(ix, iy - 1, iz);
-  //    zeta[0] = z[i] - grid->getZN(ix, iy, iz - 1);
-  //    xi[1] = grid->getXN(ix, iy, iz) - x[i];
-  //    eta[1] = grid->getYN(ix, iy, iz) - y[i];
-  //    zeta[1] = grid->getZN(ix, iy, iz) - z[i];
-  //    double weight[2][2][2];
-  //    for (int ii = 0; ii < 2; ii++)
-  //      for (int jj = 0; jj < 2; jj++)
-  //        for (int kk = 0; kk < 2; kk++) {
-  //          weight[ii][jj][kk] = q[i] * xi[ii] * eta[jj] * zeta[kk] * invVOL;
-  //        }
-  //    // add charge density
-  //    EMf->addRho(weight, ix, iy, iz, ns);
-  //    // add current density - X
-  //    for (int ii = 0; ii < 2; ii++)
-  //      for (int jj = 0; jj < 2; jj++)
-  //        for (int kk = 0; kk < 2; kk++)
-  //          temp[ii][jj][kk] = u[i] * weight[ii][jj][kk];
-  //    EMf->addJx(temp, ix, iy, iz, ns);
-  //    // add current density - Y
-  //    for (int ii = 0; ii < 2; ii++)
-  //      for (int jj = 0; jj < 2; jj++)
-  //        for (int kk = 0; kk < 2; kk++)
-  //          temp[ii][jj][kk] = v[i] * weight[ii][jj][kk];
-  //    EMf->addJy(temp, ix, iy, iz, ns);
-  //    // add current density - Z
-  //    for (int ii = 0; ii < 2; ii++)
-  //      for (int jj = 0; jj < 2; jj++)
-  //        for (int kk = 0; kk < 2; kk++)
-  //          temp[ii][jj][kk] = w[i] * weight[ii][jj][kk];
-  //    EMf->addJz(temp, ix, iy, iz, ns);
-  //    // Pxx - add pressure tensor
-  //    for (int ii = 0; ii < 2; ii++)
-  //      for (int jj = 0; jj < 2; jj++)
-  //        for (int kk = 0; kk < 2; kk++)
-  //          temp[ii][jj][kk] = u[i] * u[i] * weight[ii][jj][kk];
-  //    EMf->addPxx(temp, ix, iy, iz, ns);
-  //    // Pxy - add pressure tensor
-  //    for (int ii = 0; ii < 2; ii++)
-  //      for (int jj = 0; jj < 2; jj++)
-  //        for (int kk = 0; kk < 2; kk++)
-  //          temp[ii][jj][kk] = u[i] * v[i] * weight[ii][jj][kk];
-  //    EMf->addPxy(temp, ix, iy, iz, ns);
-  //    // Pxz - add pressure tensor
-  //    for (int ii = 0; ii < 2; ii++)
-  //      for (int jj = 0; jj < 2; jj++)
-  //        for (int kk = 0; kk < 2; kk++)
-  //          temp[ii][jj][kk] = u[i] * w[i] * weight[ii][jj][kk];
-  //    EMf->addPxz(temp, ix, iy, iz, ns);
-  //    // Pyy - add pressure tensor
-  //    for (int ii = 0; ii < 2; ii++)
-  //      for (int jj = 0; jj < 2; jj++)
-  //        for (int kk = 0; kk < 2; kk++)
-  //          temp[ii][jj][kk] = v[i] * v[i] * weight[ii][jj][kk];
-  //    EMf->addPyy(temp, ix, iy, iz, ns);
-  //    // Pyz - add pressure tensor
-  //    for (int ii = 0; ii < 2; ii++)
-  //      for (int jj = 0; jj < 2; jj++)
-  //        for (int kk = 0; kk < 2; kk++)
-  //          temp[ii][jj][kk] = v[i] * w[i] * weight[ii][jj][kk];
-  //    EMf->addPyz(temp, ix, iy, iz, ns);
-  //    // Pzz - add pressure tensor
-  //    for (int ii = 0; ii < 2; ii++)
-  //      for (int jj = 0; jj < 2; jj++)
-  //        for (int kk = 0; kk < 2; kk++)
-  //          temp[ii][jj][kk] = w[i] * w[i] * weight[ii][jj][kk];
-  //    EMf->addPzz(temp, ix, iy, iz, ns);
-  //  }
-  //}
-  //#endif
   // communicate contribution from ghost cells 
   EMf->communicateGhostP2G(ns, 0, 0, 0, 0, vct);
 }

From 8556e6c69183e2e3db9f9c21b3611ea97b5ef5af Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 1 Aug 2013 14:23:28 +0200
Subject: [PATCH 023/118] issue #42: Support -fno-exceptions: replacing throw
 with eprintf

---
 ConfigFile/src/ConfigFile.cpp  |   4 +-
 ConfigFile/src/ConfigFile.h    |   4 +-
 PSKOutput3D/PSKhdf5adaptor.cpp | 300 ++++++++++++++++++---------------
 include/ConfigFile.h           |   4 +-
 include/PSKOutput.h            |  41 +++--
 5 files changed, 199 insertions(+), 154 deletions(-)

diff --git a/ConfigFile/src/ConfigFile.cpp b/ConfigFile/src/ConfigFile.cpp
index 7f6f1f99..ed9f6b53 100644
--- a/ConfigFile/src/ConfigFile.cpp
+++ b/ConfigFile/src/ConfigFile.cpp
@@ -1,6 +1,7 @@
 // ConfigFile.cpp
 
 #include "ConfigFile.h"
+#include "errors.h"
 
 using std::string;
 
@@ -11,7 +12,8 @@ ConfigFile::ConfigFile(string filename, string delimiter, string comment, string
   std::ifstream in(filename.c_str());
 
   if (!in)
-    throw file_not_found(filename);
+    eprintf("file not found: %s", filename.c_str());
+    //throw file_not_found(filename);
 
   in >> (*this);
 }
diff --git a/ConfigFile/src/ConfigFile.h b/ConfigFile/src/ConfigFile.h
index 4de95342..d8d8108a 100644
--- a/ConfigFile/src/ConfigFile.h
+++ b/ConfigFile/src/ConfigFile.h
@@ -48,6 +48,7 @@
 #include <iostream>
 #include <fstream>
 #include <sstream>
+#include "errors.h" // for eprintf
 
 using std::string;
 
@@ -175,7 +176,8 @@ template < class T > T ConfigFile::read(const string & key) const {
   // Read the value corresponding to key
   mapci p = myContents.find(key);
   if (p == myContents.end())
-    throw key_not_found(key);
+    eprintf("key not found: %s", key.c_str());
+    //throw key_not_found(key);
   return string_as_T < T > (p->second);
 }
 
diff --git a/PSKOutput3D/PSKhdf5adaptor.cpp b/PSKOutput3D/PSKhdf5adaptor.cpp
index 33c83115..8a4d2d14 100644
--- a/PSKOutput3D/PSKhdf5adaptor.cpp
+++ b/PSKOutput3D/PSKhdf5adaptor.cpp
@@ -1,5 +1,6 @@
 
 #include <mpi.h>
+#include "errors.h"
 #include "PSKhdf5adaptor.h"
 
 using namespace PSK;
@@ -19,8 +20,8 @@ void HDF5OutputAdaptor::get_dataset_context(const std::string & name, std::vecto
   hid_array.resize(ncompx);
 
   if (ncompx == 0) {
-    throw PSK::OutputException("HDF5OutputAdaptor::get_dataset_context()>> zero name components");
-
+    //throw PSK::OutputException("HDF5OutputAdaptor::get_dataset_context()>> zero name components");
+    eprintf(">> zero name components");
   }
   else if (ncompx == 1) {
     hid_array[0] = _hdf5_file_id;
@@ -59,7 +60,8 @@ void HDF5OutputAdaptor::get_dataset_context(const std::string & name, std::vecto
 
           // std::cout << "group create failed \n" ;
 
-          throw PSK::OutputException("Failed to open/create group for <" + name + "> at element <" + name_components[i] + ">", "HDF5OutputAdaptor::get_dataset_context()");
+          //throw PSK::OutputException("Failed to open/create group for <" + name + "> at element <" + name_components[i] + ">", "HDF5OutputAdaptor::get_dataset_context()");
+          eprintf("Failed to open/create group for <%s> at element <%s>", name.c_str(), name_components[i].c_str());
         }
 
       }
@@ -78,7 +80,8 @@ void HDF5OutputAdaptor::get_dataset_context(const std::string & name, std::vecto
  */
 std::string HDF5OutputAdaptor::purify_object_name(const std::string & objname) {
   if (objname.length() == 0)
-    throw PSK::OutputException("Zero length tag name", "HDF5OutputAdaptor::purify_object_name()");
+    //throw PSK::OutputException("Zero length tag name", "HDF5OutputAdaptor::purify_object_name()");
+    eprintf("Zero length tag name");
 
   return objname[0] != '/' ? "/" + objname : objname;
 
@@ -118,12 +121,13 @@ void HDF5OutputAdaptor::open(const std::string & name) {
   _hdf5_file_id = H5Fcreate(name.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
 
   if (_hdf5_file_id <= 0) {
-    PSK::OutputException e("H5FCreate fails", "HDF5OutputAdaptor::open2()");
+    eprintf("H5FCreate fails");
+    //PSK::OutputException e("H5FCreate fails", "HDF5OutputAdaptor::open2()");
 
     // if using H5F_ACC_EXCL
     // e.push( "Using H5F_ACC_EXCL: Check if file " +name + " already exists" );
 
-    throw e;
+    //throw e;
   }
 
   _hdf5_file_name = name;
@@ -155,9 +159,10 @@ void HDF5OutputAdaptor::open_append(const std::string & name) {
   }
 
   if (_hdf5_file_id <= 0) {
-    PSK::OutputException e("H5Fopen fails", "HDF5OutputAdaptor::open_append()");
+    eprintf("H5Fopen fails");
+    //PSK::OutputException e("H5Fopen fails", "HDF5OutputAdaptor::open_append()");
 
-    throw e;
+    //throw e;
   }
   _hdf5_file_name = name;
 
@@ -172,8 +177,9 @@ void HDF5OutputAdaptor::close(void) {
   herr_t hdf5err = H5Fclose(_hdf5_file_id);
 
   if (hdf5err < 0) {
-    PSK::OutputException e("HDF5OutputAdaptor::close()>> H5FClose fails");
-    throw e;
+    eprintf("H5FClose fails");
+    //PSK::OutputException e("HDF5OutputAdaptor::close()>> H5FClose fails");
+    //throw e;
   }
 
   _hdf5_file_name.clear();
@@ -188,7 +194,7 @@ void HDF5OutputAdaptor::close(void) {
  * 
  */
 void HDF5OutputAdaptor::write(const std::string & tag, int i_value) {
-  try {
+  //try {
     std::string ptag = purify_object_name(tag);
 
     std::vector < hid_t > hid_array;
@@ -213,8 +219,9 @@ void HDF5OutputAdaptor::write(const std::string & tag, int i_value) {
     }
 
     if (hdf5err < 0) {
-      PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(int)");
-      throw e;
+      eprintf("make_dataset fails for %s", tag.c_str());
+      //PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(int)");
+      //throw e;
     }
 
     // close groups, if any, but don't try to close the file id at [0]
@@ -223,14 +230,14 @@ void HDF5OutputAdaptor::write(const std::string & tag, int i_value) {
 
     delete[]hdf5dims;
 
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(int)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(int)");
+  //  throw e;
+  //}
 }
 // new long writing
 void HDF5OutputAdaptor::write(const std::string & tag, long i_value) {
-  try {
+  //try {
     std::string ptag = purify_object_name(tag);
 
     std::vector < hid_t > hid_array;
@@ -255,8 +262,9 @@ void HDF5OutputAdaptor::write(const std::string & tag, long i_value) {
     }
 
     if (hdf5err < 0) {
-      PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(long)");
-      throw e;
+      eprintf("make_dataset fails for %s", tag.c_str());
+      //PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(long)");
+      //throw e;
     }
 
     // close groups, if any, but don't try to close the file id at [0]
@@ -265,18 +273,19 @@ void HDF5OutputAdaptor::write(const std::string & tag, long i_value) {
 
     delete[]hdf5dims;
 
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(long)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(long)");
+  //  throw e;
+  //}
 }
 
 // 
 void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, const int *i_array) {
-  try {
+  //try {
     if (dimens.size() == 0) {
-      PSK::OutputException e("Zero Dimens size", "HDF5OutputAdaptor::write(int* array)");
-      throw e;
+      eprintf("Zero Dimens size");
+      //PSK::OutputException e("Zero Dimens size", "HDF5OutputAdaptor::write(int* array)");
+      //throw e;
     }
 
     std::string ptag = purify_object_name(tag);
@@ -304,25 +313,27 @@ void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, cons
     }
 
     if (hdf5err < 0) {
-      PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(int* array)");
-      throw e;
+      eprintf("make_dataset fails for %s", tag.c_str());
+      //PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(int* array)");
+      //throw e;
     }
 
     // close groups, if any, but don't try to close the file id at [0]
     for (int i = hid_array.size() - 1; i > 0; --i)
       hdf5err = H5Gclose(hid_array[i]);
 
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(int* array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(int* array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, const long *i_array) {
-  try {
+  //try {
     if (dimens.size() == 0) {
-      PSK::OutputException e("Zero Dimens size", "HDF5OutputAdaptor::write(long* array)");
-      throw e;
+      eprintf("Zero Dimens size");
+      //PSK::OutputException e("Zero Dimens size", "HDF5OutputAdaptor::write(long* array)");
+      //throw e;
     }
 
     std::string ptag = purify_object_name(tag);
@@ -341,55 +352,57 @@ void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, cons
                                            dimens.size(), hdf5dims, i_array);
 
     if (hdf5err < 0) {
-      PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(long* array)");
-      throw e;
+      eprintf("make_dataset fails for %s", tag.c_str());
+      //PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(long* array)");
+      //throw e;
     }
 
     // close groups, if any, but don't try to close the file id at [0]
     for (int i = hid_array.size() - 1; i > 0; --i)
       hdf5err = H5Gclose(hid_array[i]);
 
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(long* array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(long* array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, const std::vector < int >&i_array) {
-  try {
+  //try {
     int n = dimens.nels();
     int *i_array_p = new int[n];
     for (int i = 0; i < n; ++i)
       i_array_p[i] = i_array[i];
     write(tag, dimens, i_array_p);
     delete[]i_array_p;
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(vector<int> array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(vector<int> array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, const std::vector < long >&i_array) {
-  try {
+  //try {
     int n = dimens.nels();
     long *i_array_p = new long[n];
     for (int i = 0; i < n; ++i)
       i_array_p[i] = i_array[i];
     write(tag, dimens, i_array_p);
     delete[]i_array_p;
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(vector<long> array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(vector<long> array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, const int ***i_array) {
   if (dimens.size() != 3) {
-    PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(int*** array)");
-    throw e;
+    eprintf("Dimens size not 3 for object %s", objname.c_str());
+    //PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(int*** array)");
+    //throw e;
   }
 
-  try {
+  //try {
     int nels = dimens.nels();
     int *i_array_p = new int[nels];
     const int di = dimens[0];
@@ -402,10 +415,10 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
           i_array_p[i * djk + j * dk + k] = i_array[i][j][k];
     write(objname, dimens, i_array_p);
     delete[]i_array_p;
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(int*** array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(int*** array)");
+  //  throw e;
+  //}
 }
 
 
@@ -418,7 +431,7 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
  * 
  */
 void HDF5OutputAdaptor::write(const std::string & tag, float f_value) {
-  try {
+  //try {
     std::string ptag = purify_object_name(tag);
 
     std::vector < hid_t > hid_array;
@@ -434,8 +447,9 @@ void HDF5OutputAdaptor::write(const std::string & tag, float f_value) {
                                             1, hdf5dims, &f_value);
 
     if (hdf5err < 0) {
-      PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(float)");
-      throw e;
+      eprintf("make_dataset fails for %s", tag.c_str());
+      //PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(float)");
+      //throw e;
     }
 
     // close groups, if any, but don't try to close the file id at [0]
@@ -444,17 +458,18 @@ void HDF5OutputAdaptor::write(const std::string & tag, float f_value) {
 
     delete[]hdf5dims;
 
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(float)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(float)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, const float *f_array) {
-  try {
+  //try {
     if (dimens.size() == 0) {
-      PSK::OutputException e("Zero Dimens size", "HDF5OutputAdaptor::write(float* array)");
-      throw e;
+      eprintf("Zero Dimens size");
+      //PSK::OutputException e("Zero Dimens size", "HDF5OutputAdaptor::write(float* array)");
+      //throw e;
     }
 
     std::string ptag = purify_object_name(tag);
@@ -473,41 +488,43 @@ void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, cons
                                             dimens.size(), hdf5dims, f_array);
 
     if (hdf5err < 0) {
-      PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(float* array)");
-      throw e;
+      eprintf("make_dataset fails for %s", tag.c_str());
+      //PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(float* array)");
+      //throw e;
     }
 
     // close groups, if any, but don't try to close the file id at [0]
     for (int i = hid_array.size() - 1; i > 0; --i)
       hdf5err = H5Gclose(hid_array[i]);
 
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(float* array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(float* array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, const std::vector < float >&f_array) {
-  try {
+  //try {
     int n = dimens.nels();
     float *f_array_p = new float[n];
     for (int i = 0; i < n; ++i)
       f_array_p[i] = f_array[i];
     write(tag, dimens, f_array_p);
     delete[]f_array_p;
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(vector<float> array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(vector<float> array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, const float ***f_array) {
   if (dimens.size() != 3) {
-    PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(float*** array)");
-    throw e;
+    eprintf("Dimens size not 3 for object %s", objname.c_str());
+    //PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(float*** array)");
+    //throw e;
   }
 
-  try {
+  //try {
     int nels = dimens.nels();
     float *f_array_p = new float[nels];
     const int di = dimens[0];
@@ -520,10 +537,10 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
           f_array_p[i * djk + j * dk + k] = f_array[i][j][k];
     write(objname, dimens, f_array_p);
     delete[]f_array_p;
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(float*** array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(float*** array)");
+  //  throw e;
+  //}
 }
 
 
@@ -536,7 +553,7 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
  * 
  */
 void HDF5OutputAdaptor::write(const std::string & tag, double d_value) {
-  try {
+  //try {
     std::string ptag = purify_object_name(tag);
 
     std::vector < hid_t > hid_array;
@@ -561,8 +578,9 @@ void HDF5OutputAdaptor::write(const std::string & tag, double d_value) {
     }
 
     if (hdf5err < 0) {
-      PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(double)");
-      throw e;
+      eprintf("make_dataset fails for %s", tag.c_str());
+      //PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(double)");
+      //throw e;
     }
 
     // close groups, if any, but don't try to close the file id at [0]
@@ -571,17 +589,18 @@ void HDF5OutputAdaptor::write(const std::string & tag, double d_value) {
 
     delete[]hdf5dims;
 
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(double)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(double)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, const double *d_array) {
-  try {
+  //try {
     if (dimens.size() == 0) {
-      PSK::OutputException e("Zero Dimens size", "HDF5OutputAdaptor::write(double* array)");
-      throw e;
+      eprintf("Zero Dimens size");
+      //PSK::OutputException e("Zero Dimens size", "HDF5OutputAdaptor::write(double* array)");
+      //throw e;
     }
 
     std::string ptag = purify_object_name(tag);
@@ -610,41 +629,43 @@ void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, cons
 
 
     if (hdf5err < 0) {
-      PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(double* array)");
-      throw e;
+      eprintf("make_dataset fails for %s", tag.c_str());
+      //PSK::OutputException e("make_dataset fails for " + tag, "HDF5OutputAdaptor::write(double* array)");
+      //throw e;
     }
 
     // close groups, if any, but don't try to close the file id at [0]
     for (int i = hid_array.size() - 1; i > 0; --i)
       hdf5err = H5Gclose(hid_array[i]);
 
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(double* array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(double* array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, const std::vector < double >&d_array) {
-  try {
+  //try {
     int n = dimens.nels();
     double *d_array_p = new double[n];
     for (int i = 0; i < n; ++i)
       d_array_p[i] = d_array[i];
     write(tag, dimens, d_array_p);
     delete[]d_array_p;
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(vector<double> array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(vector<double> array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, double ***d_array) {
   if (dimens.size() != 3) {
-    PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(double*** array)");
-    throw e;
+    eprintf("Dimens size not 3 for object %s", objname.c_str());
+    //PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(double*** array)");
+    //throw e;
   }
 
-  try {
+  //try {
     int nels = dimens.nels();
     double *d_array_p = new double[nels];
     const int di = dimens[0];
@@ -666,20 +687,21 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
         }
     write(objname, dimens, d_array_p);
     delete[]d_array_p;
-  }
-  catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(double*** array)");
-    throw e;
-  }
+  //}
+  //catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(double*** array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, const int ns, double ****d_array) {
   if (dimens.size() != 3) {
-    PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(double**** array)");
-    throw e;
+    eprintf("Dimens size not 3 for object %s", objname.c_str());
+    //PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(double**** array)");
+    //throw e;
   }
 
-  try {
+  //try {
     int nels = dimens.nels();
     double *d_array_p = new double[nels];
     const int di = dimens[0];
@@ -700,20 +722,21 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
         }
     write(objname, dimens, d_array_p);
     delete[]d_array_p;
-  }
-  catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(double**** array)");
-    throw e;
-  }
+  //}
+  //catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(double**** array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, double **d_array) {
   if (dimens.size() != 2) {
-    PSK::OutputException e("Dimens size not 2 for object " + objname, "HDF5OutputAdaptor::write(double** array)");
-    throw e;
+    eprintf("Dimens size not 2 for object %s", objname.c_str());
+    //PSK::OutputException e("Dimens size not 2 for object " + objname, "HDF5OutputAdaptor::write(double** array)");
+    //throw e;
   }
 
-  try {
+  //try {
     int nels = dimens.nels();
     double *d_array_p = new double[nels];
     const int di = dimens[0];
@@ -723,19 +746,20 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
         d_array_p[i * dj + j] = d_array[i + 1][j + 1];  // I am not writing ghost cells
     write(objname, dimens, d_array_p);
     delete[]d_array_p;
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(double** array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(double** array)");
+  //  throw e;
+  //}
 }
 
 void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, const int ns, double ***d_array) {
   if (dimens.size() != 2) {
-    PSK::OutputException e("Dimens size not 2 for object " + objname, "HDF5OutputAdaptor::write(double*** array)");
-    throw e;
+    eprintf("Dimens size not 2 for object %s", objname.c_str());
+    //PSK::OutputException e("Dimens size not 2 for object " + objname, "HDF5OutputAdaptor::write(double*** array)");
+    //throw e;
   }
 
-  try {
+  //try {
     int nels = dimens.nels();
     double *d_array_p = new double[nels];
     const int di = dimens[0];
@@ -745,8 +769,8 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
         d_array_p[i * dj + j] = d_array[i + 1][j + 1][ns];  // I am not writing ghost cells
     write(objname, dimens, d_array_p);
     delete[]d_array_p;
-  } catch(PSK::Exception & e) {
-    e.push("In HDF5OutputAdaptor::write(double*** array)");
-    throw e;
-  }
+  //} catch(PSK::Exception & e) {
+  //  e.push("In HDF5OutputAdaptor::write(double*** array)");
+  //  throw e;
+  //}
 }
diff --git a/include/ConfigFile.h b/include/ConfigFile.h
index 4de95342..79008be2 100644
--- a/include/ConfigFile.h
+++ b/include/ConfigFile.h
@@ -48,6 +48,7 @@
 #include <iostream>
 #include <fstream>
 #include <sstream>
+#include "errors.h"
 
 using std::string;
 
@@ -175,7 +176,8 @@ template < class T > T ConfigFile::read(const string & key) const {
   // Read the value corresponding to key
   mapci p = myContents.find(key);
   if (p == myContents.end())
-    throw key_not_found(key);
+    eprintf("key not found: %s", key.c_str());
+    //throw key_not_found(key);
   return string_as_T < T > (p->second);
 }
 
diff --git a/include/PSKOutput.h b/include/PSKOutput.h
index 604387cf..624fad4c 100644
--- a/include/PSKOutput.h
+++ b/include/PSKOutput.h
@@ -13,6 +13,7 @@ developers: D. Burgess, June/July 2006
 #include <vector>
 #include <list>
 
+#include "errors.h"
 #include "PSKException.h"
 #include "Particles.h"
 #include "Field.h"
@@ -103,49 +104,63 @@ namespace PSK {
   public:
     OutputAdaptor(void) {;
     } virtual void open(const std::string & outf) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::open");
+      eprintf("Function not implemented");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::open");
     }
     virtual void close(void) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::close");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::close");
     }
 
     // write int functions
     virtual void write(const std::string & objname, int i) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(int)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(int)");
     }
     virtual void write(const std::string & objname, const Dimens dimens, const int *i_array) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(int* array)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(int* array)");
     }
     virtual void write(const std::string & objname, const Dimens dimens, const long *i_array) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(long* array)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(long* array)");
     }
     virtual void write(const std::string & objname, const Dimens dimens, const std::vector < int >&i_array) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(vector<int> array)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(vector<int> array)");
     }
 
     virtual void write(const std::string & objname, const Dimens dimens, const std::vector < long >&i_array) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(vector<long> array)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(vector<long> array)");
     }
     // write float functions
     virtual void write(const std::string & objname, float f) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(float)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(float)");
     }
     virtual void write(const std::string & objname, const Dimens dimens, const float *f_array) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(float* array)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(float* array)");
     }
     virtual void write(const std::string & objname, const Dimens dimens, const std::vector < float >&f_array) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(vector<float> array)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(vector<float> array)");
     }
 
     // write double functions
     virtual void write(const std::string & objname, double d) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(double)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(double)");
     }
     virtual void write(const std::string & objname, const Dimens dimens, const double *d_array) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(double* array)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(double* array)");
     }
     virtual void write(const std::string & objname, const Dimens dimens, const std::vector < double >&d_array) {
-      throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(vector<double> array)");
+      eprintf("Function not implemented");
+      //throw OutputException("Function not implemented", "PSK::OutputAdaptor::write(vector<double> array)");
     }
 
   };

From 31ebd95fd2c28d938acc22d7e50ac1d80d472f65 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 25 Jul 2013 17:11:39 +0200
Subject: [PATCH 024/118] issue #43: implemented new array classes

---
 include/Alloc.h       | 468 ++++++++++++++++++++----
 include/arrays.h      | 188 ++++++++++
 include/asserts.h     |   7 +-
 tests/Makefile        |  38 ++
 tests/stopwatch.h     |  97 +++++
 tests/test_arrays.cpp | 827 ++++++++++++++++++++++++++++++++++++++++++
 utility/asserts.cpp   |   3 +
 utility/debug.cpp     |  11 +-
 8 files changed, 1557 insertions(+), 82 deletions(-)
 create mode 100644 include/arrays.h
 create mode 100644 tests/Makefile
 create mode 100644 tests/stopwatch.h
 create mode 100644 tests/test_arrays.cpp

diff --git a/include/Alloc.h b/include/Alloc.h
index a57b9b70..a0837911 100644
--- a/include/Alloc.h
+++ b/include/Alloc.h
@@ -1,103 +1,415 @@
+#ifndef IPIC_ALLOC_H
+#define IPIC_ALLOC_H
+#include <cstddef> // for alignment stuff
+#include "ipicdefs.h" // for CHECK_BOUNDS
+#include "asserts.h" // for assert_le, assert_lt
+//#include "arrays.h" // fixed-dimension arrays
 
-#ifndef ALLOC_H
-#define ALLOC_H
+/*
+    Array classes developed by
+      Alec Johnson,
+    consolidating arrays developed by 
+      Reger Ferrer, Vicenç Beltran, and Florentino Sainz
+    and earlier arrays defined by
+      Jorge Amaya and Stefano Markidis.
 
-#include <stdlib.h>
-
-/*! The allocator for 4D array */
-template < class type > type **** _new_4_array(int sz1, int sz2, int sz3, int sz4) {
-
-  type ****all_x;
-  type ***all_y;
-  type **all_z;
-  type *all_r;
+    For examples of use of this class,
+    see test_arrays.cpp
+*/
+#define ALIGNMENT (64) // alignment passed to _mm_malloc and __assume_aligned
+#ifdef __INTEL_COMPILER
+    #define ALIGNED(X) __assume_aligned(X, ALIGNMENT)
+    #define AlignedAlloc(T, NUM) /* NUM is parenthesized: it may be an expression */ \
+        (T *const __restrict__)(_mm_malloc(sizeof(T)*(NUM), ALIGNMENT))
+    #define AlignedFree(S) (_mm_free(S))
+#else
+    #define ALIGNED(X) // expands to nothing when not using the Intel compiler
+    #define AlignedFree(S) (delete[] (S))
+    #define AlignedAlloc(T, NUM) (new T[NUM])
+#endif
 
-  all_x = new type ***[sz1];
-  all_y = new type **[sz1 * sz2];
-  all_z = new type *[sz1 * sz2 * sz3];
-  all_r = new type[sz1 * sz2 * sz3 * sz4];
+// Compile with -DCHECK_BOUNDS to turn on bounds checking.
+//#define CHECK_BOUNDS
+#ifdef CHECK_BOUNDS
+  #define check_bounds(n,S) {assert_le(0, n); assert_lt(n, S);}
+#else
+  #define check_bounds(n,S)
+#endif
 
-  type ****result = all_x;
+/*** begin Array classes with flexible dimensions ***/
 
-  for (int i = 0; i < sz1; i++, all_y += sz2) {
-    result[i] = all_y;
-    for (int j = 0; j < sz2; j++, all_z += sz3) {
-      result[i][j] = all_z;
-      for (int k = 0; k < sz3; k++, all_r += sz4) {
-        result[i][j][k] = all_r;
-      }
-    }
+// methods to allocate arrays.
+// These are a succinct equivalent of Jorge's earlier methods,
+// except for the use of AlignedAlloc in place of new.
+//
+template < class type >
+inline type * newArray1(size_t sz1)
+{
+  type *arr = AlignedAlloc(type, sz1); // new type [sz1];
+  return arr;
+}
+template < class type >
+inline type ** newArray2(size_t sz1, size_t sz2)
+{
+  type **arr = AlignedAlloc(type*, sz1); // new type *[sz1];
+  type *ptr = newArray1<type>(sz1*sz2);
+  for (size_t i = 0; i < sz1; i++)
+  {
+    arr[i] = ptr;
+    ptr += sz2;
   }
-
-  return result;
+  return arr;
 }
-
-/*! Deallocator for 4D arrays */
-template < class type > void delArr4(type **** arr, int dummyx, int dummyy, int dummyz) {
-  delete[]arr[0][0][0];
-  delete[]arr[0][0];
-  delete[]arr[0];
-  delete[]arr;
+template < class type >
+inline type *** newArray3(size_t sz1, size_t sz2, size_t sz3)
+{
+  type ***arr = AlignedAlloc(type**, sz1); // new type **[sz1];
+  type **ptr = newArray2<type>(sz1*sz2, sz3);
+  for (size_t i = 0; i < sz1; i++)
+  {
+    arr[i] = ptr;
+    ptr += sz2;
+  }
+  return arr;
+}
+template <class type>
+inline type **** newArray4(size_t sz1, size_t sz2, size_t sz3, size_t sz4)
+{
+  type ****arr = AlignedAlloc(type***, sz1); //(new type ***[sz1]);
+  type ***ptr = newArray3<type>(sz1*sz2, sz3, sz4);
+  for (size_t i = 0; i < sz1; i++) {
+    arr[i] = ptr;
+    ptr += sz2;
+  }
+  return arr;
 }
 
-/*! The allocator for 3D array */
-template < class type > type *** _new_3_array(int sz1, int sz2, int sz3) {
-
-  type ***all_x;
-  type **all_y;
-  type *all_z;
-
-  all_x = new type **[sz1];
-  all_y = new type *[sz1 * sz2];
-  all_z = new type[sz1 * sz2 * sz3];
-
-  type ***result = all_x;
+// methods to deallocate arrays
+//
+template < class type > inline void delArray1(type * arr)
+{ AlignedFree(arr); }
+template < class type > inline void delArray2(type ** arr)
+{ delArray1(arr[0]); AlignedFree(arr); }
+template < class type > inline void delArray3(type *** arr)
+{ delArray2(arr[0]); AlignedFree(arr); }
+template < class type > inline void delArray4(type **** arr)
+{ delArray3(arr[0]); AlignedFree(arr); }
+//
+// versions with dummy dimensions (for backwards compatibility)
+//
+template <class type> inline void delArr1(type * arr)
+{ delArray1(arr); }
+template <class type> inline void delArr2(type ** arr, size_t sz1)
+{ delArray2(arr); }
+template <class type> inline void delArr3(type *** arr, size_t sz1, size_t sz2)
+{ delArray3(arr); }
+template <class type> inline void delArr4(type **** arr,
+  size_t sz1, size_t sz2, size_t sz3) // sizes are ignored (backwards compatibility)
+{ delArray4(arr); } // releases all four levels, including the element storage
 
-  for (int i = 0; i < sz1; i++, all_y += sz2) {
-    result[i] = all_y;
-    for (int j = 0; j < sz2; j++, all_z += sz3) {
-      result[i][j] = all_z;
-    }
+// classes to dereference arrays.
+//
+// ArrayRefN is essentially a dumbed-down version of ArrN with
+// an index shift applied to the underlying array.  The purpose
+// of ArrayRefN is to allow elements of multidimensional arrays
+// to be accessed with a calculated one-dimensional index while
+// using chained operator[] syntax (e.g. myarr[i][j]), i.e. the
+// same syntax as is used for native or nested arrays.  This
+// implementation is likely to be slow unless optimization is
+// turned on, allowing the compiler to figure out that the whole
+// chain of calls to the operator[] methods and to the ArrayRefN
+// constructors reduces to computing a one-dimensional subscript
+// used to access a one-dimensional array.
+//
+template <class type>
+class ArrayRef1
+{
+  type* const __restrict__ arr;
+  const size_t S1;
+  const size_t shift;
+ public:
+  inline ArrayRef1(type*const arr_, size_t k, size_t s1) :
+    arr(arr_), shift(k), S1(s1)
+  {}
+  inline type& operator[](size_t n1){
+    check_bounds(n1, S1);
+    ALIGNED(arr);
+    return arr[shift+n1];
   }
+};
 
-  return result;
+template <class type>
+class ArrayRef2
+{
+  type* const __restrict__ arr;
+  const size_t shift;
+  const size_t S2, S1;
+ public:
+  inline ArrayRef2(type*const arr_, size_t k, size_t s2, size_t s1) :
+    arr(arr_), shift(k), S2(s2), S1(s1)
+  {}
+  inline ArrayRef1<type> operator[](size_t n2){
+    check_bounds(n2,S2);
+    return ArrayRef1<type>(arr, (shift+n2)*S1, S1);
+  }
+};
 
-}
+template <class type>
+class ArrayRef3
+{
+  type* const __restrict__ arr;
+  const size_t shift;
+  const size_t S3, S2, S1;
+ public:
+  inline ArrayRef3(type*const arr_, size_t k, size_t s3, size_t s2, size_t s1) :
+    arr(arr_), shift(k), S3(s3), S2(s2), S1(s1)
+  {}
+  inline ArrayRef2<type> operator[](size_t n3){
+    check_bounds(n3, S3);
+    return ArrayRef2<type>(arr, (shift+n3)*S2, S2, S1);
+  }
+};
 
-/*! Deallocator for 3D arrays */
-template < class type > void delArr3(type *** arr, int dummyx, int dummyy) {
-  delete[]arr[0][0];
-  delete[]arr[0];
-  delete[]arr;
-}
+// ArrN can adopt an array allocated by newArrN
+//
+// The purpose of these classes is to provide more efficient
+// and more regulated access to array elements.  The idea is to
+// maintain backward compatibility while allowing us to move
+// toward a proper array abstraction.
+//
+// The user of ArrN is responsible for memory management.
+// The ArrayN classes are the version of this class
+// with automatic deallocation.
+//
+// Examples:
+//
+// Using constructor to create array:
+// {
+//   Arr2<int> arr(16, 16);
+//   arr[1][2] = 5;
+//   arr.free();
+// }
+// Using ArrN to adopt an array allocated by newArrN
+// {
+//   int** array = newArray2<int>(16,16);
+//   Arr2<int> arr(array,16,16); // adopt array
+//   arr[1][2] = 5;
+//   assert_eq(arr[1][2],array[1][2]);
+//   // arr.free(); // should not do both this and next line.
+//   delArray2<int>(array);
+// }
+//
+// proposed improvements:
+// - methods that use parallel arithmetic for omp and vectorized code
 
-/*! The allocator for 2D array */
-template < class type > type ** _new_2_array(int sz1, int sz2) {
+template <class type>
+class Arr1 // 1D array accessor; the destructor does not release the storage
+{
+  private: // data
+    type* const __restrict__ arr; // element storage
+    const size_t S1;              // number of elements
+  public:
+    ~Arr1() { } // empty: the user calls free() explicitly
+    void free() { AlignedFree(arr); }
+    Arr1(size_t s1) : // allocate storage for s1 elements
+      arr(AlignedAlloc(type, s1)),
+      S1(s1)
+    { }
+    Arr1(type* in, // adopt existing storage holding s1 elements
+      size_t s1) :
+      arr(in),
+      S1(s1)
+    { }
+    inline type& operator[](size_t n1){
+      check_bounds(n1, S1);
+      ALIGNED(arr);
+      return arr[n1];
+    }
+    inline size_t getidx(size_t n1) const // index, checked under CHECK_BOUNDS
+    {
+      check_bounds(n1, S1);
+      return n1;
+    }
+    const type& get(size_t n1) const
+      { ALIGNED(arr); return arr[getidx(n1)]; }
+    type& fetch(size_t n1) const { ALIGNED(arr); return arr[getidx(n1)]; }
+    type& fetch(size_t /*n2*/,size_t n1) const { return fetch(n1); } // legacy form: first index ignored
+    void set(size_t n1, type value)
+      { ALIGNED(arr); arr[getidx(n1)] = value; }
+};
 
-  type **all_x;
-  type *all_y;
+template <class type>
+class Arr2
+{
+  private: // data
+    const size_t S2,S1;
+    type* const __restrict__ arr;
+  public:
+    ~Arr2(){}
+    void free() { AlignedFree(arr); }
+    Arr2(size_t s2, size_t s1) :
+      S2(s2), S1(s1),
+      arr(AlignedAlloc(type, s2*s1))
+    {
+    }
+    Arr2(type*const* in,
+      size_t s2, size_t s1) :
+      S2(s2), S1(s1),
+      arr(*in)
+    { }
+    // for backwards compatibility support bracket notation
+    inline ArrayRef1<type> operator[](size_t n2){
+      check_bounds(n2, S2);
+      return ArrayRef1<type>(arr, n2*S1, S1);
+    }
+    inline size_t getidx(size_t n2, size_t n1) const
+    {
+      check_bounds(n2, S2);
+      check_bounds(n1, S1);
+      return n2*S1+n1;
+    }
+    // I prefer "fetch" over operator() to hilight read/write access
+    //type& operator()(size_t n2, size_t n1) const
+    //  { ALIGNED(arr); return arr[n1+S1*n2]; }
+    type& fetch(size_t n2,size_t n1) const
+      { ALIGNED(arr); return arr[getidx(n2,n1)]; }
+    // better to use accessors that distinguish read from write:
+    const type& get(size_t n2,size_t n1) const
+      { ALIGNED(arr); return arr[getidx(n2,n1)]; }
+    void set(size_t n2,size_t n1, type value)
+      { ALIGNED(arr); arr[getidx(n2,n1)] = value; }
+};
 
-  all_x = new type *[sz1];
-  all_y = new type[sz1 * sz2];
+template <class type>
+class Arr3
+{
+  private: // data
+    type* const __restrict__ arr;
+    const size_t S3,S2,S1;
+  public:
+    ~Arr3(){}
+    void free() { AlignedFree(arr); }
+    Arr3(size_t s3, size_t s2, size_t s1) :
+      S3(s3), S2(s2), S1(s1),
+      arr(AlignedAlloc(type, s3*s2*s1))
+    { }
+    Arr3(type*const*const* in,
+      size_t s3, size_t s2, size_t s1) :
+      S3(s3), S2(s2), S1(s1),
+      arr(**in)
+    { }
+    inline ArrayRef2<type> operator[](size_t n3){
+      check_bounds(n3, S3);
+      return ArrayRef2<type>(arr, n3*S2, S2, S1);
+    }
+    type* get_arr(){return arr;}
+    inline size_t getidx(size_t n3, size_t n2, size_t n1) const
+    {
+      check_bounds(n3, S3);
+      check_bounds(n2, S2);
+      check_bounds(n1, S1);
+      return (n3*S2+n2)*S1+n1;
+    }
+    //type& operator()(size_t n3, size_t n2, size_t n1) const
+    //{ ALIGNED(arr); return arr[getidx(n3,n2,n1)]; }
+    type& fetch(size_t n3,size_t n2,size_t n1) const
+      { ALIGNED(arr); return arr[getidx(n3,n2,n1)]; }
+    const type& get(size_t n3,size_t n2,size_t n1) const
+      { ALIGNED(arr); return arr[getidx(n3,n2,n1)]; }
+    void set(size_t n3,size_t n2,size_t n1, type value)
+      { ALIGNED(arr); arr[getidx(n3,n2,n1)] = value; }
+};
 
-  type **result = all_x;
+template <class type>
+class Arr4
+{
+  private: // data
+    const size_t S4,S3,S2,S1;
+    type* const __restrict__ arr;
+  public:
+    ~Arr4(){} // nonempty destructor would kill performance
+    void free() { AlignedFree(arr); }
+    Arr4(size_t s4, size_t s3, size_t s2, size_t s1) :
+      arr(AlignedAlloc(type, s4*s3*s2*s1)),
+      S4(s4), S3(s3), S2(s2), S1(s1)
+    { }
+    Arr4(type*const*const*const* in,
+      size_t s4, size_t s3, size_t s2, size_t s1) :
+      S4(s4), S3(s3), S2(s2), S1(s1),
+      arr(***in)
+    { }
+    inline ArrayRef3<type> operator[](size_t n4){
+      check_bounds(n4, S4);
+      return ArrayRef3<type>(arr, n4*S3, S3, S2, S1);
+    }
+    inline size_t getidx(size_t n4, size_t n3, size_t n2, size_t n1) const
+    {
+      check_bounds(n4, S4);
+      check_bounds(n3, S3);
+      check_bounds(n2, S2);
+      check_bounds(n1, S1);
+      return ((n4*S3+n3)*S2+n2)*S1+n1;
+    }
+    const type& get(size_t n4,size_t n3,size_t n2,size_t n1) const
+      { ALIGNED(arr); return arr[getidx(n4,n3,n2,n1)]; }
+    type& fetch(size_t n4,size_t n3,size_t n2,size_t n1) const
+      { ALIGNED(arr); return arr[getidx(n4,n3,n2,n1)]; }
+    void set(size_t n4,size_t n3,size_t n2,size_t n1, type value)
+      { ALIGNED(arr); arr[getidx(n4,n3,n2,n1)] = value; }
+};
 
-  for (int i = 0; i < sz1; i++, all_y += sz2) {
-    result[i] = all_y;
-  }
+// Versions of array classes which automatically free memory.
+//
+// Note that the nonempty destructor kills performance
+// unless compiling with -fno-exceptions
 
-  return result;
+template <class type>
+struct Array1 : public Arr1<type>
+{
+    ~Array1(){Arr1<type>::free();}
+    Array1(size_t s1) : Arr1<type>(s1) { }
+};
 
-}
+template <class type>
+struct Array2 : public Arr2<type>
+{
+    ~Array2(){Arr2<type>::free();}
+    Array2(size_t s2, size_t s1) : Arr2<type>(s2,s1) { }
+};
 
-/*! Deallocator for 2D arrays */
-template < class type > void delArr2(type ** arr, int dummyx) {
-  delete[]arr[0];
-  delete[]arr;
-}
+template <class type>
+struct Array3 : public Arr3<type>
+{
+    ~Array3(){Arr3<type>::free();}
+    Arr3<type>& fast_accessor() { return *(Arr3<type>*)this; }
+    Array3(size_t s3, size_t s2, size_t s1) : Arr3<type>(s3,s2,s1) { }
+};
 
-#define newArr4(type,sz1,sz2,sz3,sz4) _new_4_array<type>((sz1),(sz2),(sz3),(sz4))
-#define newArr3(type,sz1,sz2,sz3) _new_3_array<type>((sz1),(sz2),(sz3))
-#define newArr2(type,sz1,sz2) _new_2_array<type>((sz1),(sz2))
+template <class type>
+struct Array4 : public Arr4<type>
+{
+    ~Array4(){Arr4<type>::free();}
+    Array4(size_t s4, size_t s3, size_t s2, size_t s1)
+      : Arr4<type>(s4,s3,s2,s1) { }
+};
 
+// These aliases are defined for the following flexibilization purposes:
+// - to avoid filling the code with template brackets
+//   (i.e., to minimize explicitly template-dependent code).
+// - so that they can be redefined according to the user's
+//   preferred array implementation.
+//
+typedef Arr1<int> intArr1;
+typedef Arr2<int> intArr2;
+typedef Arr3<int> intArr3;
+typedef Arr4<int> intArr4;
+typedef Arr1<double> doubleArr1;
+typedef Arr2<double> doubleArr2;
+typedef Arr3<double> doubleArr3;
+typedef Arr4<double> doubleArr4;
+//
+#define newArr4(type,sz1,sz2,sz3,sz4) newArray4<type>((sz1),(sz2),(sz3),(sz4))
+#define newArr3(type,sz1,sz2,sz3) newArray3<type>((sz1),(sz2),(sz3))
+#define newArr2(type,sz1,sz2) newArray2<type>((sz1),(sz2))
+/*** end Array classes with flexible dimensions ***/
 #endif
diff --git a/include/arrays.h b/include/arrays.h
new file mode 100644
index 00000000..1a99b39f
--- /dev/null
+++ b/include/arrays.h
@@ -0,0 +1,188 @@
+#ifndef IPIC_ARRAYS_H
+#define IPIC_ARRAYS_H
+#include "Alloc.h" // variable-dimension arrays
+/*
+    Fixed array class developed by
+      Alec Johnson
+
+    For examples of use of this class,
+    see test_arrays.cpp
+*/
+
+/*** begin FixedArray classes for use when dimensions are known at compile time. ***/
+//
+// These classes improve upon native fixed arrays as follows:
+// - bounds-checking is performed if CHECK_BOUNDS is defined,
+// - myarray.fetch(i,j) and myarray[i][j] access are supported, and
+// - functions can return fixed-dimension arrays,
+//   whereas the C standard does not allow
+//   one to return a native array with fixed dimensions.
+//
+// The purpose of implementing these extensions is so that
+// fixed-dimension arrays can be used in iPic3D if doing so
+// yields a significant performance benefit for the choice of
+// architecture and compiler.
+
+template <class type, size_t s1>
+class FixedArray1D
+{
+ public:
+  type arr[s1];
+ public:
+  type& fetch(size_t n1)
+  {
+    check_bounds(n1,s1);
+    return arr[n1];
+  }
+  type& operator[](size_t n1)
+  {
+    check_bounds(n1,s1);
+    return arr[n1];
+  }
+};
+
+// auxiliary class for chained operator[] dereferencing of FixedArray2D
+//
+template <class type, size_t s1, size_t s2>
+class FixedArray2D1
+{
+  type (&arr)[s1][s2];
+  size_t n1;
+ public:
+  FixedArray2D1(type (&_arr)[s1][s2], size_t _n1) :
+    arr(_arr), n1(_n1) {};
+  type& operator[](size_t n2)
+  {
+    check_bounds(n1,s1);
+    check_bounds(n2,s2);
+    return arr[n1][n2];
+  }
+};
+
+template <class type, size_t s1, size_t s2>
+class FixedArray2D
+{
+ public:
+  type arr [s1][s2];
+ public:
+  type& fetch(size_t n1, size_t n2)
+  {
+    check_bounds(n1,s1);
+    check_bounds(n2,s2);
+    return arr[n1][n2];
+  }
+  // Chaining operator[] this way
+  // does not allow bounds checking
+  // and does not work beyond 2D.
+  //type* operator[](size_t n1) { return arr[n1]; }
+  FixedArray2D1<type,s1,s2> operator[](size_t n1)
+    { return FixedArray2D1<type,s1,s2>(arr,n1); }
+};
+
+// auxiliary classes for chained operator[] dereferencing of FixedArray3D
+//
+template <class type, size_t s1, size_t s2, size_t s3>
+class FixedArray3D1
+{
+  type (&arr)[s1][s2][s3];
+  size_t n1, n2;
+ public:
+  FixedArray3D1(type (&_arr)[s1][s2][s3], size_t _n1, size_t _n2) :
+    arr(_arr), n1(_n1), n2(_n2) {};
+  type& operator[](size_t n3)
+  {
+    check_bounds(n1,s1);
+    check_bounds(n2,s2);
+    check_bounds(n3,s3);
+    return arr[n1][n2][n3];
+  }
+};
+//
+template <class type, size_t s1, size_t s2, size_t s3>
+class FixedArray3D2
+{
+  type (&arr)[s1][s2][s3];
+  size_t n1;
+ public:
+  FixedArray3D2(type (&_arr)[s1][s2][s3], size_t _n1) : arr(_arr), n1(_n1) {};
+  FixedArray3D1<type,s1,s2,s3> operator[](size_t n2)
+    { return FixedArray3D1<type,s1,s2,s3>(arr,n1,n2); }
+};
+
+template <class type, size_t s1, size_t s2, size_t s3>
+struct FixedArray3D
+{
+  type arr [s1][s2][s3];
+ public:
+  type& fetch(size_t n1, size_t n2, size_t n3)
+  {
+    check_bounds(n1,s1);
+    check_bounds(n2,s2);
+    check_bounds(n3,s3);
+    return arr[n1][n2][n3];
+  }
+  // chained operator[] dereferencing requires
+  // auxiliary array dereferencing classes,
+  // since the C standard does not allow one to
+  // return an array with fixed dimensions.
+  FixedArray3D2<type,s1,s2,s3> operator[](size_t n1)
+    { return FixedArray3D2<type,s1,s2,s3>(arr,n1); }
+};
+
+// auxiliary classes for chained operator[] dereferencing of FixedArray4D
+//
+template <class type, size_t s1, size_t s2, size_t s3, size_t s4>
+class FixedArray4D1
+{
+  type (&arr)[s1][s2][s3][s4]; size_t n1,n2,n3; // referenced array and the three leading indices
+ public:
+  FixedArray4D1(type(&_arr)[s1][s2][s3][s4],size_t _n1,size_t _n2,size_t _n3):
+    arr(_arr), n1(_n1), n2(_n2), n3(_n3){};
+  type& operator[](size_t n4) { check_bounds(n1,s1); check_bounds(n2,s2);
+    check_bounds(n3,s3); check_bounds(n4,s4); return arr[n1][n2][n3][n4]; }
+};
+//
+template <class type, size_t s1, size_t s2, size_t s3, size_t s4>
+class FixedArray4D2
+{
+  type (&arr)[s1][s2][s3][s4];
+  size_t n1,n2;
+ public:
+  FixedArray4D2(type (&_arr)[s1][s2][s3][s4], size_t _n1, size_t _n2) :
+    arr(_arr), n1(_n1), n2(_n2){};
+  FixedArray4D1<type,s1,s2,s3,s4> operator[](size_t n3)
+    { return FixedArray4D1<type,s1,s2,s3,s4>(arr,n1,n2,n3); }
+};
+//
+template <class type, size_t s1, size_t s2, size_t s3, size_t s4>
+class FixedArray4D3
+{
+  type (&arr)[s1][s2][s3][s4];
+  size_t n1;
+ public:
+  FixedArray4D3(type (&_arr)[s1][s2][s3][s4], size_t _n1) :
+    arr(_arr), n1(_n1) {};
+  FixedArray4D2<type,s1,s2,s3,s4> operator[](size_t n2)
+    { return FixedArray4D2<type,s1,s2,s3,s4>(arr,n1,n2); }
+};
+
+template <class type, size_t s1, size_t s2, size_t s3, size_t s4>
+class FixedArray4D
+{
+ public:
+  type arr [s1][s2][s3][s4];
+ public:
+  type& fetch(size_t n1, size_t n2, size_t n3, size_t n4)
+  {
+    check_bounds(n1,s1);
+    check_bounds(n2,s2);
+    check_bounds(n3,s3);
+    check_bounds(n4,s4);
+    return arr[n1][n2][n3][n4];
+  }
+  FixedArray4D3<type,s1,s2,s3,s4> operator[](size_t n1)
+    { return FixedArray4D3<type,s1,s2,s3,s4>(arr,n1); }
+};
+/*** end FixedArray classes for use when dimensions are known at compile time. ***/
+
+#endif
diff --git a/include/asserts.h b/include/asserts.h
index a3cd4584..0a8e3b40 100644
--- a/include/asserts.h
+++ b/include/asserts.h
@@ -107,8 +107,11 @@
   void assert_error(const char* file, int line, const char* func, \
       const char* op, const char* lhs_str, const char* rhs_str, \
       t1 lhs, t2 rhs);
-declare_assert_errmsg(double, double);  // this seems enough for all numbers
-declare_assert_errmsg(int, int);  // but maybe this is more efficient
+declare_assert_errmsg(double, double);
+declare_assert_errmsg(size_t, size_t);
+declare_assert_errmsg(int, size_t);
+declare_assert_errmsg(size_t, int);
+declare_assert_errmsg(int, int);
 declare_assert_errmsg(const char *, const char *);
 // put in assert_string.h:
 // #include "assert.h"
diff --git a/tests/Makefile b/tests/Makefile
new file mode 100644
index 00000000..10559ab9
--- /dev/null
+++ b/tests/Makefile
@@ -0,0 +1,38 @@
+
+INCLUDE = -I../include
+
+OBJECTS = \
+  test_arrays.o \
+  ../utility/asserts.o \
+  debug.o
+
+FLAGS = -O3 -DNO_MPI -fno-exceptions #-DCHECK_BOUNDS -ggdb
+
+COMPILER = g++ #icpc # g++
+
+test: clean_test_arrays test_arrays
+
+test_arrays: clean_test_arrays $(OBJECTS)
+	$(COMPILER) $(FLAGS) $(INCLUDE) $(OBJECTS) -o test_arrays
+
+test_arrays.o:
+	$(COMPILER) $(FLAGS) -c test_arrays.cpp $(INCLUDE) -o test_arrays.o
+
+
+../utility/asserts.o:
+	$(COMPILER) $(FLAGS) -c ../utility/asserts.cpp $(INCLUDE) -o ../utility/asserts.o
+
+debug.o:
+	$(COMPILER) $(FLAGS) -c ../utility/debug.cpp $(INCLUDE) -o debug.o
+
+clean: clean_old_test_arrays clean_test_arrays
+	rm -f test_arrays $(OBJECTS)
+
+clean_test_arrays:
+	rm -f test_arrays test_arrays.o
+
+old_test_arrays: clean_old_test_arrays
+	$(COMPILER) $(FLAGS) old_test_arrays.cpp  -o old_test_arrays
+
+clean_old_test_arrays:
+	rm -f old_test_arrays old_test_arrays.o
diff --git a/tests/stopwatch.h b/tests/stopwatch.h
new file mode 100644
index 00000000..c8bfbc5b
--- /dev/null
+++ b/tests/stopwatch.h
@@ -0,0 +1,97 @@
+#include <sys/time.h>
+#include <assert.h>
+#include <stdint.h>
+
+#define myuint64_t int
+//#define myuint64_t uint64_t
+
+static inline myuint64_t tv_to_sec(struct timeval tv){
+   return tv.tv_sec + tv.tv_usec/1000000;
+}
+
+static inline myuint64_t tv_to_ms(struct timeval tv){
+   return tv.tv_sec*1000 + tv.tv_usec/1000;
+}
+
+static inline myuint64_t tv_to_us(struct timeval tv){
+   return tv.tv_sec*1000000 + tv.tv_usec;
+}
+
+
+static inline struct timeval add_tv(const struct timeval a, const struct timeval b){
+   const struct timeval res = { a.tv_sec+b.tv_sec, a.tv_usec+b.tv_usec };
+   return res;
+}
+
+static inline struct timeval diff_tv(const struct timeval start, const struct timeval stop){
+   const struct timeval diff = {stop.tv_sec - start.tv_sec, stop.tv_usec - start.tv_usec};
+   return diff;
+}
+
+typedef enum {START, STOP, LAP, RESET} sw_action_t; // actions a caller can request
+static inline int valid_sw_action(sw_action_t t){ // nonzero iff t is one of the four actions
+   return t == START || t == STOP || t == LAP || t == RESET;
+}
+
+
+typedef enum {OFF, STARTED, STOPPED} sw_state_t;
+static inline int valid_sw_state(sw_state_t s){
+   return s == OFF || s == STOPPED || s == STARTED;
+}
+
+
+typedef struct {
+   sw_state_t state;
+   struct timeval total, now, last;
+} stopwatch_t;
+
+static inline struct timeval sw_start(stopwatch_t * const sw ){
+   sw->state = STARTED;
+   gettimeofday( &sw->now, 0 );
+   sw->last = sw->now;
+   return sw->total;
+}
+
+static inline struct timeval sw_stop(stopwatch_t * const sw ){
+   sw->state = STOPPED;
+   gettimeofday( &sw->now, 0 );
+   sw->total = add_tv(sw->total, diff_tv(sw->last, sw->now));
+   sw->last = sw->now;
+   return sw->total;
+}
+
+static inline struct timeval sw_lap(stopwatch_t * const sw ){
+   gettimeofday( &sw->now, 0 );
+   const struct timeval elapsed = diff_tv(sw->last, sw->now);
+   sw->total = add_tv(sw->total, elapsed);
+   sw->last = sw->now;
+   return elapsed;
+}
+
+static inline struct timeval sw_reset(stopwatch_t * const sw ){
+   const static stopwatch_t sw_off = { OFF, {0, 0}, {0, 0}, {0, 0} };
+   sw->state = OFF;
+   *sw = sw_off;
+   return sw->total;
+}
+
+static inline struct timeval stopwatch_mt(stopwatch_t * const sw, sw_action_t action){
+   typedef struct timeval stopwatch_func_t( stopwatch_t *sw );
+
+   static stopwatch_func_t * const  stopwatch_transitions[3][4] = { {sw_start, 0, 0, 0},
+                                                                        {0, sw_stop, sw_lap, 0},
+                                                                        {sw_start, 0, 0, sw_reset}};
+   assert(sw != 0);
+   assert(valid_sw_action(action));
+   assert(valid_sw_state(sw->state));
+   assert(stopwatch_transitions[sw->state][action] != 0);
+   return stopwatch_transitions[sw->state][action](sw);
+}
+
+
+static inline struct timeval stopwatch(sw_action_t action){ // internal linkage: one stopwatch per including file
+   static stopwatch_t sw = { OFF, {0, 0}, {0, 0}, {0, 0} }; // starts OFF with all times zero
+   return stopwatch_mt(&sw, action);
+}
+
+
diff --git a/tests/test_arrays.cpp b/tests/test_arrays.cpp
new file mode 100644
index 00000000..45cdc92c
--- /dev/null
+++ b/tests/test_arrays.cpp
@@ -0,0 +1,827 @@
+/*
+   Reger Ferrer
+   Vicenç Beltran
+   Alec Johnson
+*/
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include "stopwatch.h"
+#include "arrays.h"
+#include "Alloc.h"
+#include "asserts.h"
+#include "debug.h"
+
+/**** begin Jorge Amaya's array allocation methods ****/
+
+/*! The allocator for 4D array */
+template < class type > type **** newArray4_Amaya(int sz1, int sz2, int sz3, int sz4) {
+
+  type ****all_x;
+  type ***all_y;
+  type **all_z;
+  type *all_r;
+
+  all_x = new type ***[sz1];
+  all_y = new type **[sz1 * sz2];
+  all_z = new type *[sz1 * sz2 * sz3];
+  all_r = new type[sz1 * sz2 * sz3 * sz4];
+
+  type ****result = all_x;
+
+  for (int i = 0; i < sz1; i++, all_y += sz2) {
+    result[i] = all_y;
+    for (int j = 0; j < sz2; j++, all_z += sz3) {
+      result[i][j] = all_z;
+      for (int k = 0; k < sz3; k++, all_r += sz4) {
+        result[i][j][k] = all_r;
+      }
+    }
+  }
+
+  return result;
+}
+
+/*! Deallocator for 4D arrays */
+template < class type > void delArr4_Amaya(type **** arr, int dummyx, int dummyy, int dummyz) {
+  delete[]arr[0][0][0];
+  delete[]arr[0][0];
+  delete[]arr[0];
+  delete[]arr;
+}
+
+/*! The allocator for 3D array */
+template < class type > type *** newArray3_Amaya(int sz1, int sz2, int sz3) {
+
+  type ***all_x;
+  type **all_y;
+  type *all_z;
+
+  all_x = new type **[sz1];
+  all_y = new type *[sz1 * sz2];
+  all_z = new type[sz1 * sz2 * sz3];
+
+  type ***result = all_x;
+
+  for (int i = 0; i < sz1; i++, all_y += sz2) {
+    result[i] = all_y;
+    for (int j = 0; j < sz2; j++, all_z += sz3) {
+      result[i][j] = all_z;
+    }
+  }
+
+  return result;
+
+}
+
+/*! Deallocator for 3D arrays */
+template < class type > void delArr3_Amaya(type *** arr, int dummyx, int dummyy) {
+  delete[]arr[0][0];
+  delete[]arr[0];
+  delete[]arr;
+}
+
+/*! The allocator for 2D array */
+template < class type > type ** newArr2_Amaya(int sz1, int sz2) {
+
+  type **all_x;
+  type *all_y;
+
+  all_x = new type *[sz1];
+  all_y = new type[sz1 * sz2];
+
+  type **result = all_x;
+
+  for (int i = 0; i < sz1; i++, all_y += sz2) {
+    result[i] = all_y;
+  }
+
+  return result;
+
+}
+
+/*! Deallocator for 2D arrays */
+template < class type > void delArr2_Amaya(type ** arr, int dummyx) {
+  delete[]arr[0];
+  delete[]arr;
+}
+
+#define newArr4_Amaya(type,sz1,sz2,sz3,sz4) newArray4_Amaya<type>((sz1),(sz2),(sz3),(sz4))
+#define newArr3_Amaya(type,sz1,sz2,sz3) newArray3_Amaya<type>((sz1),(sz2),(sz3))
+#define newArr2_Amaya(type,sz1,sz2) newArr2_Amaya<type>((sz1),(sz2)) // the 2D template is named newArr2_Amaya
+
+/**** end Jorge Amaya's array allocation methods ****/
+
+/****** begin (i,j) arrays from Reger Ferrer and Vicenç Beltran ******/
+
+template <class type>
+class Rank1
+{
+    const size_t S1;
+    type  * __restrict__ const  arr;
+
+public:
+
+    Rank1(size_t s1) : S1(s1), arr(AlignedAlloc(type, s1)) {}
+
+    //Rank1( const Rank1& other ) : S1( other.S1 ), arr( other.arr ) {}
+
+    type& operator()(size_t n1) const
+    {
+       ALIGNED(arr);
+       return arr[n1];
+    }
+
+    size_t dim1() const { return S1; }
+
+    ~Rank1() { };
+};
+
+template <class type>
+class Rank2
+{
+    const size_t  S1, S2;
+    type * __restrict__ const arr;
+
+
+public:
+    Rank2(size_t s1, size_t s2) : S1(s1), S2(s2), arr(AlignedAlloc(type, s1*s2)) {}
+
+    //Rank2( const Rank2& other ) : S1( other.S1 ), S2( other.S2 ), arr( other.arr ) {}
+
+    type& operator()(size_t n1, size_t n2) const
+    {
+       check_bounds(n1,S1);
+       check_bounds(n2,S2);
+       ALIGNED(arr);
+       return arr[n2+S2*n1];
+    }
+    type& fetch(size_t n1,size_t n2) const
+    {
+       check_bounds(n1,S1);
+       check_bounds(n2,S2);
+       ALIGNED(arr);
+       return arr[n2+S2*n1];
+    }
+
+    size_t dim1() const { return S1; }
+    size_t dim2() const { return S2; }
+    
+    void free() {
+       AlignedFree(arr);
+    }
+
+    ~Rank2() { };
+};
+    
+template <class type>
+class Rank3
+{
+    const size_t S1, S2, S3;
+    type *    const __restrict__ arr;
+
+
+public:
+
+    Rank3(size_t s1, size_t s2, size_t s3) : S1(s1), S2(s2), S3(s3),
+    arr(AlignedAlloc(type, s1*s2*s3)) {}
+
+    //Rank3( const Rank3& other ) : S1( other.S1 ), S2( other.S2 ), S3( other.S3 ),
+    //arr( other.arr ) {}
+
+    type& operator()(size_t n1, size_t n2, size_t n3) const
+    {
+       check_bounds(n1,S1);
+       check_bounds(n2,S2);
+       check_bounds(n3,S3);
+       ALIGNED(arr);
+       return arr[n3+S3*(n2+S2*n1)];
+    }
+    type& fetch(size_t n1, size_t n2, size_t n3) const
+    {
+       check_bounds(n1,S1);
+       check_bounds(n2,S2);
+       check_bounds(n3,S3);
+       ALIGNED(arr);
+       return arr[n3+S3*(n2+S2*n1)];
+    }
+    const type& get(size_t n1, size_t n2, size_t n3) const
+    {
+       check_bounds(n1,S1);
+       check_bounds(n2,S2);
+       check_bounds(n3,S3);
+       ALIGNED(arr);
+       return arr[n3+S3*(n2+S2*n1)];
+    }
+
+    ~Rank3() { }
+
+    size_t dim1() const { return S1; }
+    size_t dim2() const { return S2; }
+    size_t dim3() const { return S3; }
+
+    void free() {
+       AlignedFree(arr);
+    }
+};
+
+template <class type>
+class Rank4
+{
+    const size_t S1, S2, S3, S4;
+    type* __restrict__ const arr;
+
+public:
+
+    Rank4(size_t s1, size_t s2, size_t s3, size_t s4) : S1(s1), S2(s2), S3(s3), S4(s4),
+    arr(AlignedAlloc(type, s1*s2*s3*s4)) {}
+
+    //Rank4( const Rank4& other ) : S1( other.S1 ), S2( other.S2 ), S3( other.S3 ), S4( other.S4 ),
+    //arr( other.arr ) {}
+
+    type& operator()(size_t n1, size_t n2, size_t n3, size_t n4) const
+    {
+       check_bounds(n1,S1);
+       check_bounds(n2,S2);
+       check_bounds(n3,S3);
+       check_bounds(n4,S4);
+       ALIGNED(arr);
+       return arr[n4+S4*(n3+S3*(n2+S2*n1))];
+    }
+
+    ~Rank4() { }
+
+    size_t dim1() const { return S1; } 
+    size_t dim2() const { return S2; }
+    size_t dim3() const { return S3; }
+    size_t dim4() const { return S4; }
+    
+    void free() { AlignedFree(arr); }
+
+};
+
+/******** end (i,j) arrays from Reger Ferrer and Vicenç Beltran ******/
+
+/****** begin [i][j] arrays from Reger Ferrer and Vicenç Beltran ******/
+
+template <class type>
+class BracketRank1
+{
+    const size_t S1;
+    type  * __restrict__ const  arr;
+
+public:
+    BracketRank1(size_t s1, void * __restrict__ const storage) : S1(s1),
+         arr(reinterpret_cast<type * __restrict__ const>(storage)){}
+
+    BracketRank1(size_t s1) : S1(s1), arr(new type[s1]){}
+
+    type& operator[](size_t i) const
+    {
+        return arr[i];
+    }
+
+};
+
+template <class type>
+class BracketRank2
+{
+    const size_t S1, S2;              // extents: S1 rows of S2 elements each
+    type * __restrict__ const arr;    // row-major flat storage
+
+public:
+    // Wrap caller-supplied storage; nothing is allocated.
+    BracketRank2(size_t s1, size_t s2, void *storage)
+      : S1(s1), S2(s2), arr(reinterpret_cast<type * __restrict__ const>(storage)) {}
+
+    // Allocate s1*s2 elements with new[]; pair with free().
+    BracketRank2(size_t s1, size_t s2)
+      : S1(s1), S2(s2), arr(new type[s1*s2]) {}
+
+    // delete[]s arr unconditionally, so call it only on storage from new[].
+    void free() { delete[] arr; }
+    // a[i] yields a rank-1 view of row i, so a[i][j] reaches element (i,j).
+    BracketRank1<type> operator[](size_t i) const
+      { return BracketRank1<type>(S2, arr + S2 * i); }
+    // a(n1,n2) reaches the same element as a[n1][n2].
+    type& operator()(size_t n1, size_t n2) const
+      { ALIGNED(arr); return arr[S2*n1 + n2]; }
+};
+
+/******** end [i][j] arrays from Reger Ferrer and Vicenç Beltran ******/
+
+using namespace std;
+// Benchmark: time A = B*C element-wise over j >= i for several 2D array types, then cross-check.
+template <class type>
+void testArr2_diagonal()
+{
+   const int ITERS = 10000;
+   const size_t dim1 = 64;
+   const size_t dim2 = 64;
+   // One A/B/C triple is declared per array implementation being compared.
+   BracketRank2<type> Abra(dim1, dim2);
+   BracketRank2<type> Bbra(dim1, dim2);
+   BracketRank2<type> Cbra(dim1, dim2);
+
+   Rank2<type> Apar(dim1, dim2);
+   Rank2<type> Bpar(dim1, dim2);
+   Rank2<type> Cpar(dim1, dim2);
+
+   FixedArray2D<type, dim1, dim2> Afix ;
+   FixedArray2D<type, dim1, dim2> Bfix ;
+   FixedArray2D<type, dim1, dim2> Cfix ;
+
+   type** Aold = newArr2(type, dim1, dim2);
+   type** Bold = newArr2(type, dim1, dim2);
+   type** Cold = newArr2(type, dim1, dim2);
+
+   Arr2<type> Aarr(dim1, dim2);
+   Arr2<type> Barr(dim1, dim2);
+   Arr2<type> Carr(dim1, dim2);
+   // B and C receive the same rand() values in every implementation; only j >= i is filled.
+   printf("Initializing data ...\n");
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=i; j<dim2; j++)
+   {
+      Bbra[i][j] = rand();
+      Cbra[i][j] = rand();
+      Bpar(i,j) = Bbra[i][j];
+      Cpar(i,j) = Cbra[i][j];
+      Bfix.fetch(i,j) = Bbra[i][j];
+      Cfix.fetch(i,j) = Cbra[i][j];
+      Bold[i][j] = Bbra[i][j];
+      Cold[i][j] = Cbra[i][j];
+      Barr.fetch(i,j) = Bbra[i][j];
+      Carr.fetch(i,j) = Cbra[i][j];
+   }
+   // Timed sections: each repeats the product ITERS times, then prints tv_to_ms(stopwatch(LAP)).
+   stopwatch(START);
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=i; j<dim2; j++)
+   {
+      Aold[i][j] = Bold[i][j] * Cold[i][j];
+   }
+   printf("%d ms = Total time [i][j] chained-pointer array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=i; j<dim2; j++)
+   {
+      //Afix.fetch(i,j) = Bfix.fetch(i,j) * Cfix.fetch(i,j);
+      Afix[i][j] = Bfix[i][j] * Cfix[i][j];
+   }
+   printf("%d ms = Total time [i][j] fixed-dimension array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=i; j<dim2; j++)
+   {
+      Abra[i][j] = Bbra[i][j] * Cbra[i][j];
+   }
+   printf("%d ms = Total time [i][j] Vicenc array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=i; j<dim2; j++)
+   {
+      Apar.fetch(i,j) = Bpar.fetch(i,j) * Cpar.fetch(i,j);
+      //Apar(i,j) = Bpar(i,j) * Cpar(i,j);
+   }
+   printf("%d ms = Total time (i,j) Vicenc array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=i; j<dim2; j++)
+   {
+      Aarr[i][j] = Barr[i][j] * Carr[i][j];
+   }
+   printf("%d ms = Total time [i][j] access of Arr2\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=i; j<dim2; j++)
+   {
+      //Aarr(i,j) = Barr(i,j) * Carr(i,j);
+      Aarr.fetch(i,j) = Barr.fetch(i,j) * Carr.fetch(i,j);
+   }
+   printf("%d ms = Total time (i,j) access of Arr2\n", tv_to_ms(stopwatch(LAP)));
+   // Cross-check against Abra.  NOTE(review): Apar is not included in this comparison.
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=i; j<dim2; j++)
+   {
+      assert(Afix.fetch(i,j) == Abra[i][j]);
+      assert(Aold[i][j] == Abra[i][j]);
+      assert(Aarr.get(i,j) == Abra[i][j]);
+   }
+
+   printf("Verification done!\n");
+   stopwatch(STOP);
+   // NOTE(review): only Aold/Bold/Cold are released; no free() is called on the bra, par or arr objects.
+   delArr2(Aold,dim1);
+   delArr2(Bold,dim1);
+   delArr2(Cold,dim1);
+}
+// Benchmark: time A = B*C element-wise over full 64x64 arrays for several 2D array types, then cross-check.
+template <class type>
+void testArr2()
+{
+   const int ITERS = 10000;
+   const size_t dim1 = 64;
+   const size_t dim2 = 64;
+   // One A/B/C triple is declared per array implementation being compared.
+   BracketRank2<type> Abra(dim1, dim2);
+   BracketRank2<type> Bbra(dim1, dim2);
+   BracketRank2<type> Cbra(dim1, dim2);
+
+   Rank2<type> Apar(dim1, dim2);
+   Rank2<type> Bpar(dim1, dim2);
+   Rank2<type> Cpar(dim1, dim2);
+
+   FixedArray2D<type, dim1, dim2> Afix ;
+   FixedArray2D<type, dim1, dim2> Bfix ;
+   FixedArray2D<type, dim1, dim2> Cfix ;
+
+   type** Aold = newArr2(type, dim1, dim2);
+   type** Bold = newArr2(type, dim1, dim2);
+   type** Cold = newArr2(type, dim1, dim2);
+
+   Arr2<type> Aarr(dim1, dim2);
+   Arr2<type> Barr(dim1, dim2);
+   Arr2<type> Carr(dim1, dim2);
+   // B and C receive the same rand() values in every implementation.
+   printf("Initializing data ...\n");
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   {
+      Bbra[i][j] = rand();
+      Cbra[i][j] = rand();
+      Bpar(i,j) = Bbra[i][j];
+      Cpar(i,j) = Cbra[i][j];
+      Bfix.fetch(i,j) = Bbra[i][j];
+      Cfix.fetch(i,j) = Cbra[i][j];
+      Bold[i][j] = Bbra[i][j];
+      Cold[i][j] = Cbra[i][j];
+      Barr.fetch(i,j) = Bbra[i][j];
+      Carr.fetch(i,j) = Cbra[i][j];
+   }
+   // Timed sections: each repeats the product ITERS times, then prints tv_to_ms(stopwatch(LAP)).
+   stopwatch(START);
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   {
+      Aold[i][j] = Bold[i][j] * Cold[i][j];
+   }
+   printf("%d ms = Total time [i][j] chained-pointer array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   {
+      Afix.fetch(i,j) = Bfix.fetch(i,j) * Cfix.fetch(i,j); // NOTE(review): label below says [i][j]
+   }
+   printf("%d ms = Total time [i][j] fixed-dimension array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   {
+      Abra[i][j] = Bbra[i][j] * Cbra[i][j];
+   }
+   printf("%d ms = Total time [i][j] Vicenc array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   {
+      Apar(i,j) = Bpar(i,j) * Cpar(i,j);
+   }
+   printf("%d ms = Total time (i,j) Vicenc array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   {
+      Aarr[i][j] = Barr[i][j] * Carr[i][j];
+   }
+   printf("%d ms = Total time [i][j] access of Arr2\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   {
+      Aarr.fetch(i,j) = Barr.get(i,j) * Carr.get(i,j);
+   }
+   printf("%d ms = Total time (i,j) access of Arr2\n", tv_to_ms(stopwatch(LAP)));
+   // Cross-check against Abra.  NOTE(review): Apar is not included in this comparison.
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   {
+      assert(Afix.fetch(i,j) == Abra[i][j]);
+      assert(Aold[i][j] == Abra[i][j]);
+      assert(Aarr.get(i,j) == Abra[i][j]);
+   }
+
+   printf("Verification done!\n");
+   stopwatch(STOP);
+   // NOTE(review): only Aold/Bold/Cold are released; no free() is called on the bra, par or arr objects.
+   delArr2(Aold,dim1);
+   delArr2(Bold,dim1);
+   delArr2(Cold,dim1);
+}
+// Runs statement arg1 (as code, not as a string) ITERS times over all (i,j,k), then prints the lap time labelled arg2.
+#define testArr3nestedFor(arg1, arg2) \
+for(int t=0; t<ITERS; t++) \
+for(size_t i=0; i<dim1; i++) \
+for(size_t j=0; j<dim2; j++) \
+for(size_t k=0; k<dim3; k++) \
+{ \
+   arg1; \
+} \
+printf("%d ms = Total time " #arg2 "\n", tv_to_ms(stopwatch(LAP)));
+// Benchmark: time A = B*C element-wise over 64x64x64 arrays for several 3D array types, then cross-check.
+template <class type>
+void testArr3()
+{
+   const int ITERS = 100;
+   const size_t dim1 = 64;
+   const size_t dim2 = 64;
+   const size_t dim3 = 64;
+   // One A/B/C triple is declared per array implementation being compared.
+   Rank3<type> Apar(dim1, dim2, dim3);
+   Rank3<type> Bpar(dim1, dim2, dim3);
+   Rank3<type> Cpar(dim1, dim2, dim3);
+
+   FixedArray3D<type, dim1, dim2, dim3> Afix ;
+   FixedArray3D<type, dim1, dim2, dim3> Bfix ;
+   FixedArray3D<type, dim1, dim2, dim3> Cfix ;
+
+   type*** Aold = newArr3(type, dim1, dim2, dim3);
+   type*** Bold = newArr3(type, dim1, dim2, dim3);
+   type*** Cold = newArr3(type, dim1, dim2, dim3);
+
+   //Array3<type> Aarr(dim1, dim2, dim3);
+   //Array3<type> Barr(dim1, dim2, dim3);
+   //Array3<type> Carr(dim1, dim2, dim3);
+   Arr3<type> Aarr(dim1, dim2, dim3);
+   Arr3<type> Barr(dim1, dim2, dim3);
+   Arr3<type> Carr(dim1, dim2, dim3);
+   // B and C receive the same rand() values in every implementation.
+   printf("Initializing data ...\n");
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   {
+      Barr.fetch(i,j,k) = rand();
+      Carr.fetch(i,j,k) = rand();
+      Bpar.fetch(i,j,k) = Barr.get(i,j,k);
+      Cpar.fetch(i,j,k) = Carr.get(i,j,k);
+      Bfix.fetch(i,j,k) = Barr.get(i,j,k);
+      Cfix.fetch(i,j,k) = Carr.get(i,j,k);
+      Bold[i][j][k] = Barr.get(i,j,k);
+      Cold[i][j][k] = Carr.get(i,j,k);
+   }
+   // Timed sections: each repeats the product ITERS times, then prints tv_to_ms(stopwatch(LAP)).
+   stopwatch(START);
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   {
+      Aold[i][j][k] = Bold[i][j][k] * Cold[i][j][k];
+   }
+   printf("%d ms = Total time [i][j][k] chained-pointer array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   {
+      Afix[i][j][k] = Bfix[i][j][k] * Cfix[i][j][k];
+      //Afix.arr[i][j][k] = Bfix.arr[i][j][k] * Cfix.arr[i][j][k];
+      //Afix.fetch(i,j,k) = Bfix.fetch(i,j,k) * Cfix.fetch(i,j,k);
+   }
+   printf("%d ms = Total time [i][j][k] fixed-dimension array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   {
+      Apar.fetch(i,j,k) = Bpar.fetch(i,j,k) * Cpar.fetch(i,j,k);
+   }
+   printf("%d ms = Total time (i,j,k) Vicenc array\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   {
+      Aarr[i][j][k] = Barr[i][j][k] * Carr[i][j][k];
+   }
+   printf("%d ms = Total time [i][j][k] access of Arr3\n", tv_to_ms(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   {
+      Aarr.fetch(i,j,k) = Barr.get(i,j,k) * Carr.get(i,j,k);
+   }
+   printf("%d ms = Total time (i,j,k) access of Arr3\n", tv_to_ms(stopwatch(LAP)));
+   // Cross-check the chained-pointer, Vicenc and fixed-dimension results against Aarr.
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   {
+      assert_eq(Aold[i][j][k], Aarr.get(i,j,k));
+      assert_eq(Apar.fetch(i,j,k), Aarr.get(i,j,k));
+      assert_eq(Afix.fetch(i,j,k), Aarr.get(i,j,k));
+   }
+
+   printf("Verification done!\n");
+   stopwatch(STOP);
+
+   // Everything is released by hand: an automatic destructor slows array
+   // access unless compiling with -fno-exceptions.
+   delArr3(Aold,dim1,dim2); delArr3(Bold,dim1,dim2); delArr3(Cold,dim1,dim2);
+   Apar.free();
+   Bpar.free();
+   Cpar.free();
+   Aarr.free();
+   Barr.free();
+   Carr.free();
+}
+// Benchmark: time A = B*C element-wise over 16^4 arrays for several 4D array types, then cross-check.
+template <class type>
+void testArr4()
+{
+   // For some bizarre reason, if I comment out the code for the
+   // "fbr" and "fpa" arrays below then icpc on knc2 is somehow
+   // able to figure out that each iteration does the same thing
+   // in the case of Arr4, but not in the case of the chained
+   // pointer or fixed-dimension arrays.  Why not?  And why
+   // does this optimization occur for four-dimensional arrays
+   // and not for 3- or 2-dimensional arrays?  And why is this
+   // optimization no longer performed if "fbr" and "fpa" stuff
+   // is included?  The times are baffling.
+   const int ITERS = 1;
+   const size_t dim1 = 16;
+   const size_t dim2 = 16;
+   const size_t dim3 = 16;
+   const size_t dim4 = 16;
+   // One A/B/C triple is declared per array implementation being compared.
+   FixedArray4D<type, dim1, dim2, dim3, dim4> Afix;
+   FixedArray4D<type, dim1, dim2, dim3, dim4> Bfix;
+   FixedArray4D<type, dim1, dim2, dim3, dim4> Cfix;
+
+   type**** Aold = newArr4(type, dim1, dim2, dim3, dim4);
+   type**** Bold = newArr4(type, dim1, dim2, dim3, dim4);
+   type**** Cold = newArr4(type, dim1, dim2, dim3, dim4);
+
+   //Array4<type> Afbr(dim1, dim2, dim3, dim4);
+   //Array4<type> Bfbr(dim1, dim2, dim3, dim4);
+   //Array4<type> Cfbr(dim1, dim2, dim3, dim4);
+
+   //Array4<type> Afpa(dim1, dim2, dim3, dim4);
+   //Array4<type> Bfpa(dim1, dim2, dim3, dim4);
+   //Array4<type> Cfpa(dim1, dim2, dim3, dim4);
+   // Both triples below are Arr4: "bra" is timed with [i][j][k][l], "par" with fetch()/get().
+   Arr4<type> Abra(dim1, dim2, dim3, dim4);
+   Arr4<type> Bbra(dim1, dim2, dim3, dim4);
+   Arr4<type> Cbra(dim1, dim2, dim3, dim4);
+
+   Arr4<type> Apar(dim1, dim2, dim3, dim4);
+   Arr4<type> Bpar(dim1, dim2, dim3, dim4);
+   Arr4<type> Cpar(dim1, dim2, dim3, dim4);
+   // B and C receive the same rand() values in every implementation.
+   printf("Initializing data ...\n");
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   for(size_t l=0; l<dim4; l++)
+   {
+      Bbra.fetch(i,j,k,l) = rand();
+      Cbra.fetch(i,j,k,l) = rand();
+      Bpar.fetch(i,j,k,l) = Bbra.get(i,j,k,l);
+      Cpar.fetch(i,j,k,l) = Cbra.get(i,j,k,l);
+      //Bfbr.fetch(i,j,k,l) = Bbra.get(i,j,k,l);
+      //Cfbr.fetch(i,j,k,l) = Cbra.get(i,j,k,l);
+      //Bfpa.fetch(i,j,k,l) = Bbra.get(i,j,k,l);
+      //Cfpa.fetch(i,j,k,l) = Cbra.get(i,j,k,l);
+      Bfix.fetch(i,j,k,l) = Bbra.get(i,j,k,l);
+      Cfix.fetch(i,j,k,l) = Cbra.get(i,j,k,l);
+      Bold[i][j][k][l] = Bbra.get(i,j,k,l);
+      Cold[i][j][k][l] = Cbra.get(i,j,k,l);
+   }
+   // Timed sections: each repeats the product ITERS times, then prints tv_to_us(stopwatch(LAP)).
+   stopwatch(START);
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   for(size_t l=0; l<dim4; l++)
+   {
+      Aold[i][j][k][l] = Bold[i][j][k][l] * Cold[i][j][k][l];
+   }
+   printf("%d us = Total time [i][j][k][l] chained-pointer array\n", tv_to_us(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   for(size_t l=0; l<dim4; l++)
+   {
+      Afix[i][j][k][l] = Bfix[i][j][k][l] * Cfix[i][j][k][l];
+      //Afix.arr[i][j][k][l] = Bfix.arr[i][j][k][l] * Cfix.arr[i][j][k][l];
+      //Afix.fetch(i,j,k,l) = Bfix.fetch(i,j,k,l) * Cfix.fetch(i,j,k,l);
+   }
+   printf("%d us = Total time [i][j][k][l] fixed-dimension array\n", tv_to_us(stopwatch(LAP)));
+
+   //for(int t=0; t<ITERS; t++)
+   //for(size_t i=0; i<dim1; i++)
+   //for(size_t j=0; j<dim2; j++)
+   //for(size_t k=0; k<dim3; k++)
+   //for(size_t l=0; l<dim4; l++)
+   //{
+   //   Afbr.fetch(i,j,k,l) = Bfbr.get(i,j,k,l) * Cfbr.get(i,j,k,l);
+   //}
+   //printf("%d us = Total time (i,j,k,l) access of Array4\n", tv_to_us(stopwatch(LAP)));
+
+   //for(int t=0; t<ITERS; t++)
+   //for(size_t i=0; i<dim1; i++)
+   //for(size_t j=0; j<dim2; j++)
+   //for(size_t k=0; k<dim3; k++)
+   //for(size_t l=0; l<dim4; l++)
+   //{
+   //   Afpa.fetch(i,j,k,l) = Bfpa.get(i,j,k,l) * Cfpa.get(i,j,k,l);
+   //}
+   //printf("%d us = Total time (i,j,k,l) access of Array4\n", tv_to_us(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   for(size_t l=0; l<dim4; l++)
+   {
+      Abra[i][j][k][l] = Bbra[i][j][k][l] * Cbra[i][j][k][l];
+   }
+   printf("%d us = Total time [i][j][k][l] access of Arr4\n", tv_to_us(stopwatch(LAP)));
+
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   for(size_t l=0; l<dim4; l++)
+   {
+      Apar.fetch(i,j,k,l) = Bpar.get(i,j,k,l) * Cpar.get(i,j,k,l);
+   }
+   printf("%d us = Total time (i,j,k,l) access of Arr4\n", tv_to_us(stopwatch(LAP)));
+   // Cross-check against Abra; the outer t loop repeats the whole check ITERS times.
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   for(size_t l=0; l<dim4; l++)
+   {
+      assert_eq(Aold[i][j][k][l], Abra.get(i,j,k,l));
+      assert_eq(Apar.fetch(i,j,k,l), Abra.get(i,j,k,l));
+      //assert_eq(Afbr[i][j][k][l], Abra.get(i,j,k,l));
+      //assert_eq(Afpa.fetch(i,j,k,l), Abra.get(i,j,k,l));
+      assert_eq(Afix.fetch(i,j,k,l), Abra.get(i,j,k,l));
+   }
+
+   printf("Verification done!\n");
+   stopwatch(STOP);
+   // NOTE(review): Aold/Bold/Cold from newArr4 are not released anywhere in this function.
+   Apar.free();
+   Bpar.free();
+   Cpar.free();
+   Abra.free();
+   Bbra.free();
+   Cbra.free();
+}
+
+int main()
+{
+  // Diagonal-only variants, currently switched off:
+  //   testArr2_diagonal<int>();     banner "=== testing Arr2<int> (diagonal) ==="
+  //   testArr2_diagonal<double>();  banner "=== testing Arr2<double> (diagonal) ==="
+  // Every enabled benchmark runs for int first and then for double.
+  fputs("=== testing Arr2<int> ===\n", stdout);
+  testArr2<int>();
+  fputs("=== testing Arr2<double> ===\n", stdout);
+  testArr2<double>();
+  fputs("=== testing Arr3<int> ===\n", stdout);
+  testArr3<int>();
+  fputs("=== testing Arr3<double> ===\n", stdout);
+  testArr3<double>();
+  fputs("=== testing Arr4<int> ===\n", stdout);
+  testArr4<int>();
+  fputs("=== testing Arr4<double> ===\n", stdout);
+  testArr4<double>();
+}
diff --git a/utility/asserts.cpp b/utility/asserts.cpp
index 576944f4..cf3716b6 100644
--- a/utility/asserts.cpp
+++ b/utility/asserts.cpp
@@ -19,5 +19,8 @@ void assert_error(const char *file, int line, const char *func, const char *op,
       abort(); \
   }
 
+implement_assert_errmsg(size_t, size_t);
+implement_assert_errmsg(int, size_t);
+implement_assert_errmsg(size_t, int);
 implement_assert_errmsg(int, int);
 implement_assert_errmsg(const char *, const char *);
diff --git a/utility/debug.cpp b/utility/debug.cpp
index 306775fb..50ad6e01 100644
--- a/utility/debug.cpp
+++ b/utility/debug.cpp
@@ -1,5 +1,7 @@
 
-#include "MPIdata.h" // for get_rank
+#ifndef NO_MPI
+  #include "MPIdata.h" // for get_rank
+#endif
 #include "debug.h"
 
 #define implement_dprintvar_fileLine(code,type) \
@@ -17,8 +19,13 @@ void dfprintf_fileLine(FILE * fptr, const char *func, const char *file, int line
   fflush(fptr);
   va_list args;
   va_start(args, format);
-  fprintf(fptr, "(%d) DEBUG %s(), %s:%d: ",
+  fprintf(fptr,
+#ifndef NO_MPI
+    "(%d) DEBUG %s(), %s:%d: ",
     MPIdata::get_rank(),
+#else
+    "DEBUG %s(), %s:%d: ",
+#endif
     func, file, // my_basename(file),
     line_number);
   /* print out remainder of message */

From 9b4962a7b08e7fd957ad1c6b2f83946f910ed5ef Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 2 Aug 2013 18:23:24 +0200
Subject: [PATCH 025/118] fixed compiler errors introduced in previous commit
 (g++)

---
 fields/EMfields3D.cpp         | 2 +-
 include/Alloc.h               | 1 -
 include/asserts.h             | 1 +
 particles/Particles3D.cpp     | 2 +-
 particles/Particles3Dcomm.cpp | 3 +--
 utility/asserts.cpp           | 1 +
 6 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 7be94a18..e6600592 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -208,7 +208,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   //
   const long long nop_ll = pcls.getNOP();
   const int nop = pcls.getNOP();
-  assert_le(nop_ll,INT_MAX); // else would need to use long long
+  assert_le(nop_ll, (long long) INT_MAX); // else would need to use long long
   // To make memory use scale to a large number of threads, we
   // could first apply an efficient parallel sorting algorithm
   // to the particles and then accumulate moments in smaller
diff --git a/include/Alloc.h b/include/Alloc.h
index a0837911..33b70b1b 100644
--- a/include/Alloc.h
+++ b/include/Alloc.h
@@ -1,7 +1,6 @@
 #ifndef IPIC_ALLOC_H
 #define IPIC_ALLOC_H
 #include <cstddef> // for alignment stuff
-#include "ipicdefs.h" // for CHECK_BOUNDS
 #include "asserts.h" // for assert_le, assert_lt
 //#include "arrays.h" // fixed-dimension arrays
 
diff --git a/include/asserts.h b/include/asserts.h
index 0a8e3b40..e46b9c77 100644
--- a/include/asserts.h
+++ b/include/asserts.h
@@ -112,6 +112,7 @@ declare_assert_errmsg(size_t, size_t);
 declare_assert_errmsg(int, size_t);
 declare_assert_errmsg(size_t, int);
 declare_assert_errmsg(int, int);
+declare_assert_errmsg(long long, long long);
 declare_assert_errmsg(const char *, const char *);
 // put in assert_string.h:
 // #include "assert.h"
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index cc83f3f0..7a4d01d0 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -329,7 +329,7 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
 
   const double dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
   const double inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
-  assert_le(nop,INT_MAX); // else would need to use long long
+  assert_le(nop,(long long) INT_MAX); // else would need to use long long
   // don't bother trying to push any particles simultaneously;
   // MIC already does vectorization automatically, and trying
   // to do it by hand only hurts performance.
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 3e5c1ede..e9c73aaa 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -292,13 +292,12 @@ void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vc
   const double nxn = grid->getNXN();
   const double nyn = grid->getNYN();
   const double nzn = grid->getNZN();
-  assert_le(nop,INT_MAX); // else would need to use long long
+  assert_le(nop,(long long)INT_MAX); // else would need to use long long
   // to make memory use scale to a large number of threads we
   // could first apply an efficient parallel sorting algorithm
   // to the particles and then accumulate moments in smaller
   // subarrays.
   {
-    assert_le(nop,INT_MAX); // else would need to use long long
     for (int i = 0; i < nop; i++)
     {
       const int ix = 2 + int (floor((x[i] - xstart) * inv_dx));
diff --git a/utility/asserts.cpp b/utility/asserts.cpp
index cf3716b6..312fe79b 100644
--- a/utility/asserts.cpp
+++ b/utility/asserts.cpp
@@ -23,4 +23,5 @@ implement_assert_errmsg(size_t, size_t);
 implement_assert_errmsg(int, size_t);
 implement_assert_errmsg(size_t, int);
 implement_assert_errmsg(int, int);
+implement_assert_errmsg(long long, long long);
 implement_assert_errmsg(const char *, const char *);

From 522138871a9bcd007c89b8b630823d864e577510 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 5 Aug 2013 15:47:26 +0200
Subject: [PATCH 026/118] implemented use of doubleArr3 for Moments class

---
 fields/EMfields3D.cpp | 44 +++++++++++------------
 fields/Moments.cpp    | 32 -----------------
 include/Alloc.h       | 21 ++++++++---
 include/Moments.h     | 83 ++++++++++++++++++++++++-------------------
 4 files changed, 86 insertions(+), 94 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index e6600592..d430aae2 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -169,8 +169,8 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) {
   momentsArray = new Moments*[sizeMomentsArray];
   for(int i=0;i<sizeMomentsArray;i++)
   {
-    momentsArray[i] = new Moments;
-    momentsArray[i]->init(nxn,nyn,nzn);
+    momentsArray[i] = new Moments(nxn,nyn,nzn);
+    //momentsArray[i]->init(nxn,nyn,nzn);
   }
 }
 
@@ -220,16 +220,16 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
     Moments& speciesMoments = fetch_momentsArray(thread_num);
     speciesMoments.set_to_zero();
     //
-    double*** rho = speciesMoments.fetch_rho();
-    double*** Jx  = speciesMoments.fetch_Jx();
-    double*** Jy  = speciesMoments.fetch_Jy();
-    double*** Jz  = speciesMoments.fetch_Jz();
-    double*** Pxx = speciesMoments.fetch_Pxx();
-    double*** Pxy = speciesMoments.fetch_Pxy();
-    double*** Pxz = speciesMoments.fetch_Pxz();
-    double*** Pyy = speciesMoments.fetch_Pyy();
-    double*** Pyz = speciesMoments.fetch_Pyz();
-    double*** Pzz = speciesMoments.fetch_Pzz();
+    doubleArr3& rho = speciesMoments.fetch_rho();
+    doubleArr3& Jx  = speciesMoments.fetch_Jx();
+    doubleArr3& Jy  = speciesMoments.fetch_Jy();
+    doubleArr3& Jz  = speciesMoments.fetch_Jz();
+    doubleArr3& Pxx = speciesMoments.fetch_Pxx();
+    doubleArr3& Pxy = speciesMoments.fetch_Pxy();
+    doubleArr3& Pxz = speciesMoments.fetch_Pxz();
+    doubleArr3& Pyy = speciesMoments.fetch_Pyy();
+    doubleArr3& Pyz = speciesMoments.fetch_Pyz();
+    doubleArr3& Pzz = speciesMoments.fetch_Pzz();
     // The following loop is expensive, so it is wise to assume that the
     // compiler is stupid.  Therefore we should on the one hand
     // expand things out and on the other hand avoid repeating computations.
@@ -389,16 +389,16 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
     // One-dimensional array access is presumably
     // more efficient on poor compilers.
     //
-    const double*const rho1d = rho[0][0];
-    const double*const Jx1d  = Jx [0][0];
-    const double*const Jy1d  = Jy [0][0];
-    const double*const Jz1d  = Jz [0][0];
-    const double*const Pxx1d = Pxx[0][0];
-    const double*const Pxy1d = Pxy[0][0];
-    const double*const Pxz1d = Pxz[0][0];
-    const double*const Pyy1d = Pyy[0][0];
-    const double*const Pyz1d = Pyz[0][0];
-    const double*const Pzz1d = Pzz[0][0];
+    doubleArr1 rho1d = rho.fetch_Arr1();
+    doubleArr1 Jx1d  = Jx .fetch_Arr1();
+    doubleArr1 Jy1d  = Jy .fetch_Arr1();
+    doubleArr1 Jz1d  = Jz .fetch_Arr1();
+    doubleArr1 Pxx1d = Pxx.fetch_Arr1();
+    doubleArr1 Pxy1d = Pxy.fetch_Arr1();
+    doubleArr1 Pxz1d = Pxz.fetch_Arr1();
+    doubleArr1 Pyy1d = Pyy.fetch_Arr1();
+    doubleArr1 Pyz1d = Pyz.fetch_Arr1();
+    doubleArr1 Pzz1d = Pzz.fetch_Arr1();
     //
     assert_eq(speciesMoments.get_nx(), nxn);
     assert_eq(speciesMoments.get_ny(), nyn);
diff --git a/fields/Moments.cpp b/fields/Moments.cpp
index 5565929b..6c31a2f1 100644
--- a/fields/Moments.cpp
+++ b/fields/Moments.cpp
@@ -1,38 +1,6 @@
 #include "Moments.h"
 #include "Alloc.h"
 
-// construct empty instance (not zeroed)
-void Moments::init(int nx_, int ny_, int nz_)
-{
-  nx = nx_;
-  ny = ny_;
-  nz = nz_;
-  rho = newArr3(double, nx, ny, nz);
-  Jx = newArr3(double, nx, ny, nz);
-  Jy = newArr3(double, nx, ny, nz);
-  Jz = newArr3(double, nx, ny, nz);
-  pXX = newArr3(double, nx, ny, nz);
-  pXY = newArr3(double, nx, ny, nz);
-  pXZ = newArr3(double, nx, ny, nz);
-  pYY = newArr3(double, nx, ny, nz);
-  pYZ = newArr3(double, nx, ny, nz);
-  pZZ = newArr3(double, nx, ny, nz);
-}
-
-Moments::~Moments() {
-  // nodes and species
-  delArr3(rho, nx, ny);
-  delArr3(Jx, nx, ny);
-  delArr3(Jy, nx, ny);
-  delArr3(Jz, nx, ny);
-  delArr3(pXX, nx, ny);
-  delArr3(pXY, nx, ny);
-  delArr3(pXZ, nx, ny);
-  delArr3(pYY, nx, ny);
-  delArr3(pYZ, nx, ny);
-  delArr3(pZZ, nx, ny);
-}
-
 void Moments::set_to_zero() {
   // #pragma omp parallel for collapse(1)
   for (register int i = 0; i < nx; i++)
diff --git a/include/Alloc.h b/include/Alloc.h
index 33b70b1b..8441221e 100644
--- a/include/Alloc.h
+++ b/include/Alloc.h
@@ -14,6 +14,8 @@
 
     For examples of use of this class,
     see test_arrays.cpp
+
+    An alternative would be to use boost arrays.
 */
 #define ALIGNMENT (64)
 #ifdef __INTEL_COMPILER
@@ -199,14 +201,21 @@ class ArrayRef3
 // }
 //
 // proposed improvements:
+// - allow shifting of the base:
+//   - need "double shift" in each class
+//   - need to implement "arr3.set_bases(b1,b2,b3);"
+//     which calculates "shift".
+//   - need "const size_t b1, b2, b3;" for beginning indices
+//     to allow bounds checking.  Should not incur run-time
+//     penalty, but if so then condition on CHECK_BOUNDS.
 // - methods that use parallel arithmetic for omp and vectorized code
 
 template <class type>
 class Arr1
 {
   private: // data
-    type* const __restrict__ arr;
     const size_t S1;
+    type* const __restrict__ arr;
   public:
     ~Arr1() { }
     void free() { AlignedFree(arr); }
@@ -277,15 +286,19 @@ class Arr2
       { ALIGNED(arr); return arr[getidx(n2,n1)]; }
     void set(size_t n2,size_t n1, type value)
       { ALIGNED(arr); arr[getidx(n2,n1)] = value; }
+    inline Arr1<type>fetch_Arr1(){ return Arr1<type>(arr, S1*S2); }
 };
 
 template <class type>
 class Arr3
 {
   private: // data
-    type* const __restrict__ arr;
     const size_t S3,S2,S1;
+    type* const __restrict__ arr;
   public:
+    size_t dim1()const{return S1;}
+    size_t dim2()const{return S2;}
+    size_t dim3()const{return S3;}
     ~Arr3(){}
     void free() { AlignedFree(arr); }
     Arr3(size_t s3, size_t s2, size_t s1) :
@@ -301,7 +314,6 @@ class Arr3
       check_bounds(n3, S3);
       return ArrayRef2<type>(arr, n3*S2, S2, S1);
     }
-    type* get_arr(){return arr;}
     inline size_t getidx(size_t n3, size_t n2, size_t n1) const
     {
       check_bounds(n3, S3);
@@ -317,6 +329,7 @@ class Arr3
       { ALIGNED(arr); return arr[getidx(n3,n2,n1)]; }
     void set(size_t n3,size_t n2,size_t n1, type value)
       { ALIGNED(arr); arr[getidx(n3,n2,n1)] = value; }
+    inline Arr1<type>fetch_Arr1(){ return Arr1<type>(arr, S1*S2*S3); }
 };
 
 template <class type>
@@ -380,7 +393,6 @@ template <class type>
 struct Array3 : public Arr3<type>
 {
     ~Array3(){Arr3<type>::free();}
-    Arr3<type>& fast_accessor() { return *(Arr3<type>*)this; }
     Array3(size_t s3, size_t s2, size_t s1) : Arr3<type>(s3,s2,s1) { }
 };
 
@@ -406,6 +418,7 @@ typedef Arr1<double> doubleArr1;
 typedef Arr2<double> doubleArr2;
 typedef Arr3<double> doubleArr3;
 typedef Arr4<double> doubleArr4;
+typedef ArrayRef1<double> doubleArrRef1;
 //
 #define newArr4(type,sz1,sz2,sz3,sz4) newArray4<type>((sz1),(sz2),(sz3),(sz4))
 #define newArr3(type,sz1,sz2,sz3) newArray3<type>((sz1),(sz2),(sz3))
diff --git a/include/Moments.h b/include/Moments.h
index cb0018aa..53fe942f 100644
--- a/include/Moments.h
+++ b/include/Moments.h
@@ -1,24 +1,25 @@
 #ifndef Moments_H
 #define Moments_H
+#include "Alloc.h"
 
 // class to accumulate node-centered species moments
 // 
 class Moments {
   private:
-    double ***rho;
+    doubleArr3 rho;
 
     /** current density, defined on nodes */
-    double ***Jx;
-    double ***Jy;
-    double ***Jz;
+    doubleArr3 Jx;
+    doubleArr3 Jy;
+    doubleArr3 Jz;
 
     /** pressure tensor components, defined on nodes */
-    double ***pXX;
-    double ***pXY;
-    double ***pXZ;
-    double ***pYY;
-    double ***pYZ;
-    double ***pZZ;
+    doubleArr3 pXX;
+    doubleArr3 pXY;
+    doubleArr3 pXZ;
+    doubleArr3 pYY;
+    doubleArr3 pYZ;
+    doubleArr3 pZZ;
     int nx;
     int ny;
     int nz;
@@ -27,35 +28,45 @@ class Moments {
     int get_nx() const { return nx; }
     int get_ny() const { return ny; }
     int get_nz() const { return nz; }
-    double get_rho(int i, int j, int k) const { return rho[i][j][k]; }
-    double get_Jx (int i, int j, int k) const { return Jx [i][j][k]; }
-    double get_Jy (int i, int j, int k) const { return Jy [i][j][k]; }
-    double get_Jz (int i, int j, int k) const { return Jz [i][j][k]; }
-    double get_pXX(int i, int j, int k) const { return pXX[i][j][k]; }
-    double get_pXY(int i, int j, int k) const { return pXY[i][j][k]; }
-    double get_pXZ(int i, int j, int k) const { return pXZ[i][j][k]; }
-    double get_pYY(int i, int j, int k) const { return pYY[i][j][k]; }
-    double get_pYZ(int i, int j, int k) const { return pYZ[i][j][k]; }
-    double get_pZZ(int i, int j, int k) const { return pZZ[i][j][k]; }
+    double get_rho(int i, int j, int k) const { return rho.get(i,j,k); }
+    double get_Jx (int i, int j, int k) const { return Jx .get(i,j,k); }
+    double get_Jy (int i, int j, int k) const { return Jy .get(i,j,k); }
+    double get_Jz (int i, int j, int k) const { return Jz .get(i,j,k); }
+    double get_pXX(int i, int j, int k) const { return pXX.get(i,j,k); }
+    double get_pXY(int i, int j, int k) const { return pXY.get(i,j,k); }
+    double get_pXZ(int i, int j, int k) const { return pXZ.get(i,j,k); }
+    double get_pYY(int i, int j, int k) const { return pYY.get(i,j,k); }
+    double get_pYZ(int i, int j, int k) const { return pYZ.get(i,j,k); }
+    double get_pZZ(int i, int j, int k) const { return pZZ.get(i,j,k); }
     // fetch accessors (write access)
-    double*** fetch_rho() { return rho; }
-    double*** fetch_Jx () { return Jx ; }
-    double*** fetch_Jy () { return Jy ; }
-    double*** fetch_Jz () { return Jz ; }
-    double*** fetch_Pxx() { return pXX; }
-    double*** fetch_Pxy() { return pXY; }
-    double*** fetch_Pxz() { return pXZ; }
-    double*** fetch_Pyy() { return pYY; }
-    double*** fetch_Pyz() { return pYZ; }
-    double*** fetch_Pzz() { return pZZ; }
+    doubleArr3& fetch_rho() { return rho; }
+    doubleArr3& fetch_Jx () { return Jx ; }
+    doubleArr3& fetch_Jy () { return Jy ; }
+    doubleArr3& fetch_Jz () { return Jz ; }
+    doubleArr3& fetch_Pxx() { return pXX; }
+    doubleArr3& fetch_Pxy() { return pXY; }
+    doubleArr3& fetch_Pxz() { return pXZ; }
+    doubleArr3& fetch_Pyy() { return pYY; }
+    doubleArr3& fetch_Pyz() { return pYZ; }
+    doubleArr3& fetch_Pzz() { return pZZ; }
   public:
-    Moments() {
+    Moments(int nxn, int nyn, int nzn) :
+      nx(nxn),
+      ny(nyn),
+      nz(nzn),
+      rho (nxn, nyn, nzn),
+      Jx  (nxn, nyn, nzn),
+      Jy  (nxn, nyn, nzn),
+      Jz  (nxn, nyn, nzn),
+      pXX (nxn, nyn, nzn),
+      pXY (nxn, nyn, nzn),
+      pXZ (nxn, nyn, nzn),
+      pYY (nxn, nyn, nzn),
+      pYZ (nxn, nyn, nzn),
+      pZZ (nxn, nyn, nzn)
+    {
     };
-    Moments(int nx_, int ny_, int nz_){
-      init(nx_,ny_,nz_);
-    }
-    void init(int nx_, int ny_, int nz_);
-    ~Moments();
+    ~Moments(){};
     void set_to_zero();
 };
 

From c5328555704eb519601121bc67f9ae6440a972a5 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 7 Aug 2013 13:50:29 +0200
Subject: [PATCH 027/118] iss #43: implemented array classes; fixes iss #44
 (memory leak)

---
 ConfigFile/src/ConfigFile.cpp       |   1 +
 PSKOutput3D/PSKhdf5adaptor.cpp      |  18 +-
 communication/ComInterpNodes3D.cpp  |   3 +-
 communication/ComNodes3D.cpp        |  44 +-
 communication/ComParser3D.cpp       |   1 +
 fields/EMfields3D.cpp               | 779 +++++++++----------------
 grids/Grid3DCU.cpp                  |  55 +-
 iPic3D.cpp                          |   1 +
 include/Alloc.h                     | 859 ++++++++++++++++++----------
 include/Basic.h                     | 139 +++--
 include/ComNodes3D.h                |  27 +-
 include/EMfields3D.h                | 453 +++++++--------
 include/Grid3DCU.h                  |  66 ++-
 include/Moments.h                   |  40 +-
 include/PSKhdf5adaptor.h            |   7 +-
 include/TransArraySpace3D.h         |  19 +-
 include/arraysfwd.h                 |  52 ++
 include/phdf5.h                     |   5 +-
 inputoutput/Collective.cpp          |   1 +
 inputoutput/WriteOutputParallel.cpp |  33 +-
 inputoutput/phdf5.cpp               |   5 +-
 main/iPic3Dlib.cpp                  |   1 +
 particles/Particles3D.cpp           |  64 +--
 tests/Makefile                      |   5 +-
 tests/test_arrays.cpp               | 107 ++--
 25 files changed, 1439 insertions(+), 1346 deletions(-)
 create mode 100644 include/arraysfwd.h

diff --git a/ConfigFile/src/ConfigFile.cpp b/ConfigFile/src/ConfigFile.cpp
index ed9f6b53..173833dd 100644
--- a/ConfigFile/src/ConfigFile.cpp
+++ b/ConfigFile/src/ConfigFile.cpp
@@ -2,6 +2,7 @@
 
 #include "ConfigFile.h"
 #include "errors.h"
+#include "debug.h"
 
 using std::string;
 
diff --git a/PSKOutput3D/PSKhdf5adaptor.cpp b/PSKOutput3D/PSKhdf5adaptor.cpp
index 8a4d2d14..f6033586 100644
--- a/PSKOutput3D/PSKhdf5adaptor.cpp
+++ b/PSKOutput3D/PSKhdf5adaptor.cpp
@@ -658,10 +658,10 @@ void HDF5OutputAdaptor::write(const std::string & tag, const Dimens dimens, cons
   //}
 }
 
-void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, double ***d_array) {
+void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, const_arr3_double d_array) {
   if (dimens.size() != 3) {
     eprintf("Dimens size not 3 for object %s", objname.c_str());
-    //PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(double*** array)");
+    //PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(const_arr3_double array)");
     //throw e;
   }
 
@@ -689,15 +689,15 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
     delete[]d_array_p;
   //}
   //catch(PSK::Exception & e) {
-  //  e.push("In HDF5OutputAdaptor::write(double*** array)");
+  //  e.push("In HDF5OutputAdaptor::write(const_arr3_double array)");
   //  throw e;
   //}
 }
 
-void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, const int ns, double ****d_array) {
+void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, const int ns, const_arr4_double d_array) {
   if (dimens.size() != 3) {
     eprintf("Dimens size not 3 for object %s", objname.c_str());
-    //PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(double**** array)");
+    //PSK::OutputException e("Dimens size not 3 for object " + objname, "HDF5OutputAdaptor::write(const_arr4_double array)");
     //throw e;
   }
 
@@ -724,7 +724,7 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
     delete[]d_array_p;
   //}
   //catch(PSK::Exception & e) {
-  //  e.push("In HDF5OutputAdaptor::write(double**** array)");
+  //  e.push("In HDF5OutputAdaptor::write(const_arr4_double array)");
   //  throw e;
   //}
 }
@@ -752,10 +752,10 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
   //}
 }
 
-void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, const int ns, double ***d_array) {
+void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens, const int ns, const_arr3_double d_array) {
   if (dimens.size() != 2) {
     eprintf("Dimens size not 2 for object %s", objname.c_str());
-    //PSK::OutputException e("Dimens size not 2 for object " + objname, "HDF5OutputAdaptor::write(double*** array)");
+    //PSK::OutputException e("Dimens size not 2 for object " + objname, "HDF5OutputAdaptor::write(const_arr3_double array)");
     //throw e;
   }
 
@@ -770,7 +770,7 @@ void HDF5OutputAdaptor::write(const std::string & objname, const Dimens dimens,
     write(objname, dimens, d_array_p);
     delete[]d_array_p;
   //} catch(PSK::Exception & e) {
-  //  e.push("In HDF5OutputAdaptor::write(double*** array)");
+  //  e.push("In HDF5OutputAdaptor::write(const_arr3_double array)");
   //  throw e;
   //}
 }
diff --git a/communication/ComInterpNodes3D.cpp b/communication/ComInterpNodes3D.cpp
index 1e24dd73..d04e6765 100644
--- a/communication/ComInterpNodes3D.cpp
+++ b/communication/ComInterpNodes3D.cpp
@@ -1,9 +1,10 @@
 
 #include "ComInterpNodes3D.h"
 #include "ipicdefs.h"
+#include "Alloc.h"
 
 /** communicate ghost cells and sum the contribution with a index indicating the number of species*/
-void communicateInterp(int nx, int ny, int nz, int ns, double ****vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateInterp(int nx, int ny, int nz, int ns, double**** vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
diff --git a/communication/ComNodes3D.cpp b/communication/ComNodes3D.cpp
index 977494f4..5d6f4424 100644
--- a/communication/ComNodes3D.cpp
+++ b/communication/ComNodes3D.cpp
@@ -2,10 +2,12 @@
 #include "ComNodes3D.h"
 #include "TimeTasks.h"
 #include "ipicdefs.h"
+#include "Alloc.h"
 
 /** communicate ghost cells (FOR NODES) */
-void communicateNode(int nx, int ny, int nz, double ***vector, VirtualTopology3D * vct) {
+void communicateNode(int nx, int ny, int nz, arr3_double& _vector, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ***vector=_vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -107,8 +109,9 @@ void communicateNode(int nx, int ny, int nz, double ***vector, VirtualTopology3D
   timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR NODES) */
-void communicateNodeBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateNodeBC(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ***vector = _vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -212,8 +215,9 @@ void communicateNodeBC(int nx, int ny, int nz, double ***vector, int bcFaceXrigh
   timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR NODES) with particles BC*/
-void communicateNodeBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateNodeBC_P(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -318,9 +322,10 @@ void communicateNodeBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXri
 }
 
 /** SPECIES: communicate ghost cells */
-void communicateNode(int nx, int ny, int nz, double ****vector, int ns, VirtualTopology3D * vct) {
-
+void communicateNode(int nx, int ny, int nz, arr4_double& _vector, int ns, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ****vector = _vector.fetch_arr4();
+
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -422,8 +427,9 @@ void communicateNode(int nx, int ny, int nz, double ****vector, int ns, VirtualT
 
 // PARTICLES
 /** SPECIES: communicate ghost cells */
-void communicateNode_P(int nx, int ny, int nz, double ****vector, int ns, VirtualTopology3D * vct) {
+void communicateNode_P(int nx, int ny, int nz, arr4_double& _vector, int ns, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ****vector = _vector.fetch_arr4();
 
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -526,9 +532,10 @@ void communicateNode_P(int nx, int ny, int nz, double ****vector, int ns, Virtua
 
 // 
 /** communicate ghost cells (FOR CENTERS) */
-void communicateCenter(int nx, int ny, int nz, double ***vector, VirtualTopology3D * vct) {
-
+void communicateCenter(int nx, int ny, int nz, arr3_double& _vector, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ***vector = _vector.fetch_arr3();
+
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -627,8 +634,9 @@ void communicateCenter(int nx, int ny, int nz, double ***vector, VirtualTopology
   timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
-void communicateCenterBoxStencilBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -659,8 +667,9 @@ void communicateCenterBoxStencilBC(int nx, int ny, int nz, double ***vector, int
 }
 // particles
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
-void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -693,8 +702,9 @@ void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, double ***vector, i
 // 
 
 
-void communicateNodeBoxStencilBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -724,8 +734,9 @@ void communicateNodeBoxStencilBC(int nx, int ny, int nz, double ***vector, int b
   timeTasks.addto_communicate();
 }
 
-void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
   double *ghostXleftFace = new double[(ny - 2) * (nz - 2)];
@@ -758,8 +769,9 @@ void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, double ***vector, int
 
 
 /** SPECIES: communicate ghost cells */
-void communicateCenter(int nx, int ny, int nz, double ****vector, int ns, VirtualTopology3D * vct) {
+void communicateCenter(int nx, int ny, int nz, arr4_double& _vector, int ns, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ****vector=_vector.fetch_arr4();
 
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -858,8 +870,9 @@ void communicateCenter(int nx, int ny, int nz, double ****vector, int ns, Virtua
   timeTasks.addto_communicate();
 }
 // /////////// communication + BC ////////////////////////////
-void communicateCenterBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateCenterBC(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ***vector=_vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -961,8 +974,9 @@ void communicateCenterBC(int nx, int ny, int nz, double ***vector, int bcFaceXri
   timeTasks.addto_communicate();
 }
 // /////////// communication + BC ////////////////////////////
-void communicateCenterBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateCenterBC_P(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
+  double ***vector=_vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
diff --git a/communication/ComParser3D.cpp b/communication/ComParser3D.cpp
index e7996e73..18e374a0 100644
--- a/communication/ComParser3D.cpp
+++ b/communication/ComParser3D.cpp
@@ -1,4 +1,5 @@
 
+#include <mpi.h>
 #include "ComParser3D.h"
 
 /** swap the buffer */
diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index d430aae2..c01b149c 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -7,39 +7,137 @@
 #include "ompdefs.h"
 
 /*! constructor */
-EMfields3D::EMfields3D(Collective * col, Grid * grid) {
-  nxc = grid->getNXC();
-  nxn = grid->getNXN();
-  nyc = grid->getNYC();
-  nyn = grid->getNYN();
-  nzc = grid->getNZC();
-  nzn = grid->getNZN();
-  dx = grid->getDX();
-  dy = grid->getDY();
-  dz = grid->getDZ();
-  invVOL = grid->getInvVOL();
-  xStart = grid->getXstart();
-  xEnd = grid->getXend();
-  yStart = grid->getYstart();
-  yEnd = grid->getYend();
-  zStart = grid->getZstart();
-  zEnd = grid->getZend();
-  Lx = col->getLx();
-  Ly = col->getLy();
-  Lz = col->getLz();
-  ns = col->getNs();
-  c = col->getC();
-  dt = col->getDt();
-  th = col->getTh();
-  ue0 = col->getU0(0);
-  ve0 = col->getV0(0);
-  we0 = col->getW0(0);
-  x_center = col->getx_center();
-  y_center = col->gety_center();
-  z_center = col->getz_center();
-  L_square = col->getL_square();
-
-  delt = c * th * dt;
+//
+// We rely on the following rule from the C++ standard, section 12.6.2.5:
+//
+//   nonstatic data members shall be initialized in the order
+//   they were declared in the class definition
+//
+// in particular, nxc, nyc, nzc and nxn, nyn, nzn are assumed
+// initialized when subsequently used.
+//
+EMfields3D::EMfields3D(Collective * col, Grid * grid) : 
+  nxc(grid->getNXC()),
+  nxn(grid->getNXN()),
+  nyc(grid->getNYC()),
+  nyn(grid->getNYN()),
+  nzc(grid->getNZC()),
+  nzn(grid->getNZN()),
+  dx(grid->getDX()),
+  dy(grid->getDY()),
+  dz(grid->getDZ()),
+  invVOL(grid->getInvVOL()),
+  xStart(grid->getXstart()),
+  xEnd(grid->getXend()),
+  yStart(grid->getYstart()),
+  yEnd(grid->getYend()),
+  zStart(grid->getZstart()),
+  zEnd(grid->getZend()),
+  Lx(col->getLx()),
+  Ly(col->getLy()),
+  Lz(col->getLz()),
+  ns(col->getNs()),
+  c(col->getC()),
+  dt(col->getDt()),
+  th(col->getTh()),
+  ue0(col->getU0(0)),
+  ve0(col->getV0(0)),
+  we0(col->getW0(0)),
+  x_center(col->getx_center()),
+  y_center(col->gety_center()),
+  z_center(col->getz_center()),
+  L_square(col->getL_square()),
+  delt (c*th*dt), // declared after these
+  //
+  // array allocation: nodes
+  //
+  Ex   (nxn, nyn, nzn),
+  Ey   (nxn, nyn, nzn),
+  Ez   (nxn, nyn, nzn),
+  Exth (nxn, nyn, nzn),
+  Eyth (nxn, nyn, nzn),
+  Ezth (nxn, nyn, nzn),
+  Bxn  (nxn, nyn, nzn),
+  Byn  (nxn, nyn, nzn),
+  Bzn  (nxn, nyn, nzn),
+  rhon (nxn, nyn, nzn),
+  Jx   (nxn, nyn, nzn),
+  Jy   (nxn, nyn, nzn),
+  Jz   (nxn, nyn, nzn),
+  Jxh  (nxn, nyn, nzn),
+  Jyh  (nxn, nyn, nzn),
+  Jzh  (nxn, nyn, nzn),
+  //
+  // species-specific quantities
+  //
+  rhons (ns, nxn, nyn, nzn),
+  rhocs (ns, nxc, nyc, nzc),
+  Jxs   (ns, nxn, nyn, nzn),
+  Jys   (ns, nxn, nyn, nzn),
+  Jzs   (ns, nxn, nyn, nzn),
+  pXXsn (ns, nxn, nyn, nzn),
+  pXYsn (ns, nxn, nyn, nzn),
+  pXZsn (ns, nxn, nyn, nzn),
+  pYYsn (ns, nxn, nyn, nzn),
+  pYZsn (ns, nxn, nyn, nzn),
+  pZZsn (ns, nxn, nyn, nzn),
+
+  // array allocation: central points 
+  //
+  PHI  (nxc, nyc, nzc),
+  Bxc  (nxc, nyc, nzc),
+  Byc  (nxc, nyc, nzc),
+  Bzc  (nxc, nyc, nzc),
+  rhoc (nxc, nyc, nzc),
+  rhoh (nxc, nyc, nzc),
+
+  // temporary arrays
+  //
+  tempXC (nxc, nyc, nzc),
+  tempYC (nxc, nyc, nzc),
+  tempZC (nxc, nyc, nzc),
+  //
+  tempXN (nxn, nyn, nzn),
+  tempYN (nxn, nyn, nzn),
+  tempZN (nxn, nyn, nzn),
+  tempC  (nxc, nyc, nzc),
+  tempX  (nxn, nyn, nzn),
+  tempY  (nxn, nyn, nzn),
+  tempZ  (nxn, nyn, nzn),
+  temp2X (nxn, nyn, nzn),
+  temp2Y (nxn, nyn, nzn),
+  temp2Z (nxn, nyn, nzn),
+  imageX (nxn, nyn, nzn),
+  imageY (nxn, nyn, nzn),
+  imageZ (nxn, nyn, nzn),
+  Dx (nxn, nyn, nzn),
+  Dy (nxn, nyn, nzn),
+  Dz (nxn, nyn, nzn),
+  vectX (nxn, nyn, nzn),
+  vectY (nxn, nyn, nzn),
+  vectZ (nxn, nyn, nzn),
+  divC  (nxc, nyc, nzc),
+  // B_ext and J_ext should not be allocated unless used.
+  Bx_ext(nxn,nyn,nzn),
+  By_ext(nxn,nyn,nzn),
+  Bz_ext(nxn,nyn,nzn),
+  Jx_ext(nxn,nyn,nzn),
+  Jy_ext(nxn,nyn,nzn),
+  Jz_ext(nxn,nyn,nzn) 
+{
+  // External imposed fields
+  //
+  B1x = col->getB1x();
+  B1y = col->getB1y();
+  B1z = col->getB1z();
+  if(B1x!=0. || B1y !=0. || B1z!=0.)
+  {
+    eprintf("This functionality has not yet been implemented");
+  }
+  Bx_ext.setall(0.);
+  By_ext.setall(0.);
+  Bz_ext.setall(0.);
+  //
   PoissonCorrection = false;
   if (col->getPoissonCorrection()=="yes") PoissonCorrection = true;
   CGtol = col->getCGtol();
@@ -49,11 +147,11 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) {
     qom[i] = col->getQOM(i);
   // boundary conditions: PHI and EM fields
   bcPHIfaceXright = col->getBcPHIfaceXright();
-  bcPHIfaceXleft = col->getBcPHIfaceXleft();
+  bcPHIfaceXleft  = col->getBcPHIfaceXleft();
   bcPHIfaceYright = col->getBcPHIfaceYright();
-  bcPHIfaceYleft = col->getBcPHIfaceYleft();
+  bcPHIfaceYleft  = col->getBcPHIfaceYleft();
   bcPHIfaceZright = col->getBcPHIfaceZright();
-  bcPHIfaceZleft = col->getBcPHIfaceZleft();
+  bcPHIfaceZleft  = col->getBcPHIfaceZleft();
 
   bcEMfaceXright = col->getBcEMfaceXright();
   bcEMfaceXleft = col->getBcEMfaceXleft();
@@ -65,10 +163,6 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) {
   B0x = col->getB0x();
   B0y = col->getB0y();
   B0z = col->getB0z();
-  // Earth Simulation
-  B1x = col->getB1x();
-  B1y = col->getB1y();
-  B1z = col->getB1z();
   delta = col->getDelta();
   Smooth = col->getSmooth();
   // get the density background for the gem Challange
@@ -96,75 +190,6 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) {
   injFieldsFront  = new injInfoFields(nxn, nyn, nzn);
   injFieldsRear   = new injInfoFields(nxn, nyn, nzn);
 
-  // arrays allocation: nodes
-  Ex = newArr3(double, nxn, nyn, nzn);
-  Ey = newArr3(double, nxn, nyn, nzn);
-  Ez = newArr3(double, nxn, nyn, nzn);
-  Exth = newArr3(double, nxn, nyn, nzn);
-  Eyth = newArr3(double, nxn, nyn, nzn);
-  Ezth = newArr3(double, nxn, nyn, nzn);
-  Bxn = newArr3(double, nxn, nyn, nzn);
-  Byn = newArr3(double, nxn, nyn, nzn);
-  Bzn = newArr3(double, nxn, nyn, nzn);
-  rhon = newArr3(double, nxn, nyn, nzn);
-  Jx = newArr3(double, nxn, nyn, nzn);
-  Jy = newArr3(double, nxn, nyn, nzn);
-  Jz = newArr3(double, nxn, nyn, nzn);
-  Jxh = newArr3(double, nxn, nyn, nzn);
-  Jyh = newArr3(double, nxn, nyn, nzn);
-  Jzh = newArr3(double, nxn, nyn, nzn);
-  // External imposed fields
-  Bx_ext = newArr3(double,nxn,nyn,nzn);
-  By_ext = newArr3(double,nxn,nyn,nzn);
-  Bz_ext = newArr3(double,nxn,nyn,nzn);
-  Jx_ext = newArr3(double,nxn,nyn,nzn);
-  Jy_ext = newArr3(double,nxn,nyn,nzn);
-  Jz_ext = newArr3(double,nxn,nyn,nzn);
-  // involving species
-  rhons = newArr4(double, ns, nxn, nyn, nzn);
-  rhocs = newArr4(double, ns, nxc, nyc, nzc);
-  Jxs = newArr4(double, ns, nxn, nyn, nzn);
-  Jys = newArr4(double, ns, nxn, nyn, nzn);
-  Jzs = newArr4(double, ns, nxn, nyn, nzn);
-  pXXsn = newArr4(double, ns, nxn, nyn, nzn);
-  pXYsn = newArr4(double, ns, nxn, nyn, nzn);
-  pXZsn = newArr4(double, ns, nxn, nyn, nzn);
-  pYYsn = newArr4(double, ns, nxn, nyn, nzn);
-  pYZsn = newArr4(double, ns, nxn, nyn, nzn);
-  pZZsn = newArr4(double, ns, nxn, nyn, nzn);
-  // arrays allocation: central points 
-  PHI = newArr3(double, nxc, nyc, nzc);
-  Bxc = newArr3(double, nxc, nyc, nzc);
-  Byc = newArr3(double, nxc, nyc, nzc);
-  Bzc = newArr3(double, nxc, nyc, nzc);
-  rhoc = newArr3(double, nxc, nyc, nzc);
-  rhoh = newArr3(double, nxc, nyc, nzc);
-
-  // temporary arrays
-  tempXC = newArr3(double, nxc, nyc, nzc);
-  tempYC = newArr3(double, nxc, nyc, nzc);
-  tempZC = newArr3(double, nxc, nyc, nzc);
-
-  tempXN = newArr3(double, nxn, nyn, nzn);
-  tempYN = newArr3(double, nxn, nyn, nzn);
-  tempZN = newArr3(double, nxn, nyn, nzn);
-  tempC = newArr3(double, nxc, nyc, nzc);
-  tempX = newArr3(double, nxn, nyn, nzn);
-  tempY = newArr3(double, nxn, nyn, nzn);
-  tempZ = newArr3(double, nxn, nyn, nzn);
-  temp2X = newArr3(double, nxn, nyn, nzn);
-  temp2Y = newArr3(double, nxn, nyn, nzn);
-  temp2Z = newArr3(double, nxn, nyn, nzn);
-  imageX = newArr3(double, nxn, nyn, nzn);
-  imageY = newArr3(double, nxn, nyn, nzn);
-  imageZ = newArr3(double, nxn, nyn, nzn);
-  Dx = newArr3(double, nxn, nyn, nzn);
-  Dy = newArr3(double, nxn, nyn, nzn);
-  Dz = newArr3(double, nxn, nyn, nzn);
-  vectX = newArr3(double, nxn, nyn, nzn);
-  vectY = newArr3(double, nxn, nyn, nzn);
-  vectZ = newArr3(double, nxn, nyn, nzn);
-  divC = newArr3(double, nxc, nyc, nzc);
   sizeMomentsArray = omp_thread_count();
   momentsArray = new Moments*[sizeMomentsArray];
   for(int i=0;i<sizeMomentsArray;i++)
@@ -195,16 +220,16 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   double const*const q = pcls.getQall();
   //
   const int is = pcls.get_ns();
-  double* rhons1d = rhons[is][0][0];
-  double* Jxs1d   = Jxs  [is][0][0];
-  double* Jys1d   = Jys  [is][0][0];
-  double* Jzs1d   = Jzs  [is][0][0];
-  double* pXXsn1d = pXXsn[is][0][0];
-  double* pXYsn1d = pXYsn[is][0][0];
-  double* pXZsn1d = pXZsn[is][0][0];
-  double* pYYsn1d = pYYsn[is][0][0];
-  double* pYZsn1d = pYZsn[is][0][0];
-  double* pZZsn1d = pZZsn[is][0][0];
+  double* rhons1d = &rhons[is][0][0][0];
+  double* Jxs1d   = &Jxs  [is][0][0][0];
+  double* Jys1d   = &Jys  [is][0][0][0];
+  double* Jzs1d   = &Jzs  [is][0][0][0];
+  double* pXXsn1d = &pXXsn[is][0][0][0];
+  double* pXYsn1d = &pXYsn[is][0][0][0];
+  double* pXZsn1d = &pXZsn[is][0][0][0];
+  double* pYYsn1d = &pYYsn[is][0][0][0];
+  double* pYZsn1d = &pYZsn[is][0][0][0];
+  double* pZZsn1d = &pZZsn[is][0][0][0];
   //
   const long long nop_ll = pcls.getNOP();
   const int nop = pcls.getNOP();
@@ -220,16 +245,16 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
     Moments& speciesMoments = fetch_momentsArray(thread_num);
     speciesMoments.set_to_zero();
     //
-    doubleArr3& rho = speciesMoments.fetch_rho();
-    doubleArr3& Jx  = speciesMoments.fetch_Jx();
-    doubleArr3& Jy  = speciesMoments.fetch_Jy();
-    doubleArr3& Jz  = speciesMoments.fetch_Jz();
-    doubleArr3& Pxx = speciesMoments.fetch_Pxx();
-    doubleArr3& Pxy = speciesMoments.fetch_Pxy();
-    doubleArr3& Pxz = speciesMoments.fetch_Pxz();
-    doubleArr3& Pyy = speciesMoments.fetch_Pyy();
-    doubleArr3& Pyz = speciesMoments.fetch_Pyz();
-    doubleArr3& Pzz = speciesMoments.fetch_Pzz();
+    arr3_double rho = speciesMoments.fetch_rho();
+    arr3_double Jx  = speciesMoments.fetch_Jx();
+    arr3_double Jy  = speciesMoments.fetch_Jy();
+    arr3_double Jz  = speciesMoments.fetch_Jz();
+    arr3_double Pxx = speciesMoments.fetch_Pxx();
+    arr3_double Pxy = speciesMoments.fetch_Pxy();
+    arr3_double Pxz = speciesMoments.fetch_Pxz();
+    arr3_double Pyy = speciesMoments.fetch_Pyy();
+    arr3_double Pyz = speciesMoments.fetch_Pyz();
+    arr3_double Pzz = speciesMoments.fetch_Pzz();
     // The following loop is expensive, so it is wise to assume that the
     // compiler is stupid.  Therefore we should on the one hand
     // expand things out and on the other hand avoid repeating computations.
@@ -389,16 +414,16 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
     // One-dimensional array access is presumably
     // more efficient on poor compilers.
     //
-    doubleArr1 rho1d = rho.fetch_Arr1();
-    doubleArr1 Jx1d  = Jx .fetch_Arr1();
-    doubleArr1 Jy1d  = Jy .fetch_Arr1();
-    doubleArr1 Jz1d  = Jz .fetch_Arr1();
-    doubleArr1 Pxx1d = Pxx.fetch_Arr1();
-    doubleArr1 Pxy1d = Pxy.fetch_Arr1();
-    doubleArr1 Pxz1d = Pxz.fetch_Arr1();
-    doubleArr1 Pyy1d = Pyy.fetch_Arr1();
-    doubleArr1 Pyz1d = Pyz.fetch_Arr1();
-    doubleArr1 Pzz1d = Pzz.fetch_Arr1();
+    double* rho1d = &rho[0][0][0];
+    double* Jx1d  = &Jx [0][0][0];
+    double* Jy1d  = &Jy [0][0][0];
+    double* Jz1d  = &Jz [0][0][0];
+    double* Pxx1d = &Pxx[0][0][0];
+    double* Pxy1d = &Pxy[0][0][0];
+    double* Pxz1d = &Pxz[0][0][0];
+    double* Pyy1d = &Pyy[0][0][0];
+    double* Pyz1d = &Pyz[0][0][0];
+    double* Pzz1d = &Pzz[0][0][0];
     //
     assert_eq(speciesMoments.get_nx(), nxn);
     assert_eq(speciesMoments.get_ny(), nyn);
@@ -432,10 +457,10 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
 void EMfields3D::calculateE(Grid * grid, VirtualTopology3D * vct, Collective *col) {
   if (vct->getCartesian_rank() == 0)
     cout << "*** E CALCULATION ***" << endl;
-  double ***divE = newArr3(double, nxc, nyc, nzc);
-  double ***gradPHIX = newArr3(double, nxn, nyn, nzn);
-  double ***gradPHIY = newArr3(double, nxn, nyn, nzn);
-  double ***gradPHIZ = newArr3(double, nxn, nyn, nzn);
+  array3_double divE     (nxc, nyc, nzc);
+  array3_double gradPHIX (nxn, nyn, nzn);
+  array3_double gradPHIY (nxn, nyn, nzn);
+  array3_double gradPHIZ (nxn, nyn, nzn);
 
   double *xkrylov = new double[3 * (nxn - 2) * (nyn - 2) * (nzn - 2)];  // 3 E components
   double *bkrylov = new double[3 * (nxn - 2) * (nyn - 2) * (nzn - 2)];  // 3 components
@@ -513,11 +538,6 @@ void EMfields3D::calculateE(Grid * grid, VirtualTopology3D * vct, Collective *co
   delete[]bkrylov;
   delete[]xkrylovPoisson;
   delete[]bkrylovPoisson;
-  delArr3(divE, nxc, nyc);
-  delArr3(gradPHIX, nxn, nyn);
-  delArr3(gradPHIY, nxn, nyn);
-  delArr3(gradPHIZ, nxn, nyn);
-
 }
 
 /*! Calculate sorgent for Maxwell solver */
@@ -679,7 +699,7 @@ void EMfields3D::MaxwellImage(double *im, double *vector, Grid * grid, VirtualTo
 }
 
 /*! Calculate PI dot (vectX, vectY, vectZ) */
-void EMfields3D::PIdot(double ***PIdotX, double ***PIdotY, double ***PIdotZ, double ***vectX, double ***vectY, double ***vectZ, int ns, Grid * grid) {
+void EMfields3D::PIdot(arr3_double& PIdotX, arr3_double& PIdotY, arr3_double& PIdotZ, const_arr3_double& vectX, const_arr3_double& vectY, const_arr3_double& vectZ, int ns, Grid * grid) {
   double beta, edotb, omcx, omcy, omcz, denom;
   beta = .5 * qom[ns] * dt / c;
   for (int i = 1; i < nxn - 1; i++)
@@ -688,17 +708,17 @@ void EMfields3D::PIdot(double ***PIdotX, double ***PIdotY, double ***PIdotZ, dou
         omcx = beta * (Bxn[i][j][k] + Bx_ext[i][j][k]);
         omcy = beta * (Byn[i][j][k] + By_ext[i][j][k]);
         omcz = beta * (Bzn[i][j][k] + Bz_ext[i][j][k]);
-        edotb = vectX[i][j][k] * omcx + vectY[i][j][k] * omcy + vectZ[i][j][k] * omcz;
+        edotb = vectX.get(i,j,k) * omcx + vectY.get(i,j,k) * omcy + vectZ.get(i,j,k) * omcz;
         denom = 1 / (1.0 + omcx * omcx + omcy * omcy + omcz * omcz);
-        PIdotX[i][j][k] += (vectX[i][j][k] + (vectY[i][j][k] * omcz - vectZ[i][j][k] * omcy + edotb * omcx)) * denom;
-        PIdotY[i][j][k] += (vectY[i][j][k] + (vectZ[i][j][k] * omcx - vectX[i][j][k] * omcz + edotb * omcy)) * denom;
-        PIdotZ[i][j][k] += (vectZ[i][j][k] + (vectX[i][j][k] * omcy - vectY[i][j][k] * omcx + edotb * omcz)) * denom;
+        PIdotX.fetch(i,j,k) += (vectX.get(i,j,k) + (vectY.get(i,j,k) * omcz - vectZ.get(i,j,k) * omcy + edotb * omcx)) * denom;
+        PIdotY.fetch(i,j,k) += (vectY.get(i,j,k) + (vectZ.get(i,j,k) * omcx - vectX.get(i,j,k) * omcz + edotb * omcy)) * denom;
+        PIdotZ.fetch(i,j,k) += (vectZ.get(i,j,k) + (vectX.get(i,j,k) * omcy - vectY.get(i,j,k) * omcx + edotb * omcz)) * denom;
       }
-
-
 }
 /*! Calculate MU dot (vectX, vectY, vectZ) */
-void EMfields3D::MUdot(double ***MUdotX, double ***MUdotY, double ***MUdotZ, double ***vectX, double ***vectY, double ***vectZ, Grid * grid) {
+void EMfields3D::MUdot(arr3_double& MUdotX, arr3_double& MUdotY, arr3_double& MUdotZ,
+  const_arr3_double& vectX, const_arr3_double& vectY, const_arr3_double& vectZ, Grid * grid)
+{
   double beta, edotb, omcx, omcy, omcz, denom;
   for (int i = 1; i < nxn - 1; i++)
     for (int j = 1; j < nyn - 1; j++)
@@ -715,18 +735,16 @@ void EMfields3D::MUdot(double ***MUdotX, double ***MUdotY, double ***MUdotZ, dou
           omcx = beta * (Bxn[i][j][k] + Bx_ext[i][j][k]);
           omcy = beta * (Byn[i][j][k] + By_ext[i][j][k]);
           omcz = beta * (Bzn[i][j][k] + Bz_ext[i][j][k]);
-          edotb = vectX[i][j][k] * omcx + vectY[i][j][k] * omcy + vectZ[i][j][k] * omcz;
+          edotb = vectX.get(i,j,k) * omcx + vectY.get(i,j,k) * omcy + vectZ.get(i,j,k) * omcz;
           denom = FourPI / 2 * delt * dt / c * qom[is] * rhons[is][i][j][k] / (1.0 + omcx * omcx + omcy * omcy + omcz * omcz);
-          MUdotX[i][j][k] += (vectX[i][j][k] + (vectY[i][j][k] * omcz - vectZ[i][j][k] * omcy + edotb * omcx)) * denom;
-          MUdotY[i][j][k] += (vectY[i][j][k] + (vectZ[i][j][k] * omcx - vectX[i][j][k] * omcz + edotb * omcy)) * denom;
-          MUdotZ[i][j][k] += (vectZ[i][j][k] + (vectX[i][j][k] * omcy - vectY[i][j][k] * omcx + edotb * omcz)) * denom;
+          MUdotX.fetch(i,j,k) += (vectX.get(i,j,k) + (vectY.get(i,j,k) * omcz - vectZ.get(i,j,k) * omcy + edotb * omcx)) * denom;
+          MUdotY.fetch(i,j,k) += (vectY.get(i,j,k) + (vectZ.get(i,j,k) * omcx - vectX.get(i,j,k) * omcz + edotb * omcy)) * denom;
+          MUdotZ.fetch(i,j,k) += (vectZ.get(i,j,k) + (vectX.get(i,j,k) * omcy - vectY.get(i,j,k) * omcx + edotb * omcz)) * denom;
         }
-
   }
-
 }
 /* Interpolation smoothing: Smoothing (vector must already have ghost cells) TO MAKE SMOOTH value as to be different from 1.0 type = 0 --> center based vector ; type = 1 --> node based vector ; */
-void EMfields3D::smooth(double value, double ***vector, int type, Grid * grid, VirtualTopology3D * vct) {
+void EMfields3D::smooth(double value, arr3_double& vector, int type, Grid * grid, VirtualTopology3D * vct) {
 
   int nvolte = 6;
   for (int icount = 1; icount < nvolte + 1; icount++) {
@@ -823,7 +841,7 @@ void EMfields3D::smoothE(double value, VirtualTopology3D * vct, Collective *col)
 }
 
 /* SPECIES: Interpolation smoothing TO MAKE SMOOTH value as to be different from 1.0 type = 0 --> center based vector type = 1 --> node based vector */
-void EMfields3D::smooth(double value, double ****vector, int is, int type, Grid * grid, VirtualTopology3D * vct) {
+void EMfields3D::smooth(double value, arr4_double& vector, int is, int type, Grid * grid, VirtualTopology3D * vct) {
   cout << "Smoothing for Species not implemented in 3D" << endl;
 }
 
@@ -1314,8 +1332,8 @@ void EMfields3D::calculateHatFunctions(Grid * grid, VirtualTopology3D * vct) {
 /*! Image of Poisson Solver */
 void EMfields3D::PoissonImage(double *image, double *vector, Grid * grid, VirtualTopology3D * vct) {
   // allocate 2 three dimensional service vectors
-  double ***temp = newArr3(double, nxc, nyc, nzc);
-  double ***im = newArr3(double, nxc, nyc, nzc);
+  array3_double temp(nxc, nyc, nzc);
+  array3_double im(nxc, nyc, nzc);
   eqValue(0.0, image, (nxc - 2) * (nyc - 2) * (nzc - 2));
   eqValue(0.0, temp, nxc, nyc, nzc);
   eqValue(0.0, im, nxc, nyc, nzc);
@@ -1325,9 +1343,6 @@ void EMfields3D::PoissonImage(double *image, double *vector, Grid * grid, Virtua
   grid->lapC2Cpoisson(im, temp, vct);
   // move from physical space to krylov space
   phys2solver(image, im, nxc, nyc, nzc);
-  // deallocate temporary array and objects
-  delArr3(temp, nxc, nyc);
-  delArr3(im, nxc, nyc);
 }
 /*! interpolate charge density and pressure density from node to center */
 void EMfields3D::interpDensitiesN2C(VirtualTopology3D * vct, Grid * grid) {
@@ -1339,25 +1354,25 @@ void EMfields3D::communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft,
   // interpolate adding common nodes among processors
   timeTasks.start_communicate();
 
-  communicateInterp(nxn, nyn, nzn, ns, rhons, 0, 0, 0, 0, 0, 0, vct);
-  communicateInterp(nxn, nyn, nzn, ns, Jxs, 0, 0, 0, 0, 0, 0, vct);
-  communicateInterp(nxn, nyn, nzn, ns, Jys, 0, 0, 0, 0, 0, 0, vct);
-  communicateInterp(nxn, nyn, nzn, ns, Jzs, 0, 0, 0, 0, 0, 0, vct);
-  communicateInterp(nxn, nyn, nzn, ns, pXXsn, 0, 0, 0, 0, 0, 0, vct);
-  communicateInterp(nxn, nyn, nzn, ns, pXYsn, 0, 0, 0, 0, 0, 0, vct);
-  communicateInterp(nxn, nyn, nzn, ns, pXZsn, 0, 0, 0, 0, 0, 0, vct);
-  communicateInterp(nxn, nyn, nzn, ns, pYYsn, 0, 0, 0, 0, 0, 0, vct);
-  communicateInterp(nxn, nyn, nzn, ns, pYZsn, 0, 0, 0, 0, 0, 0, vct);
-  communicateInterp(nxn, nyn, nzn, ns, pZZsn, 0, 0, 0, 0, 0, 0, vct);
+  communicateInterp(nxn, nyn, nzn, ns, rhons.fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
+  communicateInterp(nxn, nyn, nzn, ns, Jxs  .fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
+  communicateInterp(nxn, nyn, nzn, ns, Jys  .fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
+  communicateInterp(nxn, nyn, nzn, ns, Jzs  .fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
+  communicateInterp(nxn, nyn, nzn, ns, pXXsn.fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
+  communicateInterp(nxn, nyn, nzn, ns, pXYsn.fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
+  communicateInterp(nxn, nyn, nzn, ns, pXZsn.fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
+  communicateInterp(nxn, nyn, nzn, ns, pYYsn.fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
+  communicateInterp(nxn, nyn, nzn, ns, pYZsn.fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
+  communicateInterp(nxn, nyn, nzn, ns, pZZsn.fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
   // calculate the correct densities on the boundaries
   adjustNonPeriodicDensities(ns, vct);
   // put the correct values on ghost cells
   timeTasks.addto_communicate();
 
   communicateNode_P(nxn, nyn, nzn, rhons, ns, vct);
-  communicateNode_P(nxn, nyn, nzn, Jxs, ns, vct);
-  communicateNode_P(nxn, nyn, nzn, Jys, ns, vct);
-  communicateNode_P(nxn, nyn, nzn, Jzs, ns, vct);
+  communicateNode_P(nxn, nyn, nzn, Jxs  , ns, vct);
+  communicateNode_P(nxn, nyn, nzn, Jys  , ns, vct);
+  communicateNode_P(nxn, nyn, nzn, Jzs  , ns, vct);
   communicateNode_P(nxn, nyn, nzn, pXXsn, ns, vct);
   communicateNode_P(nxn, nyn, nzn, pXYsn, ns, vct);
   communicateNode_P(nxn, nyn, nzn, pXZsn, ns, vct);
@@ -2473,7 +2488,10 @@ void EMfields3D::sustensorRightZ(double **susxz, double **susyz, double **suszz)
 }
 
 /*! Perfect conductor boundary conditions: LEFT wall */
-void EMfields3D::perfectConductorLeft(double ***imageX, double ***imageY, double ***imageZ, double ***vectorX, double ***vectorY, double ***vectorZ, int dir, Grid * grid) {
+void EMfields3D::perfectConductorLeft(arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
+  const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ,
+  int dir, Grid * grid)
+{
   double** susxy;
   double** susyy;
   double** suszy;
@@ -2491,9 +2509,9 @@ void EMfields3D::perfectConductorLeft(double ***imageX, double ***imageY, double
       sustensorLeftX(susxx, susyx, suszx);
       for (int i=1; i <  nyn-1;i++)
         for (int j=1; j <  nzn-1;j++){
-          imageX[1][i][j] = vectorX[1][i][j] - (Ex[1][i][j] - susyx[i][j]*vectorY[1][i][j] - suszx[i][j]*vectorZ[1][i][j] - Jxh[1][i][j]*dt*th*FourPI)/susxx[i][j];
-          imageY[1][i][j] = vectorY[1][i][j] - 0.0*vectorY[2][i][j];
-          imageZ[1][i][j] = vectorZ[1][i][j] - 0.0*vectorZ[2][i][j];
+          imageX[1][i][j] = vectorX.get(1,i,j) - (Ex[1][i][j] - susyx[i][j]*vectorY.get(1,i,j) - suszx[i][j]*vectorZ.get(1,i,j) - Jxh[1][i][j]*dt*th*FourPI)/susxx[i][j];
+          imageY[1][i][j] = vectorY.get(1,i,j) - 0.0*vectorY.get(2,i,j);
+          imageZ[1][i][j] = vectorZ.get(1,i,j) - 0.0*vectorZ.get(2,i,j);
         }
       delArr2(susxx,nxn);
       delArr2(susyx,nxn);
@@ -2506,9 +2524,9 @@ void EMfields3D::perfectConductorLeft(double ***imageX, double ***imageY, double
       sustensorLeftY(susxy, susyy, suszy);
       for (int i=1; i < nxn-1;i++)
         for (int j=1; j <  nzn-1;j++){
-          imageX[i][1][j] = vectorX[i][1][j] - 0.0*vectorX[i][2][j];
-          imageY[i][1][j] = vectorY[i][1][j] - (Ey[i][1][j] - susxy[i][j]*vectorX[i][1][j] - suszy[i][j]*vectorZ[i][1][j] - Jyh[i][1][j]*dt*th*FourPI)/susyy[i][j];
-          imageZ[i][1][j] = vectorZ[i][1][j] - 0.0*vectorZ[i][2][j];
+          imageX[i][1][j] = vectorX.get(i,1,j) - 0.0*vectorX.get(i,2,j);
+          imageY[i][1][j] = vectorY.get(i,1,j) - (Ey[i][1][j] - susxy[i][j]*vectorX.get(i,1,j) - suszy[i][j]*vectorZ.get(i,1,j) - Jyh[i][1][j]*dt*th*FourPI)/susyy[i][j];
+          imageZ[i][1][j] = vectorZ.get(i,1,j) - 0.0*vectorZ.get(i,2,j);
         }
       delArr2(susxy,nxn);
       delArr2(susyy,nxn);
@@ -2521,9 +2539,9 @@ void EMfields3D::perfectConductorLeft(double ***imageX, double ***imageY, double
       sustensorLeftZ(susxy, susyy, suszy);
       for (int i=1; i <  nxn-1;i++)
         for (int j=1; j <  nyn-1;j++){
-          imageX[i][j][1] = vectorX[i][j][1];
-          imageY[i][j][1] = vectorX[i][j][1];
-          imageZ[i][j][1] = vectorZ[i][j][1] - (Ez[i][j][1] - susxz[i][j]*vectorX[i][j][1] - susyz[i][j]*vectorY[i][j][1] - Jzh[i][j][1]*dt*th*FourPI)/suszz[i][j];
+          imageX[i][j][1] = vectorX.get(i,j,1);
+          imageY[i][j][1] = vectorY.get(i,j,1); // tangential Y component comes from vectorY, as on the right Z wall (was vectorX: copy/paste slip)
+          imageZ[i][j][1] = vectorZ.get(i,j,1) - (Ez[i][j][1] - susxz[i][j]*vectorX.get(i,j,1) - susyz[i][j]*vectorY.get(i,j,1) - Jzh[i][j][1]*dt*th*FourPI)/suszz[i][j];
         }
       delArr2(susxz,nxn);
       delArr2(susyz,nxn);
@@ -2533,7 +2551,13 @@ void EMfields3D::perfectConductorLeft(double ***imageX, double ***imageY, double
 }
 
 /*! Perfect conductor boundary conditions: RIGHT wall */
-void EMfields3D::perfectConductorRight(double ***imageX, double ***imageY, double ***imageZ, double ***vectorX, double ***vectorY, double ***vectorZ, int dir, Grid * grid) {
+void EMfields3D::perfectConductorRight(
+  arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
+  const_arr3_double& vectorX,
+  const_arr3_double& vectorY,
+  const_arr3_double& vectorZ,
+  int dir, Grid * grid)
+{
   double beta, omcx, omcy, omcz, denom;
   double** susxy;
   double** susyy;
@@ -2552,9 +2576,9 @@ void EMfields3D::perfectConductorRight(double ***imageX, double ***imageY, doubl
       sustensorRightX(susxx, susyx, suszx);
       for (int i=1; i < nyn-1;i++)
         for (int j=1; j <  nzn-1;j++){
-          imageX[nxn-2][i][j] = vectorX[nxn-2][i][j] - (Ex[nxn-2][i][j] - susyx[i][j]*vectorY[nxn-2][i][j] - suszx[i][j]*vectorZ[nxn-2][i][j] - Jxh[nxn-2][i][j]*dt*th*FourPI)/susxx[i][j];
-          imageY[nxn-2][i][j] = vectorY[nxn-2][i][j] - 0.0 * vectorY[nxn-3][i][j];
-          imageZ[nxn-2][i][j] = vectorZ[nxn-2][i][j] - 0.0 * vectorZ[nxn-3][i][j];
+          imageX[nxn-2][i][j] = vectorX.get(nxn-2,i,j) - (Ex[nxn-2][i][j] - susyx[i][j]*vectorY.get(nxn-2,i,j) - suszx[i][j]*vectorZ.get(nxn-2,i,j) - Jxh[nxn-2][i][j]*dt*th*FourPI)/susxx[i][j];
+          imageY[nxn-2][i][j] = vectorY.get(nxn-2,i,j) - 0.0 * vectorY.get(nxn-3,i,j);
+          imageZ[nxn-2][i][j] = vectorZ.get(nxn-2,i,j) - 0.0 * vectorZ.get(nxn-3,i,j);
         }
       delArr2(susxx,nxn);
       delArr2(susyx,nxn);       
@@ -2567,9 +2591,9 @@ void EMfields3D::perfectConductorRight(double ***imageX, double ***imageY, doubl
       sustensorRightY(susxy, susyy, suszy);
       for (int i=1; i < nxn-1;i++)
         for (int j=1; j < nzn-1;j++){
-          imageX[i][nyn-2][j] = vectorX[i][nyn-2][j] - 0.0*vectorX[i][nyn-3][j];
-          imageY[i][nyn-2][j] = vectorY[i][nyn-2][j] - (Ey[i][nyn-2][j] - susxy[i][j]*vectorX[i][nyn-2][j] - suszy[i][j]*vectorZ[i][nyn-2][j] - Jyh[i][nyn-2][j]*dt*th*FourPI)/susyy[i][j];
-          imageZ[i][nyn-2][j] = vectorZ[i][nyn-2][j] - 0.0*vectorZ[i][nyn-3][j];
+          imageX[i][nyn-2][j] = vectorX.get(i,nyn-2,j) - 0.0*vectorX.get(i,nyn-3,j);
+          imageY[i][nyn-2][j] = vectorY.get(i,nyn-2,j) - (Ey[i][nyn-2][j] - susxy[i][j]*vectorX.get(i,nyn-2,j) - suszy[i][j]*vectorZ.get(i,nyn-2,j) - Jyh[i][nyn-2][j]*dt*th*FourPI)/susyy[i][j];
+          imageZ[i][nyn-2][j] = vectorZ.get(i,nyn-2,j) - 0.0*vectorZ.get(i,nyn-3,j);
         }
       delArr2(susxy,nxn);
       delArr2(susyy,nxn);
@@ -2582,9 +2606,9 @@ void EMfields3D::perfectConductorRight(double ***imageX, double ***imageY, doubl
       sustensorRightZ(susxz, susyz, suszz);
       for (int i=1; i < nxn-1;i++)
         for (int j=1; j < nyn-1;j++){
-          imageX[i][j][nzn-2] = vectorX[i][j][nzn-2];
-          imageY[i][j][nzn-2] = vectorY[i][j][nzn-2];
-          imageZ[i][j][nzn-2] = vectorZ[i][j][nzn-2] - (Ez[i][j][nzn-2] - susxz[i][j]*vectorX[i][j][nzn-2] - susyz[i][j]*vectorY[i][j][nzn-2] - Jzh[i][j][nzn-2]*dt*th*FourPI)/suszz[i][j];
+          imageX[i][j][nzn-2] = vectorX.get(i,j,nzn-2);
+          imageY[i][j][nzn-2] = vectorY.get(i,j,nzn-2);
+          imageZ[i][j][nzn-2] = vectorZ.get(i,j,nzn-2) - (Ez[i][j][nzn-2] - susxz[i][j]*vectorX.get(i,j,nzn-2) - susyz[i][j]*vectorY.get(i,j,nzn-2) - Jzh[i][j][nzn-2]*dt*th*FourPI)/suszz[i][j];
         }
       delArr2(susxz,nxn);
       delArr2(susyz,nxn);       
@@ -2594,7 +2618,7 @@ void EMfields3D::perfectConductorRight(double ***imageX, double ***imageY, doubl
 }
 
 /*! Perfect conductor boundary conditions for source: LEFT WALL */
-void EMfields3D::perfectConductorLeftS(double ***vectorX, double ***vectorY, double ***vectorZ, int dir) {
+void EMfields3D::perfectConductorLeftS(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ, int dir) {
 
   double ebc[3];
 
@@ -2640,7 +2664,7 @@ void EMfields3D::perfectConductorLeftS(double ***vectorX, double ***vectorY, dou
 }
 
 /*! Perfect conductor boundary conditions for source: RIGHT WALL */
-void EMfields3D::perfectConductorRightS(double ***vectorX, double ***vectorY, double ***vectorZ, int dir) {
+void EMfields3D::perfectConductorRightS(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ, int dir) {
 
   double ebc[3];
 
@@ -2805,7 +2829,10 @@ void EMfields3D::updateInfoFields(Grid *grid,VirtualTopology3D *vct,Collective *
 
 }
 
-void EMfields3D::BoundaryConditionsEImage(double ***imageX, double ***imageY, double ***imageZ,double ***vectorX, double ***vectorY, double ***vectorZ,int nx, int ny, int nz, VirtualTopology3D *vct,Grid *grid){
+void EMfields3D::BoundaryConditionsEImage(arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
+  const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ,
+  int nx, int ny, int nz, VirtualTopology3D *vct,Grid *grid)
+{
 
   if(vct->getXleft_neighbor()==MPI_PROC_NULL && bcEMfaceXleft == 2) {
     for (int j=1; j < ny-1;j++)
@@ -2865,7 +2892,7 @@ void EMfields3D::BoundaryConditionsEImage(double ***imageX, double ***imageY, do
 
 }
 
-void EMfields3D::BoundaryConditionsB(double ***vectorX, double ***vectorY, double ***vectorZ,int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct){
+void EMfields3D::BoundaryConditionsB(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ,int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct){
 
   if(vct->getXleft_neighbor()==MPI_PROC_NULL && bcEMfaceXleft ==2) {
     for (int j=0; j < ny;j++)
@@ -2948,7 +2975,7 @@ void EMfields3D::BoundaryConditionsB(double ***vectorX, double ***vectorY, doubl
 
 }
 
-void EMfields3D::BoundaryConditionsE(double ***vectorX, double ***vectorY, double ***vectorZ,int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct){
+void EMfields3D::BoundaryConditionsE(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ,int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct){
 
   if(vct->getXleft_neighbor()==MPI_PROC_NULL && bcEMfaceXleft ==2) {
     for (int j=0; j < ny;j++)
@@ -3030,338 +3057,105 @@ void EMfields3D::BoundaryConditionsE(double ***vectorX, double ***vectorY, doubl
   }
 }
 
-/*! get Potential array ** */
-double ***EMfields3D::getPHI() {
-  return (PHI);
-}
-/*! get Ex(X,Y,Z) */
-double &EMfields3D::getEx(int indexX, int indexY, int indexZ) const {
-  return (Ex[indexX][indexY][indexZ]);
-}
-/*! get Electric field component X array */
-double ***EMfields3D::getEx() {
-  return (Ex);
-}
 /*! get Electric Field component X array cell without the ghost cells */
-double ***EMfields3D::getExc(Grid3DCU *grid) {
-  double ***arr;
-  double ***tmp;
-
-  arr = newArr3(double,nxc-2,nyc-2,nzc-2);
-  tmp = newArr3(double,nxc,nyc,nzc);
+void EMfields3D::getExc(arr3_double& arr, Grid3DCU *grid) {
 
+  array3_double tmp(nxc,nyc,nzc);
   grid->interpN2C(tmp, Ex);
 
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=tmp[i][j][k];
-
-  delArr3(tmp,nxc,nyc);
-  return arr;
-}
-/*! get Ey(X,Y,Z) */
-double &EMfields3D::getEy(int indexX, int indexY, int indexZ) const {
-  return (Ey[indexX][indexY][indexZ]);
-}
-/*! get Electric field component Y array */
-double ***EMfields3D::getEy() {
-  return (Ey);
 }
 /*! get Electric Field component Y array cell without the ghost cells */
-double ***EMfields3D::getEyc(Grid3DCU *grid) {
-  double ***arr;
-  double ***tmp;
-
-  arr = newArr3(double,nxc-2,nyc-2,nzc-2);
-  tmp = newArr3(double,nxc,nyc,nzc);
+void EMfields3D::getEyc(arr3_double& arr, Grid3DCU *grid) {
 
+  array3_double tmp(nxc,nyc,nzc);
   grid->interpN2C(tmp, Ey);
 
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=tmp[i][j][k];
-
-  delArr3(tmp,nxc,nyc);
-  return arr;
-}
-/*! get Ez(X,Y,Z) */
-double &EMfields3D::getEz(int indexX, int indexY, int indexZ) const {
-  return (Ez[indexX][indexY][indexZ]);
-}
-/*! get Electric field component Z array */
-double ***EMfields3D::getEz() {
-  return (Ez);
 }
 /*! get Electric Field component Z array cell without the ghost cells */
-double ***EMfields3D::getEzc(Grid3DCU *grid) {
-  double ***arr;
-  double ***tmp;
-
-  arr = newArr3(double,nxc-2,nyc-2,nzc-2);
-  tmp = newArr3(double,nxc,nyc,nzc);
+void EMfields3D::getEzc(arr3_double& arr, Grid3DCU *grid) {
 
+  array3_double tmp(nxc,nyc,nzc);
   grid->interpN2C(tmp, Ez);
 
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=tmp[i][j][k];
-
-  delArr3(tmp,nxc,nyc);
-  return arr;
-}
-/*! get Bx(X,Y,Z) */
-double &EMfields3D::getBx(int indexX, int indexY, int indexZ) const {
-  return (Bxn[indexX][indexY][indexZ]);
-}
-/*! get Magnetic Field component X array */
-double ***EMfields3D::getBx() {
-  return (Bxn);
 }
 /*! get Magnetic Field component X array cell without the ghost cells */
-double ***EMfields3D::getBxc() {
-  double ***arr;
-  arr = newArr3(double,nxc-2,nyc-2,nzc-2);
+void EMfields3D::getBxc(arr3_double& arr) {
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=Bxc[i][j][k];
-  return arr;
-}
-/*! get By(X,Y,Z) */
-double &EMfields3D::getBy(int indexX, int indexY, int indexZ) const {
-  return (Byn[indexX][indexY][indexZ]);
-}
-/*! get Magnetic Field component Y array */
-double ***EMfields3D::getBy() {
-  return (Byn);
 }
 /*! get Magnetic Field component Y array cell without the ghost cells */
-double ***EMfields3D::getByc() {
-  double ***arr;
-  arr = newArr3(double,nxc-2,nyc-2,nzc-2);
+void EMfields3D::getByc(arr3_double& arr) {
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=Byc[i][j][k];
-  return arr;
-}
-/*! get Bz(X,Y,Z) */
-double &EMfields3D::getBz(int indexX, int indexY, int indexZ) const {
-  return (Bzn[indexX][indexY][indexZ]);
-}
-/*! get Magnetic Field component Z array */
-double ***EMfields3D::getBz() {
-  return (Bzn);
 }
 /*! get Magnetic Field component Z array cell without the ghost cells */
-double ***EMfields3D::getBzc() {
-  double ***arr;
-  arr = newArr3(double,nxc-2,nyc-2,nzc-2);
+void EMfields3D::getBzc(arr3_double& arr) {
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=Bzc[i][j][k];
-  return arr;
-}
-/*! get rhoc(X,Y,Z) */
-double &EMfields3D::getRHOc(int indexX, int indexY, int indexZ) const {
-  return (rhoc[indexX][indexY][indexZ]);
-} double ***EMfields3D::getRHOc() {
-  return (rhoc);
-}
-/*! get density on node(indexX,indexY,indexZ) */
-double &EMfields3D::getRHOn(int indexX, int indexY, int indexZ) const {
-  return (rhon[indexX][indexY][indexZ]);
-}
-/*! get density array defined on nodes */
-double ***EMfields3D::getRHOn() {
-  return (rhon);
-}
-/*! get rhos(X,Y,Z) : density for species */
-double &EMfields3D::getRHOns(int indexX, int indexY, int indexZ, int is) const {
-  return (rhons[is][indexX][indexY][indexZ]);
-}
-/*! SPECIES: get density array defined on center cells */
-double &EMfields3D::getRHOcs(int indexX, int indexY, int indexZ, int is) const {
-  return (rhocs[is][indexX][indexY][indexZ]);
-}
-/*! get density array defined on nodes */
-double ****EMfields3D::getRHOns() {
-  return (rhons);
 }
 /*! get species density component X array cell without the ghost cells */
-double ***EMfields3D::getRHOcs(Grid3DCU *grid, int is) {
-  double ***arr;
-  double ****tmp;
-
-  arr = newArr3(double,nxc-2,nyc-2,nzc-2);
-  tmp = newArr4(double,ns,nxc,nyc,nzc);
+void EMfields3D::getRHOcs(arr3_double& arr, Grid3DCU *grid, int is) {
 
+  array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, rhons);
 
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=tmp[is][i][j][k];
-
-  delArr4(tmp,nxc,nyc,nzc);
-  return arr;
-}
-
-/*! get Bx_ext(X,Y,Z)  */
-double &EMfields3D::getBx_ext(int indexX, int indexY, int indexZ) const{
-  return(Bx_ext[indexX][indexY][indexZ]);
-}
-/*!  get By_ext(X,Y,Z) */
-double &EMfields3D::getBy_ext(int indexX, int indexY, int indexZ) const{
-  return(By_ext[indexX][indexY][indexZ]);
-}
-/*!  get Bz_ext(X,Y,Z) */
-double &EMfields3D::getBz_ext(int indexX, int indexY, int indexZ) const{
-  return(Bz_ext[indexX][indexY][indexZ]);
 }
 
-/*! get Bx_ext  */
-double ***EMfields3D::getBx_ext() {
-  return(Bx_ext);
-}
-/*!  get By_ext */
-double ***EMfields3D::getBy_ext() {
-  return(By_ext);
-}
-/*!  get Bz_ext */
-double ***EMfields3D::getBz_ext() {
-  return(Bz_ext);
-}
-
-/*! SPECIES: get pressure tensor component XX defined on nodes */
-double ****EMfields3D::getpXXsn() {
-  return (pXXsn);
-}
-/*! SPECIES: get pressure tensor component XY defined on nodes */
-double ****EMfields3D::getpXYsn() {
-  return (pXYsn);
-}
-/*! SPECIES: get pressure tensor component XZ defined on nodes */
-double ****EMfields3D::getpXZsn() {
-  return (pXZsn);
-}
-/*! SPECIES: get pressure tensor component YY defined on nodes */
-double ****EMfields3D::getpYYsn() {
-  return (pYYsn);
-}
-/*! SPECIES: get pressure tensor component YZ defined on nodes */
-double ****EMfields3D::getpYZsn() {
-  return (pYZsn);
-}
-/*! SPECIES: get pressure tensor component ZZ defined on nodes */
-double ****EMfields3D::getpZZsn() {
-  return (pZZsn);
-}
-/*! get current -Direction X */
-double &EMfields3D::getJx(int indexX, int indexY, int indexZ) const {
-  return (Jx[indexX][indexY][indexZ]);
-}
-/*! get current array X component * */
-double ***EMfields3D::getJx() {
-  return (Jx);
-}
-/*! get current -Direction Y */
-double &EMfields3D::getJy(int indexX, int indexY, int indexZ) const {
-  return (Jy[indexX][indexY][indexZ]);
-}
-/*! get current array Y component * */
-double ***EMfields3D::getJy() {
-  return (Jy);
-}
-/*! get current -Direction Z */
-double &EMfields3D::getJz(int indexX, int indexY, int indexZ) const {
-  return (Jz[indexX][indexY][indexZ]);
-}
-/*! get current array Z component * */
-double ***EMfields3D::getJz() {
-  return (Jz);
-}
-/*!SPECIES: get current array X component */
-double ****EMfields3D::getJxs() {
-  return (Jxs);
-}
-/*! get Jxs(X,Y,Z,is) : density for species */
-double &EMfields3D::getJxs(int indexX, int indexY, int indexZ, int is) const {
-  return (Jxs[is][indexX][indexY][indexZ]);
-}
 /*! get Magnetic Field component X array species is cell without the ghost cells */
-double ***EMfields3D::getJxsc(Grid3DCU *grid, int is) {
-  double ***arr;
-  double ****tmp;
-
-  arr = newArr3(double,nxc-2,nyc-2,nzc-2);
-  tmp = newArr4(double,ns,nxc,nyc,nzc);
+void EMfields3D::getJxsc(arr3_double& arr, Grid3DCU *grid, int is) {
 
+  array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, Jxs);
 
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=tmp[is][i][j][k];
-
-  delArr4(tmp,nxc,nyc,nzc);
-  return arr;
-}
-/*! SPECIES: get current array Y component */
-double ****EMfields3D::getJys() {
-  return (Jys);
 }
-/*! get Jxs(X,Y,Z,is) : density for species */
-double &EMfields3D::getJys(int indexX, int indexY, int indexZ, int is) const {
-  return (Jys[is][indexX][indexY][indexZ]);
-}
-/*! get current component Y array species is cell without the ghost cells */
-double ***EMfields3D::getJysc(Grid3DCU *grid, int is) {
-  double ***arr;
-  double ****tmp;
 
-  arr = newArr3(double,nxc-2,nyc-2,nzc-2);
-  tmp = newArr4(double,ns,nxc,nyc,nzc);
+/*! get current component Y array species is cell without the ghost cells */
+void EMfields3D::getJysc(arr3_double& arr, Grid3DCU *grid, int is) {
 
+  array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, Jys);
 
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=tmp[is][i][j][k];
-
-  delArr4(tmp,nxc,nyc,nzc);
-  return arr;
-}
-/*!SPECIES: get current array Z component */
-double ****EMfields3D::getJzs() {
-  return (Jzs);
-}
-/*! get Jxs(X,Y,Z,is) : density for species */
-double &EMfields3D::getJzs(int indexX, int indexY, int indexZ, int is) const {
-  return (Jzs[is][indexX][indexY][indexZ]);
 }
 /*! get current component Z array species is cell without the ghost cells */
-double ***EMfields3D::getJzsc(Grid3DCU *grid, int is) {
-  double ***arr;
-  double ****tmp;
-
-  arr = newArr3(double,nxc-2,nyc-2,nzc-2);
-  tmp = newArr4(double,ns,nxc,nyc,nzc);
+void EMfields3D::getJzsc(arr3_double& arr, Grid3DCU *grid, int is) {
 
+  array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, Jzs);
 
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=tmp[is][i][j][k];
-
-  delArr4(tmp,nxc,nyc,nzc);
-  return arr;
 }
 /*! get the electric field energy */
 double EMfields3D::getEenergy(void) {
@@ -3401,67 +3195,16 @@ double EMfields3D::getBenergy(void) {
 void EMfields3D::print(void) const {
 }
 
-/*! destructor: deallocate arrays */
+/*! destructor*/
 EMfields3D::~EMfields3D() {
-  // nodes
-  delArr3(Ex, nxn, nyn);
-  delArr3(Ey, nxn, nyn);
-  delArr3(Ez, nxn, nyn);
-  delArr3(Exth, nxn, nyn);
-  delArr3(Eyth, nxn, nyn);
-  delArr3(Ezth, nxn, nyn);
-  delArr3(Bxn, nxn, nyn);
-  delArr3(Byn, nxn, nyn);
-  delArr3(Bzn, nxn, nyn);
-  delArr3(rhon, nxn, nyn);
-  delArr3(Jx, nxn, nyn);
-  delArr3(Jy, nxn, nyn);
-  delArr3(Jz, nxn, nyn);
-  delArr3(Jxh, nxn, nyn);
-  delArr3(Jyh, nxn, nyn);
-  delArr3(Jzh, nxn, nyn);
-  // nodes and species
-  delArr4(rhons, ns, nxn, nyn);
-  delArr4(Jxs, ns, nxn, nyn);
-  delArr4(Jys, ns, nxn, nyn);
-  delArr4(Jzs, ns, nxn, nyn);
-  delArr4(pXXsn, ns, nxn, nyn);
-  delArr4(pXYsn, ns, nxn, nyn);
-  delArr4(pXZsn, ns, nxn, nyn);
-  delArr4(pYYsn, ns, nxn, nyn);
-  delArr4(pYZsn, ns, nxn, nyn);
-  delArr4(pZZsn, ns, nxn, nyn);
-  // central points
-  delArr3(PHI, nxc, nyc);
-  delArr3(Bxc, nxc, nyc);
-  delArr3(Byc, nxc, nyc);
-  delArr3(Bzc, nxc, nyc);
-  delArr3(rhoc, nxc, nyc);
-  delArr3(rhoh, nxc, nyc);
-  // various stuff needs to be deallocated too
-  delArr3(tempXC, nxc, nyc);
-  delArr3(tempYC, nxc, nyc);
-  delArr3(tempZC, nxc, nyc);
-  delArr3(tempXN, nxn, nyn);
-  delArr3(tempYN, nxn, nyn);
-  delArr3(tempZN, nxn, nyn);
-  delArr3(tempC, nxc, nyc);
-  delArr3(tempX, nxn, nyn);
-  delArr3(tempY, nxn, nyn);
-  delArr3(tempZ, nxn, nyn);
-  delArr3(temp2X, nxn, nyn);
-  delArr3(temp2Y, nxn, nyn);
-  delArr3(temp2Z, nxn, nyn);
-  delArr3(imageX, nxn, nyn);
-  delArr3(imageY, nxn, nyn);
-  delArr3(imageZ, nxn, nyn);
-  delArr3(Dx, nxn, nyn);
-  delArr3(Dy, nxn, nyn);
-  delArr3(Dz, nxn, nyn);
-  delArr3(vectX, nxn, nyn);
-  delArr3(vectY, nxn, nyn);
-  delArr3(vectZ, nxn, nyn);
-  delArr3(divC, nxc, nyc);
+  delete [] qom;
+  delete [] rhoINIT;
+  delete injFieldsLeft;
+  delete injFieldsRight;
+  delete injFieldsTop;
+  delete injFieldsBottom;
+  delete injFieldsFront;
+  delete injFieldsRear;
   for(int i=0;i<sizeMomentsArray;i++)
   {
     delete momentsArray[i];
diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index b222e8d5..a533f481 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -88,7 +88,7 @@ void Grid3DCU::print(VirtualTopology3D * ptVCT) {
 }
 
 /** calculate gradient on nodes, given a scalar field defined on central points  */
-void Grid3DCU::gradC2N(double ***gradXN, double ***gradYN, double ***gradZN, double ***scFieldC) {
+void Grid3DCU::gradC2N(arr3_double& gradXN, arr3_double& gradYN, arr3_double& gradZN, const_arr3_double& scFieldC) {
   for (register int i = 1; i < nxn - 1; i++)
     for (register int j = 1; j < nyn - 1; j++)
       for (register int k = 1; k < nzn - 1; k++) {
@@ -99,7 +99,7 @@ void Grid3DCU::gradC2N(double ***gradXN, double ***gradYN, double ***gradZN, dou
 }
 
 /** calculate gradient on nodes, given a scalar field defined on central points  */
-void Grid3DCU::gradN2C(double ***gradXC, double ***gradYC, double ***gradZC, double ***scFieldN) {
+void Grid3DCU::gradN2C(arr3_double& gradXC, arr3_double& gradYC, arr3_double& gradZC, const_arr3_double& scFieldN) {
   for (register int i = 1; i < nxc - 1; i++)
     for (register int j = 1; j < nyc - 1; j++)
       for (register int k = 1; k < nzc - 1; k++) {
@@ -110,7 +110,7 @@ void Grid3DCU::gradN2C(double ***gradXC, double ***gradYC, double ***gradZC, dou
 }
 
 /** calculate divergence on central points, given a vector field defined on nodes  */
-void Grid3DCU::divN2C(double ***divC, double ***vecFieldXN, double ***vecFieldYN, double ***vecFieldZN) {
+void Grid3DCU::divN2C(arr3_double& divC, const_arr3_double& vecFieldXN, const_arr3_double& vecFieldYN, const_arr3_double& vecFieldZN) {
   double compX;
   double compY;
   double compZ;
@@ -125,7 +125,7 @@ void Grid3DCU::divN2C(double ***divC, double ***vecFieldXN, double ***vecFieldYN
 }
 
 /** calculate divergence on central points, given a Tensor field defined on nodes  */
-void Grid3DCU::divSymmTensorN2C(double ***divCX, double ***divCY, double ***divCZ, double ****pXX, double ****pXY, double ****pXZ, double ****pYY, double ****pYZ, double ****pZZ, int ns) {
+void Grid3DCU::divSymmTensorN2C(arr3_double& divCX, arr3_double& divCY, arr3_double& divCZ, const_arr4_double& pXX, const_arr4_double& pXY, const_arr4_double& pXZ, const_arr4_double& pYY, const_arr4_double& pYZ, const_arr4_double& pZZ, int ns) {
   double comp1X, comp2X, comp3X;
   double comp1Y, comp2Y, comp3Y;
   double comp1Z, comp2Z, comp3Z;
@@ -148,7 +148,7 @@ void Grid3DCU::divSymmTensorN2C(double ***divCX, double ***divCY, double ***divC
 }
 
 /** calculate divergence on nodes, given a vector field defined on central points  */
-void Grid3DCU::divC2N(double ***divN, double ***vecFieldXC, double ***vecFieldYC, double ***vecFieldZC) {
+void Grid3DCU::divC2N(arr3_double& divN, const_arr3_double& vecFieldXC, const_arr3_double& vecFieldYC, const_arr3_double& vecFieldZC) {
   double compX;
   double compY;
   double compZ;
@@ -163,7 +163,7 @@ void Grid3DCU::divC2N(double ***divN, double ***vecFieldXC, double ***vecFieldYC
 }
 
 /** calculate curl on nodes, given a vector field defined on central points  */
-void Grid3DCU::curlC2N(double ***curlXN, double ***curlYN, double ***curlZN, double ***vecFieldXC, double ***vecFieldYC, double ***vecFieldZC) {
+void Grid3DCU::curlC2N(arr3_double& curlXN, arr3_double& curlYN, arr3_double& curlZN, const_arr3_double& vecFieldXC, const_arr3_double& vecFieldYC, const_arr3_double& vecFieldZC) {
   double compZDY, compYDZ;
   double compXDZ, compZDX;
   double compYDX, compXDY;
@@ -187,7 +187,9 @@ void Grid3DCU::curlC2N(double ***curlXN, double ***curlYN, double ***curlZN, dou
 }
 
 /** calculate curl on central points, given a vector field defined on nodes  */
-void Grid3DCU::curlN2C(double ***curlXC, double ***curlYC, double ***curlZC, double ***vecFieldXN, double ***vecFieldYN, double ***vecFieldZN) {
+void Grid3DCU::curlN2C(arr3_double& curlXC, arr3_double& curlYC, arr3_double& curlZC,
+  const_arr3_double& vecFieldXN, const_arr3_double& vecFieldYN, const_arr3_double& vecFieldZN)
+{
   double compZDY, compYDZ;
   double compXDZ, compZDX;
   double compYDX, compXDY;
@@ -215,12 +217,12 @@ void Grid3DCU::curlN2C(double ***curlXC, double ***curlYC, double ***curlZC, dou
 }
 
 /** calculate laplacian on nodes, given a scalar field defined on nodes */
-void Grid3DCU::lapN2N(double ***lapN, double ***scFieldN, VirtualTopology3D * vct) {
+void Grid3DCU::lapN2N(arr3_double& lapN, const_arr3_double& scFieldN, VirtualTopology3D * vct) {
   // calculate laplacian as divercence of gradient
   // allocate 3 gradients: defined on central points
-  double ***gradXC = newArr3(double, nxc, nyc, nzc);
-  double ***gradYC = newArr3(double, nxc, nyc, nzc);
-  double ***gradZC = newArr3(double, nxc, nyc, nzc);
+  array3_double gradXC(nxc, nyc, nzc);
+  array3_double gradYC(nxc, nyc, nzc);
+  array3_double gradZC(nxc, nyc, nzc);
 
   gradN2C(gradXC, gradYC, gradZC, scFieldN);
   // communicate with BC
@@ -228,19 +230,15 @@ void Grid3DCU::lapN2N(double ***lapN, double ***scFieldN, VirtualTopology3D * vc
   communicateCenterBC(nxc, nyc, nzc, gradYC, 1, 1, 1, 1, 1, 1, vct);
   communicateCenterBC(nxc, nyc, nzc, gradZC, 1, 1, 1, 1, 1, 1, vct);
   divC2N(lapN, gradXC, gradYC, gradZC);
-  // deallocate
-  delArr3(gradXC, nxc, nyc);
-  delArr3(gradYC, nxc, nyc);
-  delArr3(gradZC, nxc, nyc);
 }
 
 /** calculate laplacian on central points, given a scalar field defined on central points */
-void Grid3DCU::lapC2C(double ***lapC, double ***scFieldC, VirtualTopology3D * vct) {
+void Grid3DCU::lapC2C(arr3_double& lapC, const_arr3_double& scFieldC, VirtualTopology3D * vct) {
   // calculate laplacian as divercence of gradient
   // allocate 3 gradients: defined on nodes
-  double ***gradXN = newArr3(double, nxn, nyn, nzn);
-  double ***gradYN = newArr3(double, nxn, nyn, nzn);
-  double ***gradZN = newArr3(double, nxn, nyn, nzn);
+  array3_double gradXN(nxn, nyn, nzn);
+  array3_double gradYN(nxn, nyn, nzn);
+  array3_double gradZN(nxn, nyn, nzn);
 
   gradC2N(gradXN, gradYN, gradZN, scFieldC);
   if (vct->getYleft_neighbor() == MPI_PROC_NULL) {
@@ -268,15 +266,10 @@ void Grid3DCU::lapC2C(double ***lapC, double ***scFieldC, VirtualTopology3D * vc
       }
   }
   divN2C(lapC, gradXN, gradYN, gradZN);
-
-  delArr3(gradXN, nxn, nyn);
-  delArr3(gradYN, nxn, nyn);
-  delArr3(gradZN, nxn, nyn);
-
 }
 
 /** calculate laplacian on central points, given a scalar field defined on central points for Poisson */
-void Grid3DCU::lapC2Cpoisson(double ***lapC, double ***scFieldC, VirtualTopology3D * vct) {
+void Grid3DCU::lapC2Cpoisson(arr3_double& lapC, arr3_double& scFieldC, VirtualTopology3D * vct) {
   // communicate first the scFieldC
   communicateCenterBoxStencilBC(nxc, nyc, nzc, scFieldC, 1, 1, 1, 1, 1, 1, vct);
   for (register int i = 1; i < nxc - 1; i++)
@@ -286,7 +279,7 @@ void Grid3DCU::lapC2Cpoisson(double ***lapC, double ***scFieldC, VirtualTopology
 }
 
 /** calculate divergence on  boundaries */
-void Grid3DCU::divBCleft(double ***divBC, double ***vectorX, double ***vectorY, double ***vectorZ, int leftActiveNode, int dirDER) {
+void Grid3DCU::divBCleft(arr3_double& divBC, const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ, int leftActiveNode, int dirDER) {
   double compX, compY, compZ;
   switch (dirDER) {
     case 0:                    // DIVERGENCE DIRECTION X
@@ -323,7 +316,7 @@ void Grid3DCU::divBCleft(double ***divBC, double ***vectorX, double ***vectorY,
 }
 
 /** calculate divergence on  boundaries */
-void Grid3DCU::divBCright(double ***divBC, double ***vectorX, double ***vectorY, double ***vectorZ, int rightActiveNode, int dirDER) {
+void Grid3DCU::divBCright(arr3_double& divBC, const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ, int rightActiveNode, int dirDER) {
   double compX, compY, compZ;
 
 
@@ -362,7 +355,7 @@ void Grid3DCU::divBCright(double ***divBC, double ***vectorX, double ***vectorY,
 }
 
 /** calculate derivative on left boundary */
-void Grid3DCU::derBC(double ***derBC, double ***vector, int leftActiveNode, int dirDER) {
+void Grid3DCU::derBC(arr3_double& derBC, const_arr3_double& vector, int leftActiveNode, int dirDER) {
   switch (dirDER) {
     case 0:                    // DERIVATIVE DIRECTION X
       for (register int j = 1; j < nyc - 1; j++)
@@ -385,7 +378,7 @@ void Grid3DCU::derBC(double ***derBC, double ***vector, int leftActiveNode, int
 }
 
 /** interpolate on nodes from central points: do this for the magnetic field*/
-void Grid3DCU::interpC2N(double ***vecFieldN, double ***vecFieldC) {
+void Grid3DCU::interpC2N(arr3_double& vecFieldN, const_arr3_double& vecFieldC) {
   for (register int i = 1; i < nxn - 1; i++)
     for (register int j = 1; j < nyn - 1; j++)
       for (register int k = 1; k < nzn - 1; k++)
@@ -393,7 +386,7 @@ void Grid3DCU::interpC2N(double ***vecFieldN, double ***vecFieldC) {
 }
 
 /** interpolate on central points from nodes */
-void Grid3DCU::interpN2C(double ***vecFieldC, double ***vecFieldN) {
+void Grid3DCU::interpN2C(arr3_double& vecFieldC, const_arr3_double& vecFieldN) {
   for (register int i = 1; i < nxc - 1; i++)
     for (register int j = 1; j < nyc - 1; j++)
       for (register int k = 1; k < nzc - 1; k++)
@@ -401,7 +394,7 @@ void Grid3DCU::interpN2C(double ***vecFieldC, double ***vecFieldN) {
 }
 
 /** interpolate on central points from nodes */
-void Grid3DCU::interpN2C(double ****vecFieldC, int ns, double ****vecFieldN) {
+void Grid3DCU::interpN2C(arr4_double& vecFieldC, int ns, const_arr4_double& vecFieldN) {
   for (register int i = 1; i < nxc - 1; i++)
     for (register int j = 1; j < nyc - 1; j++)
       for (register int k = 1; k < nzc - 1; k++)
diff --git a/iPic3D.cpp b/iPic3D.cpp
index d670264c..ec9b59ba 100644
--- a/iPic3D.cpp
+++ b/iPic3D.cpp
@@ -2,6 +2,7 @@
 #include <mpi.h>
 #include <iomanip>
 #include "iPic3D.h"
+#include "debug.h"
 
 using namespace iPic3D;
 
diff --git a/include/Alloc.h b/include/Alloc.h
index 8441221e..5c3c5b5d 100644
--- a/include/Alloc.h
+++ b/include/Alloc.h
@@ -2,6 +2,8 @@
 #define IPIC_ALLOC_H
 #include <cstddef> // for alignment stuff
 #include "asserts.h" // for assert_le, assert_lt
+//#include "errors.h" // for assert_le, assert_lt
+#include "arraysfwd.h"
 //#include "arrays.h" // fixed-dimension arrays
 
 /*
@@ -15,7 +17,47 @@
     For examples of use of this class,
     see test_arrays.cpp
 
-    An alternative would be to use boost arrays.
+    Compiler options:
+    -DCHECK_BOUNDS: check bounds when performing array access
+      (major performance penalty).
+    -DFLAT_ARRAYS: use calculated 1d subscript to dereference
+      even for arr[i][j][k] notation.
+    -DCHAINED_ARRAYS: use hierarchy of pointers to dereference
+      even for arr.get(i,j,k) notation.
+
+    By default, chained pointers are used for arr[i][j][k]
+    notation (unless -DCHECK_BOUNDS is turned on, in which case
+    we don't care about performance anyway), and calculated 1d
+    subscript is used for arr.get(i,j,k) notation.
+
+    An alternative would have been to use boost arrays.  Use of our
+    own array class allows flexibility for our choice of array
+    implementation, including the possibility of using boost
+    for the implementation, while avoiding boost as an external
+    dependency.  On some systems, it may be preferable to use
+    native arrays with hard-coded dimensions; this could suit us
+    well, since all arrays are approximately the same size, but
+    would require a recompile when changing the maximum array size.
+
+    Rather than using these templates directly, the typedefs
+    declared in "arraysfwd.h" should be used:
+
+    * const_arr3_double = const_array_ref3<double>
+    * arr3_double = array_ref3<double>
+    * array3_double = array3<double>
+
+    The point is that we do not want to hard-code the fact that
+    we are using templates, and we may well wish to eliminate use
+    of templates in the future.  (Alternatives are to use the
+    preprocessor or to have separate implementations for each
+    type (double, int, possibly float) if we go to use of mixed
+    precision).  Support for templates is notoriously buggy in
+    compilers, particularly when it comes to inheritance, and I
+    in fact had to eliminate inheriting from the base_arr class
+    and use the "protected" hack below in order to get this
+    code to compile on the latest intel compiler (2013) and on
+    g++ 4.0 (2005); g++ 4.2 (2007) compiled (but unfortunately,
+    for my g++ 4.2, iPic3D suffered from stack frame corruption.)
 */
 #define ALIGNMENT (64)
 #ifdef __INTEL_COMPILER
@@ -85,6 +127,48 @@ inline type **** newArray4(size_t sz1, size_t sz2, size_t sz3, size_t sz4)
   return arr;
 }
 
+// build chained pointer hierarchy for pre-existing bottom level
+//
+template <class type>
+inline type **** newArray4(type * in, size_t sz1, size_t sz2, size_t sz3, size_t sz4)
+{
+  type****arr = newArray3<type*>(sz1,sz2,sz3);
+  type**arr2 = **arr;
+  type *ptr = in;
+  size_t szarr2 = sz1*sz2*sz3;
+  for(size_t i=0;i<szarr2;i++) {
+    arr2[i] = ptr;
+    ptr += sz4;
+  }
+  return arr;
+}
+template <class type>
+inline type *** newArray3(type * in, size_t sz1, size_t sz2, size_t sz3)
+{
+  type***arr = newArray2<type*>(sz1,sz2);
+  type**arr2 = *arr;
+  type *ptr = in;
+  size_t szarr2 = sz1*sz2;
+  for(size_t i=0;i<szarr2;i++) {
+    arr2[i] = ptr;
+    ptr += sz3;
+  }
+  return arr;
+}
+template <class type>
+inline type ** newArray2(type * in, size_t sz1, size_t sz2)
+{
+  type**arr = newArray2<type*>(sz1);
+  type**arr2 = arr;
+  type *ptr = in;
+  size_t szarr2 = sz1;
+  for(size_t i=0;i<szarr2;i++) {
+    arr2[i] = ptr;
+    ptr += sz2;
+  }
+  return arr;
+}
+
 // methods to deallocate arrays
 //
 template < class type > inline void delArray1(type * arr)
@@ -107,319 +191,504 @@ template <class type> inline void delArr3(type *** arr, size_t sz1, size_t sz2)
 template <class type> inline void delArr4(type **** arr,
   size_t sz1, size_t sz2, size_t sz3)
 { delArray3(arr); }
-
-// classes to dereference arrays.
-//
-// ArrayRefN is essentially a dumbed-down version of ArrN with
-// an index shift applied to the underlying array.  The purpose
-// of ArrayRefN is to allow elements of multidimensional arrays
-// to be accessed with a calculated one-dimensional index while
-// using chained operator[] syntax (e.g. myarr[i][j]), i.e. the
-// same syntax as is used for native or nested arrays.  This
-// implementation is likely to be slow unless optimization is
-// turned on, allowing the compiler to figure out that the whole
-// chain of calls to the operator[] methods and to the ArrayRefN
-// constructors reduces to computing a one-dimensional subscript
-// used to access a one-dimensional array.
-//
-template <class type>
-class ArrayRef1
+  
+namespace iPic3D
 {
-  type* const __restrict__ arr;
-  const size_t S1;
-  const size_t shift;
- public:
-  inline ArrayRef1(type*const arr_, size_t k, size_t s1) :
-    arr(arr_), shift(k), S1(s1)
-  {}
-  inline type& operator[](size_t n1){
-    check_bounds(n1, S1);
-    ALIGNED(arr);
-    return arr[shift+n1];
-  }
-};
-
-template <class type>
-class ArrayRef2
-{
-  type* const __restrict__ arr;
-  const size_t shift;
-  const size_t S2, S1;
- public:
-  inline ArrayRef2(type*const arr_, size_t k, size_t s2, size_t s1) :
-    arr(arr_), shift(k), S2(s2), S1(s1)
-  {}
-  inline ArrayRef1<type> operator[](size_t n2){
-    check_bounds(n2,S2);
-    return ArrayRef1<type>(arr, (shift+n2)*S1, S1);
-  }
-};
-
-template <class type>
-class ArrayRef3
-{
-  type* const __restrict__ arr;
-  const size_t shift;
-  const size_t S3, S2, S1;
- public:
-  inline ArrayRef3(type*const arr_, size_t k, size_t s3, size_t s2, size_t s1) :
-    arr(arr_), shift(k), S3(s3), S2(s2), S1(s1)
-  {}
-  inline ArrayRef2<type> operator[](size_t n3){
-    check_bounds(n3, S3);
-    return ArrayRef2<type>(arr, (shift+n3)*S2, S2, S1);
-  }
-};
-
-// ArrN can adopt an array allocated by newArrN
-//
-// The purpose of these classes is to provide more efficient
-// and more regulated access to array elements.  The idea is to
-// maintain backward compatibility while allowing us to move
-// toward a proper array abstraction.
-//
-// The user of ArrN is responsible for memory management.
-// The ArrayN classes are the version of this class
-// with automatic deallocation.
-//
-// Examples:
-//
-// Using constructor to create array:
-// {
-//   Arr2 arr<int>(16, 16);
-//   arr[1][2] = 5;
-//   arr.free();
-// }
-// Using ArrN to adopt an array allocated by newArrN
-// {
-//   int** array = newArray2<int>(16,16)
-//   Arr2 arr(array,16,16); // adopt array
-//   arr[1][2] = 5;
-//   assert_eq(arr[1][2],array[1][2]);
-//   // arr.free(); // should not do both this and next line.
-//   delArray2<int>(array);
-// }
-//
-// proposed improvements:
-// - allow shifting of the base:
-//   - need "double shift" in each class
-//   - need to implement "arr3.set_bases(b1,b2,b3);"
-//     which calculates "shift".
-//   - need "const size_t b1, b2, b3;" for beginning indices
-//     to allow bounds checking.  Should not incur run-time
-//     penalty, but it so then condition on CHECK_BOUNDS.
-// - methods that use parallel arithmetic for omp and vectorized code
-
-template <class type>
-class Arr1
-{
-  private: // data
-    const size_t S1;
+  // underlying 1-dimensional array class for arrays
+  
+  template <class type>
+  class base_arr
+  {
+    private:
+      size_t size;
+    protected:
+      type* const __restrict__ arr;
+      type* get_arr()const{return arr;}
+    public:
+      base_arr(size_t s) : size(s), arr(AlignedAlloc(type, s)) {}
+      base_arr(type* in, size_t s) : size(s), arr(in) {}
+      ~base_arr(){}
+      void free() { AlignedFree(arr); }
+      void setall(type val){
+        for(size_t i=0;i<size;i++) arr[i]=val;
+      }
+      //type* fetch_arr(){return arr;}
+  };
+  
+  // classes to dereference arrays.
+  //
+  // ArrayGetN is essentially a dumbed-down version of array_refN with
+  // an index shift applied to the underlying array.  The purpose
+  // of ArrayGetN is to allow elements of multidimensional arrays
+  // to be accessed with a calculated one-dimensional index while
+  // using chained operator[] syntax (e.g. myarr[i][j]), i.e. the
+  // same syntax as is used for native or nested arrays.  This
+  // implementation is likely to be slow unless optimization is
+  // turned on, allowing the compiler to figure out that the whole
+  // chain of calls to the operator[] methods and to the ArrayGetN
+  // constructors reduces to computing a one-dimensional subscript
+  // used to access a one-dimensional array.
+  //
+  template <class type>
+  class ArrayGet1
+  {
     type* const __restrict__ arr;
-  public:
-    ~Arr1() { }
-    void free() { AlignedFree(arr); }
-    Arr1(size_t s1) :
-      S1(s1),
-      arr(AlignedAlloc(type, s1))
-    { }
-    Arr1(type* in,
-      size_t s1) :
-      S1(s1),
-      arr(in)
-    { }
+    const size_t S1;
+    const size_t shift;
+   public:
+    inline ArrayGet1(type*const arr_, size_t k, size_t s1) :
+      arr(arr_), shift(k), S1(s1)
+    {}
     inline type& operator[](size_t n1){
       check_bounds(n1, S1);
       ALIGNED(arr);
-      return arr[n1];
-    }
-    inline size_t getidx(size_t n1) const
-    {
-      check_bounds(n1, S1);
-      return n1;
+      return arr[shift+n1];
     }
-    const type& get(size_t n1) const
-      { ALIGNED(arr); return arr[getidx(n1)]; }
-    type& fetch(size_t n2,size_t n1) const
-      { ALIGNED(arr); return arr[getidx(n1)]; }
-    void set(size_t n1, type value)
-      { ALIGNED(arr); arr[getidx(n1)] = value; }
-};
-
-template <class type>
-class Arr2
-{
-  private: // data
-    const size_t S2,S1;
+  };
+  
+  template <class type>
+  class ArrayGet2
+  {
     type* const __restrict__ arr;
-  public:
-    ~Arr2(){}
-    void free() { AlignedFree(arr); }
-    Arr2(size_t s2, size_t s1) :
-      S2(s2), S1(s1),
-      arr(AlignedAlloc(type, s2*s1))
-    {
-    }
-    Arr2(type*const* in,
-      size_t s2, size_t s1) :
-      S2(s2), S1(s1),
-      arr(*in)
-    { }
-    // for backwards compatibility support bracket notation
-    inline ArrayRef1<type> operator[](size_t n2){
-      check_bounds(n2, S2);
-      return ArrayRef1<type>(arr, n2*S1, S1);
+    const size_t shift;
+    const size_t S2, S1;
+   public:
+    inline ArrayGet2(type*const arr_, size_t k, size_t s2, size_t s1) :
+      arr(arr_), shift(k), S2(s2), S1(s1)
+    {}
+    inline ArrayGet1<type> operator[](size_t n2){
+      check_bounds(n2,S2);
+      return ArrayGet1<type>(arr, (shift+n2)*S1, S1);
     }
-    inline size_t getidx(size_t n2, size_t n1) const
-    {
-      check_bounds(n2, S2);
-      check_bounds(n1, S1);
-      return n2*S1+n1;
-    }
-    // I prefer "fetch" over operator() to hilight read/write access
-    //type& operator()(size_t n2, size_t n1) const
-    //  { ALIGNED(arr); return arr[n1+S1*n2]; }
-    type& fetch(size_t n2,size_t n1) const
-      { ALIGNED(arr); return arr[getidx(n2,n1)]; }
-    // better to use accessors that distinguish read from write:
-    const type& get(size_t n2,size_t n1) const
-      { ALIGNED(arr); return arr[getidx(n2,n1)]; }
-    void set(size_t n2,size_t n1, type value)
-      { ALIGNED(arr); arr[getidx(n2,n1)] = value; }
-    inline Arr1<type>fetch_Arr1(){ return Arr1<type>(arr, S1*S2); }
-};
-
-template <class type>
-class Arr3
-{
-  private: // data
-    const size_t S3,S2,S1;
+  };
+  
+  template <class type>
+  class ArrayGet3
+  {
     type* const __restrict__ arr;
-  public:
-    size_t dim1()const{return S1;}
-    size_t dim2()const{return S2;}
-    size_t dim3()const{return S3;}
-    ~Arr3(){}
-    void free() { AlignedFree(arr); }
-    Arr3(size_t s3, size_t s2, size_t s1) :
-      S3(s3), S2(s2), S1(s1),
-      arr(AlignedAlloc(type, s3*s2*s1))
-    { }
-    Arr3(type*const*const* in,
-      size_t s3, size_t s2, size_t s1) :
-      S3(s3), S2(s2), S1(s1),
-      arr(**in)
-    { }
-    inline ArrayRef2<type> operator[](size_t n3){
+    const size_t shift;
+    const size_t S3, S2, S1;
+   public:
+    inline ArrayGet3(type*const arr_, size_t k, size_t s3, size_t s2, size_t s1) :
+      arr(arr_), shift(k), S3(s3), S2(s2), S1(s1)
+    {}
+    inline ArrayGet2<type> operator[](size_t n3){
       check_bounds(n3, S3);
-      return ArrayRef2<type>(arr, n3*S2, S2, S1);
+      return ArrayGet2<type>(arr, (shift+n3)*S2, S2, S1);
     }
-    inline size_t getidx(size_t n3, size_t n2, size_t n1) const
-    {
-      check_bounds(n3, S3);
-      check_bounds(n2, S2);
+  };
+  
+  // const versions
+  
+  template <class type>
+  class const_array_get1
+  {
+    type* const __restrict__ arr;
+    const size_t S1;
+    const size_t shift;
+   public:
+    inline const_array_get1(type*const arr_, size_t k, size_t s1) :
+      arr(arr_), shift(k), S1(s1)
+    {}
+    inline const type& operator[](size_t n1)const{
       check_bounds(n1, S1);
-      return (n3*S2+n2)*S1+n1;
+      ALIGNED(arr);
+      return arr[shift+n1];
     }
-    //type& operator()(size_t n3, size_t n2, size_t n1) const
-    //{ ALIGNED(arr); return arr[getidx(n3,n2,n1)]; }
-    type& fetch(size_t n3,size_t n2,size_t n1) const
-      { ALIGNED(arr); return arr[getidx(n3,n2,n1)]; }
-    const type& get(size_t n3,size_t n2,size_t n1) const
-      { ALIGNED(arr); return arr[getidx(n3,n2,n1)]; }
-    void set(size_t n3,size_t n2,size_t n1, type value)
-      { ALIGNED(arr); arr[getidx(n3,n2,n1)] = value; }
-    inline Arr1<type>fetch_Arr1(){ return Arr1<type>(arr, S1*S2*S3); }
-};
-
-template <class type>
-class Arr4
-{
-  private: // data
-    const size_t S4,S3,S2,S1;
+  };
+  
+  template <class type>
+  class const_array_get2
+  {
     type* const __restrict__ arr;
-  public:
-    ~Arr4(){} // nonempty destructor would kill performance
-    void free() { AlignedFree(arr); }
-    Arr4(size_t s4, size_t s3, size_t s2, size_t s1) :
-      arr(AlignedAlloc(type, s4*s3*s2*s1)),
-      S4(s4), S3(s3), S2(s2), S1(s1)
-    { }
-    Arr4(type*const*const*const* in,
-      size_t s4, size_t s3, size_t s2, size_t s1) :
-      S4(s4), S3(s3), S2(s2), S1(s1),
-      arr(***in)
-    { }
-    inline ArrayRef3<type> operator[](size_t n4){
-      check_bounds(n4, S4);
-      return ArrayRef3<type>(arr, n4*S3, S3, S2, S1);
+    const size_t shift;
+    const size_t S2, S1;
+   public:
+    inline const_array_get2(type*const arr_, size_t k, size_t s2, size_t s1) :
+      arr(arr_), shift(k), S2(s2), S1(s1)
+    {}
+    inline const const_array_get1<type> operator[](size_t n2)const{
+      check_bounds(n2,S2);
+      return const_array_get1<type>(arr, (shift+n2)*S1, S1);
     }
-    inline size_t getidx(size_t n4, size_t n3, size_t n2, size_t n1) const
-    {
-      check_bounds(n4, S4);
+  };
+  
+  template <class type>
+  class const_array_get3
+  {
+    type* const __restrict__ arr;
+    const size_t shift;
+    const size_t S3, S2, S1;
+   public:
+    const_array_get3(type*const arr_, size_t k, size_t s3, size_t s2, size_t s1) :
+      arr(arr_), shift(k), S3(s3), S2(s2), S1(s1)
+    {}
+    inline const const_array_get2<type> operator[](size_t n3)const{
       check_bounds(n3, S3);
-      check_bounds(n2, S2);
-      check_bounds(n1, S1);
-      return ((n4*S3+n3)*S2+n2)*S1+n1;
+      return const_array_get2<type>(arr, (shift+n3)*S2, S2, S1);
     }
-    const type& get(size_t n4,size_t n3,size_t n2,size_t n1) const
-      { ALIGNED(arr); return arr[getidx(n4,n3,n2,n1)]; }
-    type& fetch(size_t n4,size_t n3,size_t n2,size_t n1) const
-      { ALIGNED(arr); return arr[getidx(n4,n3,n2,n1)]; }
-    void set(size_t n4,size_t n3,size_t n2,size_t n1, type value)
-      { ALIGNED(arr); arr[getidx(n4,n3,n2,n1)] = value; }
-};
-
-// Versions of array classes which automatically free memory.
-//
-// Note that the nonempty destructor kills performance
-// unless compiling with -fno-exceptions
-
-template <class type>
-struct Array1 : public Arr1<type>
-{
-    ~Array1(){Arr1<type>::free();}
-    Array1(size_t s1) : Arr1<type>(s1) { }
-};
-
-template <class type>
-struct Array2 : public Arr2<type>
-{
-    ~Array2(){Arr2<type>::free();}
-    Array2(size_t s2, size_t s1) : Arr2<type>(s2,s1) { }
-};
-
-template <class type>
-struct Array3 : public Arr3<type>
-{
-    ~Array3(){Arr3<type>::free();}
-    Array3(size_t s3, size_t s2, size_t s1) : Arr3<type>(s3,s2,s1) { }
-};
+  };
+  
+  // array_refN corresponds to multi_array_ref in the boost library.
+  //
+  // array_refN can adopt an array allocated by newArrN.
+  //
+  // The purpose of these classes is to provide more efficient
+  // and more regulated access to array elements.  The idea is to
+  // maintain backward compatibility while allowing us to move
+  // toward a proper array abstraction.
+  //
+  // The user of array_refN is responsible for memory management.
+  // The ArrayN classes are the version of this class
+  // with automatic deallocation.
+  //
+  // Examples:
+  //
+  // Using constructor to create array:
+  // {
+  //   array_ref2<int> arr(16, 16);
+  //   arr[1][2] = 5;
+  //   arr.free();
+  // }
+  // Using array_refN to adopt an array allocated by newArrayN
+  // {
+  //   int** array = newArray2<int>(16,16);
+  //   array_ref2<int> arr(array,16,16); // adopt array
+  //   arr[1][2] = 5;
+  //   assert_eq(arr[1][2],array[1][2]);
+  //   // arr.free(); // should not do both this and next line.
+  //   delArray2<int>(array);
+  // }
+  //
+  // proposed improvements:
+  // - allow shifting of the base:
+  //   - need "double shift" in each class
+  //   - need to implement "arr3.set_bases(b1,b2,b3);"
+  //     which calculates "shift".
+  //   - need "const size_t b1, b2, b3;" for beginning indices
+  //     to allow bounds checking.  Should not incur run-time
+  //     penalty, but if so then condition on CHECK_BOUNDS.
+  // - methods that use parallel arithmetic for omp and vectorized code
+  
+  template <class type>
+  class array_ref1
+  {
+    private: // data
+      const size_t S1;
+      type* const __restrict__ arr;
+    public:
+      ~array_ref1() { }
+      void free() { AlignedFree(arr); }
+      array_ref1(size_t s1) :
+        S1(s1),
+        arr(AlignedAlloc(type, s1))
+      { }
+      array_ref1(type* in,
+        size_t s1) :
+        S1(s1),
+        arr(in)
+      { }
+      inline type& operator[](size_t n1){
+        check_bounds(n1, S1);
+        ALIGNED(arr);
+        return arr[n1];
+      }
+      inline size_t getidx(size_t n1) const
+      {
+        check_bounds(n1, S1);
+        return n1;
+      }
+      const type& get(size_t n1) const
+        { ALIGNED(arr); return arr[getidx(n1)]; }
+      type& fetch(size_t n2,size_t n1) const
+        { ALIGNED(arr); return arr[getidx(n1)]; }
+      void set(size_t n1, type value)
+        { ALIGNED(arr); arr[getidx(n1)] = value; }
+  };
+  
+  template <class type>
+  class array_ref2
+  {
+    private: // data
+      const size_t S2,S1;
+      type* const __restrict__ arr;
+    public:
+      ~array_ref2(){}
+      void free() { AlignedFree(arr); }
+      array_ref2(size_t s2, size_t s1) :
+        S2(s2), S1(s1),
+        arr(AlignedAlloc(type, s2*s1))
+      {
+      }
+      array_ref2(type*const* in,
+        size_t s2, size_t s1) :
+        S2(s2), S1(s1),
+        arr(*in)
+      { }
+      // dereference via calculated index
+      inline ArrayGet1<type> operator[](size_t n2){
+        check_bounds(n2, S2);
+        return ArrayGet1<type>(arr, n2*S1, S1);
+      }
+      inline size_t getidx(size_t n2, size_t n1) const
+      {
+        check_bounds(n2, S2);
+        check_bounds(n1, S1);
+        return n2*S1+n1;
+      }
+      type& fetch(size_t n2, size_t n1) const
+        { ALIGNED(arr); return arr[n1+S1*n2]; }
+      // better to use accessors that distinguish read from write:
+      const type& get(size_t n2,size_t n1) const
+        { ALIGNED(arr); return arr[getidx(n2,n1)]; }
+      void set(size_t n2,size_t n1, type value)
+        { ALIGNED(arr); arr[getidx(n2,n1)] = value; }
+      //inline array_ref1<type>fetch_Arr1(){ return array_ref1<type>(arr, S1*S2); }
+  };
+  
+  template <class type>
+  class const_array_ref3 // : public base_arr<type>
+  {
+      //using base_arr<type>::get_arr;
+      //using base_arr<type>::arr;
+    protected: // data
+      size_t size;
+      const size_t S3,S2,S1;
+      type* const __restrict__ arr;
+      type*const*const*const arr3;
+    public:
+      ~const_array_ref3(){}
+      const_array_ref3(size_t s3, size_t s2, size_t s1) :
+        size(s3*s2*s1), arr(AlignedAlloc(type, size)),
+        //base_arr<type>(s3*s2*s1),
+        S3(s3), S2(s2), S1(s1),
+        arr3(newArray3<type>(arr,s3,s2,s1))
+      { }
+      const_array_ref3(type*const*const* in,
+        size_t s3, size_t s2, size_t s1) :
+        size(s3*s2*s1), arr(**in),
+        //base_arr<type>(**in, s3*s2*s1),
+        S3(s3), S2(s2), S1(s1),
+        arr3(in)
+      { }
+    #if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
+      const const_array_get2<type> operator[](size_t n3)const{
+        check_bounds(n3, S3);
+        return const_array_get2<type>(arr, n3*S2, S2, S1);
+      }
+    #else
+      // this causes operator[] to dereference via chained pointer
+      operator type***(){ return (type***) arr3; }
+    #endif
+      void check_idx_bounds(size_t n3, size_t n2, size_t n1) const
+      {
+        check_bounds(n3, S3);
+        check_bounds(n2, S2);
+        check_bounds(n1, S1);
+      }
+      inline size_t getidx(size_t n3, size_t n2, size_t n1) const
+        { check_idx_bounds(n3,n2,n1); return (n3*S2+n2)*S1+n1; }
+    #ifdef CHAINED_ARRAYS
+      const type& get(size_t n3,size_t n2,size_t n1) const
+        { check_idx_bounds(n3,n2,n1); return arr3[n3][n2][n1]; }
+    protected: // hack: not in const_array_ref3 due to icpc compile error
+      type& fetch(size_t n3,size_t n2,size_t n1) const
+        { check_idx_bounds(n3,n2,n1); return arr3[n3][n2][n1]; }
+      void set(size_t n3,size_t n2,size_t n1, type value)
+        { check_idx_bounds(n3,n2,n1); arr3[n3][n2][n1] = value; }
+    #else
+      const type& get(size_t n3,size_t n2,size_t n1) const
+        { ALIGNED(arr); return arr[getidx(n3,n2,n1)]; }
+    protected: // hack: not in const_array_ref3 due to icpc compile error
+      type& fetch(size_t n3,size_t n2,size_t n1) const
+        { ALIGNED(arr); return arr[getidx(n3,n2,n1)]; }
+      void set(size_t n3,size_t n2,size_t n1, type value)
+        { ALIGNED(arr); arr[getidx(n3,n2,n1)] = value; }
+    #endif
+  };
+  
+  template <class type>
+  class array_ref3 : public const_array_ref3<type> // writable 3D array view: public fetch/set, setall and free on top of the read-only base
+  {
+      //using base_arr<type>::arr;
+      //using base_arr<type>::get_arr;
+      using const_array_ref3<type>::size; // members of a dependent base are not found by unqualified lookup, hence these using-declarations
+      using const_array_ref3<type>::arr;
+      using const_array_ref3<type>::S3;
+      using const_array_ref3<type>::S2;
+      using const_array_ref3<type>::S1;
+      using const_array_ref3<type>::arr3;
+      using const_array_ref3<type>::getidx;
+    public:
+      ~array_ref3(){} // does nothing itself; arr3 is only handed to delArray3 by an explicit free()
+      array_ref3(size_t s3, size_t s2, size_t s1) : // forwards the three extents to the base constructor
+        const_array_ref3<type>(s3,s2,s1)
+      { }
+      array_ref3(type*const*const* in, // forwards a caller-provided chained array and its extents to the base
+        size_t s3, size_t s2, size_t s1) :
+        const_array_ref3<type>(in,s3,s2,s1)
+      { }
+      void free(){ delArray3<type>((type***)arr3); } // passes arr3 to delArray3; the cast drops the const qualifiers
+    #if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
+      inline ArrayGet2<type> operator[](size_t n3){ // first subscript: check n3 against S3, then return an ArrayGet2 built from arr and offset n3*S2
+        check_bounds(n3, S3);
+        return ArrayGet2<type>(arr, n3*S2, S2, S1);
+      }
+    #else
+      // this causes operator[] to dereference via chained pointer
+      operator type***(){ return (type***) arr3; }
+    #endif
+      type& fetch(size_t n3,size_t n2,size_t n1) const // public forwarder to the protected base fetch; const-qualified yet returns a mutable reference
+        { return const_array_ref3<type>::fetch(n3,n2,n1); }
+      void set(size_t n3,size_t n2,size_t n1, type value) // public forwarder to the protected base set
+        { const_array_ref3<type>::set(n3,n2,n1, value); }
+      void setall(type val){ // assigns val to arr[0] .. arr[size-1]
+        for(size_t i=0;i<size;i++) arr[i]=val;
+      }
+      type*** fetch_arr3(){ return (type***) arr3; } // returns arr3 with its const qualifiers cast away
+  };
+  
+  // Read-only view of a 4D array. Inheriting from base_arr<type> causes problems in g++ 4.0 (2005).
+  template <class type>
+  class const_array_ref4 //: public base_arr<type>
+  {
+      //using base_arr<type>::get_arr;
+    protected: // data
+      size_t size; // element count, s4*s3*s2*s1
+      const size_t S4,S3,S2,S1; // extents; S1 is the fastest-varying index in getidx()
+      type* const __restrict__ arr; // flat element buffer
+      type*const*const*const*const arr4; // chained pointers, indexed as arr4[n4][n3][n2][n1]
+    public:
+      ~const_array_ref4(){} // releases nothing
+      const_array_ref4(size_t s4, size_t s3, size_t s2, size_t s1) : // arr comes from AlignedAlloc, arr4 from newArray4(arr, ...)
+        size(s4*s3*s2*s1), arr(AlignedAlloc(type, size)), // members initialize in declaration order, so size is already set when arr reads it
+        //base_arr<type>(s4*s3*s2*s1),
+        S4(s4), S3(s3), S2(s2), S1(s1),
+        arr4(newArray4<type>(arr,s4,s3,s2,s1))
+      { }
+      const_array_ref4(type*const*const*const* in, // wraps the caller-provided chained array `in`
+        size_t s4, size_t s3, size_t s2, size_t s1) :
+        size(s4*s3*s2*s1), arr(***in), // flat buffer pointer is taken from in[0][0][0]
+        //base_arr<type>(***in, s4*s3*s2*s1),
+        S4(s4), S3(s3), S2(s2), S1(s1),
+        arr4(in)
+      { }
+    #if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
+      const const_array_get3<type> operator[](size_t n4)const{ // first subscript: check n4 against S4, then return a const_array_get3 built from arr and offset n4*S3
+        check_bounds(n4, S4);
+        return const_array_get3<type>(arr, n4*S3, S3, S2, S1);
+      }
+    #else
+      // this causes operator[] to dereference via chained pointer
+      operator type****(){ return (type****) arr4; }
+    #endif
+      void check_idx_bounds(size_t n4, size_t n3, size_t n2, size_t n1) const // passes each index with its extent to check_bounds
+      {
+        check_bounds(n4, S4);
+        check_bounds(n3, S3);
+        check_bounds(n2, S2);
+        check_bounds(n1, S1);
+      }
+      inline size_t getidx(size_t n4, size_t n3, size_t n2, size_t n1) const // offset of element (n4,n3,n2,n1) in the flat buffer
+        { check_idx_bounds(n4,n3,n2,n1); return ((n4*S3+n3)*S2+n2)*S1+n1; }
+    #ifdef CHAINED_ARRAYS
+      const type& get(size_t n4,size_t n3,size_t n2,size_t n1) const // CHAINED_ARRAYS selects arr4[][][][] access, as in const_array_ref3
+        { check_idx_bounds(n4,n3,n2,n1); return arr4[n4][n3][n2][n1]; }
+    protected: // hack: not in const_array_ref4 due to icpc compile error
+      type& fetch(size_t n4,size_t n3,size_t n2,size_t n1) const
+        { check_idx_bounds(n4,n3,n2,n1); return arr4[n4][n3][n2][n1]; }
+      void set(size_t n4,size_t n3,size_t n2,size_t n1, type value)
+        { check_idx_bounds(n4,n3,n2,n1); arr4[n4][n3][n2][n1] = value; }
+    #else
+      const type& get(size_t n4,size_t n3,size_t n2,size_t n1) const // default: flat access through getidx()
+        { ALIGNED(arr); return arr[getidx(n4,n3,n2,n1)]; }
+    protected: // hack: not in const_array_ref4 due to icpc compile error
+      type& fetch(size_t n4,size_t n3,size_t n2,size_t n1) const
+        { ALIGNED(arr); return arr[getidx(n4,n3,n2,n1)]; }
+      void set(size_t n4,size_t n3,size_t n2,size_t n1, type value)
+        { ALIGNED(arr); arr[getidx(n4,n3,n2,n1)] = value; }
+    #endif
+  };
+  
+  template <class type>
+  class array_ref4 : public const_array_ref4<type> // writable 4D array view: public fetch/set and free on top of the read-only base
+  {
+      //using base_arr<type>::get_arr;
+      using const_array_ref4<type>::arr; // members of a dependent base are not found by unqualified lookup, hence these using-declarations
+      using const_array_ref4<type>::S4;
+      using const_array_ref4<type>::S3;
+      using const_array_ref4<type>::S2;
+      using const_array_ref4<type>::S1;
+      using const_array_ref4<type>::arr4;
+      using const_array_ref4<type>::getidx;
+    public:
+      ~array_ref4(){} // does nothing itself; arr4 is only handed to delArray4 by an explicit free()
+      array_ref4(size_t s4, size_t s3, size_t s2, size_t s1) : // forwards the four extents to the allocating base constructor
+        const_array_ref4<type>(s4,s3,s2,s1)
+      { }
+      array_ref4(type*const*const*const* in, // forwards a caller-provided chained array and its extents to the wrapping base constructor
+        size_t s4, size_t s3, size_t s2, size_t s1) :
+        const_array_ref4<type>(in,s4,s3,s2,s1)
+      { }
+    #if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
+      inline ArrayGet3<type> operator[](size_t n4){ // first subscript: check n4 against S4, then return an ArrayGet3 built from arr and offset n4*S3
+        check_bounds(n4, S4);
+        return ArrayGet3<type>(arr, n4*S3, S3, S2, S1);
+      }
+    #else
+      operator type****(){ return (type****) arr4; } // conversion to the chained pointer, so [] indexes it directly
+    #endif
+      type& fetch(size_t n4,size_t n3,size_t n2,size_t n1) const // public forwarder to the protected base fetch; const-qualified yet returns a mutable reference
+        { return const_array_ref4<type>::fetch(n4,n3,n2,n1); }
+      void set(size_t n4,size_t n3,size_t n2,size_t n1, type value) // public forwarder to the protected base set
+        { const_array_ref4<type>::set(n4,n3,n2,n1, value); }
+      void free(){ delArray4<type>((type****)arr4); } // passes arr4 to delArray4; the cast drops the const qualifiers
+      type**** fetch_arr4(){ return (type****) arr4; } // returns arr4 with its const qualifiers cast away
+      //bool verify_dims(size_t s4, size_t s3, size_t s2, size_t s1){
+      //  if(s4==S4 && s3==S3 && s2==S2 && s1==S1) return true;
+      //  Wprintf("%d==%d && %d==%d && %d==%d && %d==%d failed",
+      //     s4, S4, s3, S3, s2, S2, s1, S1);
+      //  return false;
+      //}
+  };
+  
+  // Versions of array classes which automatically free memory
+  // (corresponding to multi_array in the boost library).
+  //
+  // Note that the nonempty destructor kills performance
+  // unless compiling with -fno-exceptions
+  
+  template <class type>
+  struct array1 : public array_ref1<type> // array_ref1 whose destructor calls free()
+  {
+      ~array1(){array_ref1<type>::free();} // NOTE(review): no copy constructor is declared here; a copy would presumably call free() on the same storage again -- confirm these are never copied
+      array1(size_t s1) : array_ref1<type>(s1) { } // forwards the extent to the base constructor
+  };
+  
+  template <class type>
+  struct array2 : public array_ref2<type> // array_ref2 whose destructor calls free()
+  {
+      ~array2(){array_ref2<type>::free();} // NOTE(review): no copy constructor is declared here; a copy would presumably call free() on the same storage again -- confirm these are never copied
+      array2(size_t s2, size_t s1) : array_ref2<type>(s2,s1) { } // forwards the extents to the base constructor
+  };
+  
+  template <class type>
+  struct array3 : public array_ref3<type> // array_ref3 whose destructor calls free()
+  {
+      ~array3(){array_ref3<type>::free();} // free() hands arr3 to delArray3. NOTE(review): no copy constructor is declared here, so a copy would presumably free the same arr3 twice -- confirm these are never copied
+      array3(size_t s3, size_t s2, size_t s1) : array_ref3<type>(s3,s2,s1) { } // forwards the extents to the base constructor
+  };
+  
+  template <class type>
+  struct array4 : public array_ref4<type> // array_ref4 whose destructor calls free()
+  {
+      ~array4(){array_ref4<type>::free();} // free() hands arr4 to delArray4. NOTE(review): no copy constructor is declared here, so a copy would presumably free the same arr4 twice -- confirm these are never copied
+      array4(size_t s4, size_t s3, size_t s2, size_t s1) // forwards the extents to the allocating base constructor
+        : array_ref4<type>(s4,s3,s2,s1) { }
+  };
 
-template <class type>
-struct Array4 : public Arr4<type>
-{
-    ~Array4(){Arr4<type>::free();}
-    Array4(size_t s4, size_t s3, size_t s2, size_t s1)
-      : Arr4<type>(s4,s3,s2,s1) { }
-};
+}
 
-// These aliases are defined for the following flexibilization purposes:
-// - to avoid filling the code with template brackets
-//   (i.e., to minimize explicitly template-dependent code).
-// - so that they can be redefined according to the user's
-//   preferred array implementation.
-//
-typedef Arr1<int> intArr1;
-typedef Arr2<int> intArr2;
-typedef Arr3<int> intArr3;
-typedef Arr4<int> intArr4;
-typedef Arr1<double> doubleArr1;
-typedef Arr2<double> doubleArr2;
-typedef Arr3<double> doubleArr3;
-typedef Arr4<double> doubleArr4;
-typedef ArrayRef1<double> doubleArrRef1;
-//
 #define newArr4(type,sz1,sz2,sz3,sz4) newArray4<type>((sz1),(sz2),(sz3),(sz4))
 #define newArr3(type,sz1,sz2,sz3) newArray3<type>((sz1),(sz2),(sz3))
 #define newArr2(type,sz1,sz2) newArray2<type>((sz1),(sz2))
diff --git a/include/Basic.h b/include/Basic.h
index 323af124..f444b216 100644
--- a/include/Basic.h
+++ b/include/Basic.h
@@ -11,6 +11,7 @@ developers: Stefano Markidis, Giovanni Lapenta
 
 #include "MPIdata.h"
 #include "EllipticF.h"
+#include "Alloc.h"
 
 using std::cout;
 using std::endl;
@@ -63,11 +64,11 @@ inline double norm2(double **vect, int nx, int ny) {
   return (result);
 }
 /** method to calculate the square norm of a vector */
-inline double norm2(double ***vect, int nx, int ny) {
+inline double norm2(const arr3_double& vect, int nx, int ny) {
   double result = 0;
   for (int i = 0; i < nx; i++)
     for (int j = 0; j < ny; j++)
-      result += vect[i][j][0] * vect[i][j][0];
+      result += vect.get(i,j,0) * vect.get(i,j,0);
   return (result);
 }
 /** method to calculate the square norm of a vector */
@@ -81,13 +82,13 @@ inline double norm2(double *vect, int nx) {
 
 
 /** method to calculate the parallel dot product */
-inline double norm2P(double ***vect, int nx, int ny, int nz) {
+inline double norm2P(const arr3_double& vect, int nx, int ny, int nz) {
   double result = 0;
   double local_result = 0;
   for (int i = 0; i < nx; i++)
     for (int j = 0; j < ny; j++)
       for (int k = 0; k < nz; k++)
-        local_result += vect[i][j][k] * vect[i][j][k];
+        local_result += vect.get(i,j,k) * vect.get(i,j,k);
 
   MPI_Allreduce(&local_result, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   return (result);
@@ -127,60 +128,56 @@ inline void sum(double *vect1, double *vect2, int n) {
 
 }
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(double ***vect1, double ***vect2, int nx, int ny, int nz) {
+inline void sum(arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] += vect2[i][j][k];
+        vect1.fetch(i,j,k) += vect2.get(i,j,k);
 }
 
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(double ***vect1, double ***vect2, int nx, int ny) {
+inline void sum(arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect1[i][j][0] += vect2[i][j][0];
+      vect1.fetch(i,j,0) += vect2.get(i,j,0);
 }
 
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(double ***vect1, double ****vect2, int nx, int ny, int nz, int ns) {
+inline void sum(arr3_double& vect1, const arr4_double& vect2, int nx, int ny, int nz, int ns) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] += vect2[ns][i][j][k];
+        vect1.fetch(i,j,k) += vect2.get(ns,i,j,k);
 }
 
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(double ***vect1, double ****vect2, int nx, int ny, int ns) {
+inline void sum(arr3_double& vect1, const arr4_double& vect2, int nx, int ny, int ns) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect1[i][j][0] += vect2[ns][i][j][0];
+      vect1.fetch(i,j,0) += vect2.get(ns,i,j,0);
 }
 /** method to calculate the subtraction of two vectors vector1 = vector1 - vector2*/
-inline void sub(double ***vect1, double ***vect2, int nx, int ny, int nz) {
+inline void sub(arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] -= vect2[i][j][k];
-
-
+        vect1.fetch(i,j,k) -= vect2.get(i,j,k);
 }
 
 /** method to calculate the subtraction of two vectors vector1 = vector1 - vector2*/
-inline void sub(double ***vect1, double ***vect2, int nx, int ny) {
+inline void sub(arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect1[i][j][0] -= vect2[i][j][0];
-
-
+      vect1.fetch(i,j,0) -= vect2.get(i,j,0);
 }
 
 
 /** method to sum 4 vectors vector1 = alfa*vector1 + beta*vector2 + gamma*vector3 + delta*vector4 */
-inline void sum4(double ***vect1, double alfa, double ***vect2, double beta, double ***vect3, double gamma, double ***vect4, double delta, double ***vect5, int nx, int ny, int nz) {
+inline void sum4(arr3_double& vect1, double alfa, const arr3_double& vect2, double beta, const arr3_double& vect3, double gamma, const arr3_double& vect4, double delta, const arr3_double& vect5, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] = alfa * (vect2[i][j][k] + beta * vect3[i][j][k] + gamma * vect4[i][j][k] + delta * vect5[i][j][k]);
+        vect1.fetch(i,j,k) = alfa * (vect2.get(i,j,k) + beta * vect3.get(i,j,k) + gamma * vect4.get(i,j,k) + delta * vect5.get(i,j,k));
 
 }
 /** method to calculate the scalar-vector product */
@@ -190,19 +187,19 @@ inline void scale(double *vect, double alfa, int n) {
 }
 
 /** method to calculate the scalar-vector product */
-inline void scale(double ***vect, double alfa, int nx, int ny) {
+inline void scale(arr3_double& vect, double alfa, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect[i][j][0] *= alfa;
+      vect.fetch(i,j,0) *= alfa;
 }
 
 
 /** method to calculate the scalar-vector product */
-inline void scale(double ***vect, double alfa, int nx, int ny, int nz) {
+inline void scale(arr3_double& vect, double alfa, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect[i][j][k] *= alfa;
+        vect.fetch(i,j,k) *= alfa;
 }
 /** method to calculate the scalar product */
 inline void scale(double vect[][2][2], double alfa, int nx, int ny, int nz) {
@@ -212,18 +209,18 @@ inline void scale(double vect[][2][2], double alfa, int nx, int ny, int nz) {
         vect[i][j][k] *= alfa;
 }
 /** method to calculate the scalar-vector product */
-inline void scale(double ***vect1, double ***vect2, double alfa, int nx, int ny, int nz) {
+inline void scale(arr3_double& vect1, const arr3_double& vect2, double alfa, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] = vect2[i][j][k] * alfa;
+        vect1.fetch(i,j,k) = vect2.get(i,j,k) * alfa;
 }
 
 /** method to calculate the scalar-vector product */
-inline void scale(double ***vect1, double ***vect2, double alfa, int nx, int ny) {
+inline void scale(arr3_double& vect1, const arr3_double& vect2, double alfa, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect1[i][j][0] = vect2[i][j][0] * alfa;
+      vect1.fetch(i,j,0) = vect2.get(i,j,0) * alfa;
 }
 
 /** method to calculate the scalar-vector product */
@@ -233,11 +230,11 @@ inline void scale(double *vect1, double *vect2, double alfa, int n) {
 }
 
 /** method to calculate vector1 = vector1 + alfa*vector2   */
-inline void addscale(double alfa, double ***vect1, double ***vect2, int nx, int ny, int nz) {
+inline void addscale(double alfa, arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] = vect1[i][j][k] + alfa * vect2[i][j][k];
+        vect1.fetch(i,j,k) = vect1.get(i,j,k) + alfa * vect2.get(i,j,k);
 }
 /** add scale for weights */
 inline void addscale(double alfa, double vect1[][2][2], double vect2[][2][2], int nx, int ny, int nz) {
@@ -248,10 +245,10 @@ inline void addscale(double alfa, double vect1[][2][2], double vect2[][2][2], in
 
 }
 /** method to calculate vector1 = vector1 + alfa*vector2   */
-inline void addscale(double alfa, double ***vect1, double ***vect2, int nx, int ny) {
+inline void addscale(double alfa, arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect1[i][j][0] += alfa * vect2[i][j][0];
+      vect1.fetch(i,j,0) += alfa * vect2.get(i,j,0);
 }
 /** method to calculate vector1 = vector1 + alfa*vector2   */
 inline void addscale(double alfa, double *vect1, double *vect2, int n) {
@@ -266,90 +263,88 @@ inline void addscale(double alfa, double beta, double *vect1, double *vect2, int
 
 }
 /** method to calculate vector1 = beta*vector1 + alfa*vector2 */
-inline void addscale(double alfa, double beta, double ***vect1, double ***vect2, int nx, int ny, int nz) {
+inline void addscale(double alfa, double beta, arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int nz) {
 
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++) {
-        vect1[i][j][k] = beta * vect1[i][j][k] + alfa * vect2[i][j][k];
+        vect1.fetch(i,j,k) = beta * vect1.get(i,j,k) + alfa * vect2.get(i,j,k);
       }
 
 }
 /** method to calculate vector1 = beta*vector1 + alfa*vector2 */
-inline void addscale(double alfa, double beta, double ***vect1, double ***vect2, int nx, int ny) {
+inline void addscale(double alfa, double beta, arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect1[i][j][0] = beta * vect1[i][j][0] + alfa * vect2[i][j][0];
+      vect1.fetch(i,j,0) = beta * vect1.get(i,j,0) + alfa * vect2.get(i,j,0);
 
 }
 
 
 /** method to calculate vector1 = alfa*vector2 + beta*vector3 */
-inline void scaleandsum(double ***vect1, double alfa, double beta, double ***vect2, double ***vect3, int nx, int ny, int nz) {
+inline void scaleandsum(arr3_double& vect1, double alfa, double beta, const arr3_double& vect2, const arr3_double& vect3, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] = alfa * vect2[i][j][k] + beta * vect3[i][j][k];
+        vect1.fetch(i,j,k) = alfa * vect2.get(i,j,k) + beta * vect3.get(i,j,k);
 }
 /** method to calculate vector1 = alfa*vector2 + beta*vector3 with vector2 depending on species*/
-inline void scaleandsum(double ***vect1, double alfa, double beta, double ****vect2, double ***vect3, int ns, int nx, int ny, int nz) {
+inline void scaleandsum(arr3_double& vect1, double alfa, double beta, const arr4_double& vect2, const arr3_double& vect3, int ns, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] = alfa * vect2[ns][i][j][k] + beta * vect3[i][j][k];
+        vect1.fetch(i,j,k) = alfa * vect2.get(ns,i,j,k) + beta * vect3.get(i,j,k);
 }
 /** method to calculate vector1 = alfa*vector2*vector3 with vector2 depending on species*/
-inline void prod(double ***vect1, double alfa, double ****vect2, int ns, double ***vect3, int nx, int ny, int nz) {
+inline void prod(arr3_double& vect1, double alfa, const arr4_double& vect2, int ns, const arr3_double& vect3, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] = alfa * vect2[ns][i][j][k] * vect3[i][j][k];
+        vect1.fetch(i,j,k) = alfa * vect2.get(ns,i,j,k) * vect3.get(i,j,k);
 
 }
 /** method to calculate vect1 = vect2/alfa */
-inline void div(double ***vect1, double alfa, double ***vect2, int nx, int ny, int nz) {
+inline void div(arr3_double& vect1, double alfa, const arr3_double& vect2, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] = vect2[i][j][k] / alfa;
+        vect1.fetch(i,j,k) = vect2.get(i,j,k) / alfa;
 
 }
-inline void prod6(double ***vect1, double ***vect2, double ***vect3, double ***vect4, double ***vect5, double ***vect6, double ***vect7, int nx, int ny, int nz) {
+inline void prod6(arr3_double& vect1, const arr3_double& vect2, const arr3_double& vect3, const arr3_double& vect4, const arr3_double& vect5, const arr3_double& vect6, const arr3_double& vect7, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] = vect2[i][j][k] * vect3[i][j][k] + vect4[i][j][k] * vect5[i][j][k] + vect6[i][j][k] * vect7[i][j][k];
+        vect1.fetch(i,j,k) = vect2.get(i,j,k) * vect3.get(i,j,k) + vect4.get(i,j,k) * vect5.get(i,j,k) + vect6.get(i,j,k) * vect7.get(i,j,k);
 }
 /** method used for calculating PI */
-inline void proddiv(double ***vect1, double ***vect2, double alfa, double ***vect3, double ***vect4, double ***vect5, double ***vect6, double beta, double ***vect7, double ***vect8, double gamma, double ***vect9, int nx, int ny, int nz) {
+inline void proddiv(arr3_double& vect1, const arr3_double& vect2, double alfa, const arr3_double& vect3, const arr3_double& vect4, const arr3_double& vect5, const arr3_double& vect6, double beta, const arr3_double& vect7, const arr3_double& vect8, double gamma, const arr3_double& vect9, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] = (vect2[i][j][k] + alfa * (vect3[i][j][k] * vect4[i][j][k] - vect5[i][j][k] * vect6[i][j][k]) + beta * vect7[i][j][k] * vect8[i][j][k]) / (1 + gamma * vect9[i][j][k]);
+        vect1.fetch(i,j,k) = (vect2.get(i,j,k) + alfa * (vect3.get(i,j,k) * vect4.get(i,j,k) - vect5.get(i,j,k) * vect6.get(i,j,k)) + beta * vect7.get(i,j,k) * vect8.get(i,j,k)) / (1 + gamma * vect9.get(i,j,k));
 
   // questo mi convince veramente poco!!!!!!!!!!!!!! CAZZO!!!!!!!!!!!!!!!!!!
   // ***vect1++ = (***vect2++ + alfa*((***vect3++)*(***vect4++) - (***vect5++)*(***vect6++)) + beta*(***vect7++)*(***vect8++))/(1+gamma*(***vect9++));
 }
 /** method to calculate the opposite of a vector */
-inline void neg(double ***vect, int nx, int ny, int nz) {
+inline void neg(arr3_double& vect, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect[i][j][k] = -vect[i][j][k];
-
-
+        vect.fetch(i,j,k) = -vect.get(i,j,k);
 }
 
 /** method to calculate the opposite of a vector */
-inline void neg(double ***vect, int nx, int ny) {
+inline void neg(arr3_double& vect, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect[i][j][0] = -vect[i][j][0];
+      vect.fetch(i,j,0) = -vect.get(i,j,0);
 }
 /** method to calculate the opposite of a vector */
-inline void neg(double ***vect, int nx) {
+inline void neg(arr3_double& vect, int nx) {
   for (register int i = 0; i < nx; i++)
-    vect[i][0][0] = -vect[i][0][0];
+    vect.fetch(i,0,0) = -vect.get(i,0,0);
 }
 /** method to calculate the opposite of a vector */
 inline void neg(double *vect, int n) {
@@ -359,34 +354,34 @@ inline void neg(double *vect, int n) {
 
 }
 /** method to set equal two vectors */
-inline void eq(double ***vect1, double ***vect2, int nx, int ny, int nz) {
+inline void eq(arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[i][j][k] = vect2[i][j][k];
+        vect1.fetch(i,j,k) = vect2.get(i,j,k);
 
 }
 /** method to set equal two vectors */
-inline void eq(double ***vect1, double ***vect2, int nx, int ny) {
+inline void eq(arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect1[i][j][0] = vect2[i][j][0];
+      vect1.fetch(i,j,0) = vect2.get(i,j,0);
 
 }
 
 /** method to set equal two vectors */
-inline void eq(double ****vect1, double ***vect2, int nx, int ny, int is) {
+inline void eq(arr4_double& vect1, const arr3_double& vect2, int nx, int ny, int is) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect1[is][i][j][0] = vect2[i][j][0];
+      vect1.fetch(is,i,j,0) = vect2.get(i,j,0);
 
 }
 /** method to set equal two vectors */
-inline void eq(double ****vect1, double ***vect2, int nx, int ny, int nz, int is) {
+inline void eq(arr4_double& vect1, const arr3_double& vect2, int nx, int ny, int nz, int is) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect1[is][i][j][k] = vect2[i][j][k];
+        vect1.fetch(is,i,j,k) = vect2.get(i,j,k);
 
 }
 
@@ -395,11 +390,11 @@ inline void eq(double *vect1, double *vect2, int n) {
     vect1[i] = vect2[i];
 }
 /** method to set a vector to a Value */
-inline void eqValue(double value, double ***vect, int nx, int ny, int nz) {
+inline void eqValue(double value, arr3_double& vect, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
-        vect[i][j][k] = value;
+        vect.fetch(i,j,k) = value;
 
 }
 inline void eqValue(double value, double vect[][2][2], int nx, int ny, int nz) {
@@ -410,16 +405,16 @@ inline void eqValue(double value, double vect[][2][2], int nx, int ny, int nz) {
 
 }
 /** method to set a vector to a Value */
-inline void eqValue(double value, double ***vect, int nx, int ny) {
+inline void eqValue(double value, arr3_double& vect, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
-      vect[i][j][0] = value;
+      vect.fetch(i,j,0) = value;
 
 }
 /** method to set a vector to a Value */
-inline void eqValue(double value, double ***vect, int nx) {
+inline void eqValue(double value, arr3_double& vect, int nx) {
   for (register int i = 0; i < nx; i++)
-    vect[i][0][0] = value;
+    vect.fetch(i,0,0) = value;
 
 }
 /** method to set a vector to a Value */
diff --git a/include/ComNodes3D.h b/include/ComNodes3D.h
index 8e1636e5..7360a02e 100644
--- a/include/ComNodes3D.h
+++ b/include/ComNodes3D.h
@@ -10,6 +10,7 @@ developers           : Stefano Markidis, Giovanni Lapenta
 #ifndef ComNodes3D_H
 #define ComNodes_H
 
+#include "arraysfwd.h"
 #include "ComBasic3D.h"
 //#include "TimeTasks.h"
 
@@ -19,45 +20,45 @@ developers           : Stefano Markidis, Giovanni Lapenta
 #include "BcFields3D.h"
 
 /** communicate ghost cells (FOR NODES) */
-void communicateNode(int nx, int ny, int nz, double ***vector, VirtualTopology3D * vct);
+void communicateNode(int nx, int ny, int nz, arr3_double& vector, VirtualTopology3D * vct);
 
 /** communicate ghost cells (FOR NODES) */
-void communicateNodeBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateNodeBC(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 /** communicate ghost cells (FOR NODES) with particles BC*/
-void communicateNodeBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateNodeBC_P(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 /** SPECIES: communicate ghost cells */
-void communicateNode(int nx, int ny, int nz, double ****vector, int ns, VirtualTopology3D * vct);
+void communicateNode(int nx, int ny, int nz, arr4_double& vector, int ns, VirtualTopology3D * vct);
 
 // PARTICLES
 /** SPECIES: communicate ghost cells */
-void communicateNode_P(int nx, int ny, int nz, double ****vector, int ns, VirtualTopology3D * vct);
+void communicateNode_P(int nx, int ny, int nz, arr4_double& vector, int ns, VirtualTopology3D * vct);
 
 // 
 /** communicate ghost cells (FOR CENTERS) */
-void communicateCenter(int nx, int ny, int nz, double ***vector, VirtualTopology3D * vct);
+void communicateCenter(int nx, int ny, int nz, arr3_double& vector, VirtualTopology3D * vct);
 
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
-void communicateCenterBoxStencilBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 // particles
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
-void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 // 
 
-void communicateNodeBoxStencilBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
-void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 /** SPECIES: communicate ghost cells */
-void communicateCenter(int nx, int ny, int nz, double ****vector, int ns, VirtualTopology3D * vct);
+void communicateCenter(int nx, int ny, int nz, arr4_double& vector, int ns, VirtualTopology3D * vct);
 
 // /////////// communication + BC ////////////////////////////
-void communicateCenterBC(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateCenterBC(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 // /////////// communication + BC ////////////////////////////
-void communicateCenterBC_P(int nx, int ny, int nz, double ***vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateCenterBC_P(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 #endif
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 3d09049c..9a086d1f 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -91,9 +91,11 @@ class EMfields3D                // :public Field
     void fixBforcefree(Grid * grid, VirtualTopology3D * vct);
 
     /*! Calculate the three components of Pi(implicit pressure) cross image vector */
-    void PIdot(double ***PIdotX, double ***PIdotY, double ***PIdotZ, double ***vectX, double ***vectY, double ***vectZ, int ns, Grid * grid);
+    void PIdot(arr3_double& PIdotX, arr3_double& PIdotY, arr3_double& PIdotZ,
+      const_arr3_double& vectX, const_arr3_double& vectY, const_arr3_double& vectZ, int ns, Grid * grid);
     /*! Calculate the three components of mu (implicit permeattivity) cross image vector */
-    void MUdot(double ***MUdotX, double ***MUdotY, double ***MUdotZ, double ***vectX, double ***vectY, double ***vectZ, Grid * grid);
+    void MUdot(arr3_double& MUdotX, arr3_double& MUdotY, arr3_double& MUdotZ,
+      const_arr3_double& vectX, const_arr3_double& vectY, const_arr3_double& vectZ, Grid * grid);
     /*! Calculate rho hat, Jx hat, Jy hat, Jz hat */
     void calculateHatFunctions(Grid * grid, VirtualTopology3D * vct);
 
@@ -107,9 +109,9 @@ class EMfields3D                // :public Field
     /*! Sum current over different species */
     void sumOverSpeciesJ();
     /*! Smoothing after the interpolation* */
-    void smooth(double value, double ***vector, int type, Grid * grid, VirtualTopology3D * vct);
+    void smooth(double value, arr3_double& vector, int type, Grid * grid, VirtualTopology3D * vct);
     /*! SPECIES: Smoothing after the interpolation for species fields* */
-    void smooth(double value, double ****vector, int is, int type, Grid * grid, VirtualTopology3D * vct);
+    void smooth(double value, arr4_double& vector, int is, int type, Grid * grid, VirtualTopology3D * vct);
     /*! smooth the electric field */
     void smoothE(double value, VirtualTopology3D * vct, Collective *col);
 
@@ -145,13 +147,20 @@ class EMfields3D                // :public Field
 
 
     /*! Perfect conductor boundary conditions LEFT wall */
-    void perfectConductorLeft(double ***imageX, double ***imageY, double ***imageZ, double ***vectorX, double ***vectorY, double ***vectorZ, int dir, Grid * grid);
+    void perfectConductorLeft(arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
+      const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ,
+      int dir, Grid * grid);
     /*! Perfect conductor boundary conditions RIGHT wall */
-    void perfectConductorRight(double ***imageX, double ***imageY, double ***imageZ, double ***vectorX, double ***vectorY, double ***vectorZ, int dir, Grid * grid);
+    void perfectConductorRight(
+      arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
+      const_arr3_double& vectorX,
+      const_arr3_double& vectorY,
+      const_arr3_double& vectorZ,
+      int dir, Grid * grid);
     /*! Perfect conductor boundary conditions for source LEFT wall */
-    void perfectConductorLeftS(double ***vectorX, double ***vectorY, double ***vectorZ, int dir);
+    void perfectConductorLeftS(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ, int dir);
     /*! Perfect conductor boundary conditions for source RIGHT wall */
-    void perfectConductorRightS(double ***vectorX, double ***vectorY, double ***vectorZ, int dir);
+    void perfectConductorRightS(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ, int dir);
 
     /*! Calculate the sysceptibility tensor on the boundary */
     void sustensorRightX(double **susxx, double **susyx, double **suszx);
@@ -161,129 +170,97 @@ class EMfields3D                // :public Field
     void sustensorRightZ(double **susxz, double **susyz, double **suszz);
     void sustensorLeftZ (double **susxz, double **susyz, double **suszz);
 
+    /*** accessor methods ***/
+
     /*! get Potential array */
-    double ***getPHI();
-    /*! get Electric Field component X defined on node(indexX,indexY,indexZ) */
-    double &getEx(int indexX, int indexY, int indexZ) const;
-    /*! get Electric field X component array */
-    double ***getEx();
-    /*! get Electric field X component cell array without the ghost cells */
-    double ***getExc(Grid3DCU *grid);
-    /*! get Electric Field component Y defined on node(indexX,indexY,indexZ) */
-    double &getEy(int indexX, int indexY, int indexZ) const;
-    /*! get Electric field Y component array */
-    double ***getEy();
-    /*! get Electric field Y component cell array without the ghost cells */
-    double ***getEyc(Grid3DCU *grid);
-    /*! get Electric Field component Z defined on node(indexX,indexY,indexZ) */
-    double &getEz(int indexX, int indexY, int indexZ) const;
-    /*! get Electric field Z component array */
-    double ***getEz();
-    /*! get Electric field Z component cell array without the ghost cells */
-    double ***getEzc(Grid3DCU *grid);
-    /*! get Magnetic Field component X defined on node(indexX,indexY,indexZ) */
-    double &getBx(int indexX, int indexY, int indexZ) const;
-    /*! get Magnetic field X component array */
-    double ***getBx();
-    /*! get Magnetic field X component cell array without the ghost cells */
-    double ***getBxc();
-    /*! get Magnetic Field component Y defined on node(indexX,indexY,indexZ) */
-    double &getBy(int indexX, int indexY, int indexZ) const;
-    /*! get Magnetic field Y component array */
-    double ***getBy();
-    /*! get Magnetic field Y component cell array without the ghost cells */
-    double ***getByc();
-    /*! get Magnetic Field component Z defined on node(indexX,indexY,indexZ) */
-    double &getBz(int indexX, int indexY, int indexZ) const;
-    /*! get Magnetic field Z component array */
-    double ***getBz();
-    /*! get Magnetic field Z component cell array without the ghost cells */
-    double ***getBzc();
-    /*! get density on cell(indexX,indexY,indexZ) */
-    double &getRHOc(int indexX, int indexY, int indexZ) const;
-    /*! get density array on center cell */
-    double ***getRHOc();
-    /*! get density on nodes(indexX,indexY,indexZ) */
-    double &getRHOn(int indexX, int indexY, int indexZ) const;
-    /*! get density array on nodes */
-    double ***getRHOn();
-    /*! SPECIES: get density on nodes(indexX,indexY,indexZ) */
-    double &getRHOns(int indexX, int indexY, int indexZ, int is) const;
-    /*! SPECIES: get density on center cell(indexX,indexY,indexZ) */
-    double &getRHOcs(int indexX, int indexY, int indexZ, int is) const;
-    /*! SPECIES: get density array on nodes */
-    double ****getRHOns();
-    /*! SPECIES: get density array on cells without the ghost cells */
-    double ***getRHOcs(Grid3DCU *grid, int is);
-
-    /** get Magnetic Field component X defined on node(indexX,indexY,indexZ) */
-    double &getBx_ext(int indexX, int indexY, int indexZ) const;
-    /** get Magnetic Field component Y defined on node(indexX,indexY,indexZ) */
-    double &getBy_ext(int indexX, int indexY, int indexZ) const;
-    /** get Magnetic Field component Z defined on node(indexX,indexY,indexZ) */
-    double &getBz_ext(int indexX, int indexY, int indexZ) const;
-
-    /** get Magnetic Field component X */
-    double ***getBx_ext();
-    /** get Magnetic Field component Y */
-    double ***getBy_ext();
-    /** get Magnetic Field component Z */
-    double ***getBz_ext();
-
-    /*! get pressure tensor XX for species */
-    double ****getpXXsn();
-    /*! get pressure tensor XY for species */
-    double ****getpXYsn();
-    /*! get pressure tensor XZ for species */
-    double ****getpXZsn();
-    /*! get pressure tensor YY for species */
-    double ****getpYYsn();
-    /*! get pressure tensor YZ for species */
-    double ****getpYZsn();
-    /*! get pressure tensor ZZ for species */
-    double ****getpZZsn();
-
-    /*! get Jx(X,Y,Z) */
-    double &getJx(int indexX, int indexY, int indexZ) const;
-    /*! get current -Direction X */
-    double ***getJx();
-    /*! get Jxs(X,Y,Z,is) */
-    double &getJxs(int indexX, int indexY, int indexZ, int is) const;
-    /*! SPECIES: get current -Direction X */
-    double ****getJxs();
-    /*! SPECIES: get current X component for species is in all cells except ghost */
-    double ***getJxsc(Grid3DCU *grid, int is);
-    /*! get Jy(X,Y,Z) */
-    double &getJy(int indexX, int indexY, int indexZ) const;
-    /*! get current -Direction Y */
-    double ***getJy();
-    /*! get Jys(X,Y,Z,is) */
-    double &getJys(int indexX, int indexY, int indexZ, int is) const;
-    /*! SPECIES: get current -Direction Y */
-    double ****getJys();
-    /*! SPECIES: get current Y component for species is in all cells except ghost */
-    double ***getJysc(Grid3DCU *grid, int is);
-    /*! get Jz(X,Y,Z) */
-    double &getJz(int indexX, int indexY, int indexZ) const;
-    /*! get current -Direction Z */
-    double ***getJz();
-    /*! get Jzs(X,Y,Z,is) */
-    double &getJzs(int indexX, int indexY, int indexZ, int is) const;
-    /*! SPECIES: get current -Direction Z */
-    double ****getJzs();
-    /*! SPECIES: get current Z component for species is in all cells except ghost */
-    double ***getJzsc(Grid3DCU *grid, int is);
+    arr3_double getPHI() {return PHI;}
+
+    // field components defined on nodes
+    //
+    double getEx(int X, int Y, int Z) const { return Ex.get(X,Y,Z);}
+    double getEy(int X, int Y, int Z) const { return Ey.get(X,Y,Z);}
+    double getEz(int X, int Y, int Z) const { return Ez.get(X,Y,Z);}
+    double getBx(int X, int Y, int Z) const { return Bxn.get(X,Y,Z);}
+    double getBy(int X, int Y, int Z) const { return Byn.get(X,Y,Z);}
+    double getBz(int X, int Y, int Z) const { return Bzn.get(X,Y,Z);}
+    //
+    arr3_double getEx() { return Ex; }
+    arr3_double getEy() { return Ey; }
+    arr3_double getEz() { return Ez; }
+    arr3_double getBx() { return Bxn; }
+    arr3_double getBy() { return Byn; }
+    arr3_double getBz() { return Bzn; }
+
+    // field components without ghost cells
+    //
+    void getExc(arr3_double& arr, Grid3DCU *grid);
+    void getEyc(arr3_double& arr, Grid3DCU *grid);
+    void getEzc(arr3_double& arr, Grid3DCU *grid);
+    void getBxc(arr3_double& arr);
+    void getByc(arr3_double& arr);
+    void getBzc(arr3_double& arr);
+
+    arr3_double getRHOc() { return rhoc; }
+    arr3_double getRHOn() { return rhon; }
+    double getRHOc(int X, int Y, int Z) const { return rhoc.get(X,Y,Z);}
+    double getRHOn(int X, int Y, int Z) const { return rhon.get(X,Y,Z);}
+
+    // densities per species:
+    //
+    double getRHOcs(int X,int Y,int Z,int is)const{return rhocs.get(is,X,Y,Z);}
+    double getRHOns(int X,int Y,int Z,int is)const{return rhons.get(is,X,Y,Z);}
+    arr4_double getRHOns(){return rhons;}
+    /* density on cells without ghost cells */
+    void getRHOcs(arr3_double& arr, Grid3DCU *grid, int is);
+
+    double getBx_ext(int X, int Y, int Z) const{return Bx_ext.get(X,Y,Z);}
+    double getBy_ext(int X, int Y, int Z) const{return By_ext.get(X,Y,Z);}
+    double getBz_ext(int X, int Y, int Z) const{return Bz_ext.get(X,Y,Z);}
+    
+    arr3_double getBx_ext() { return Bx_ext; }
+    arr3_double getBy_ext() { return By_ext; }
+    arr3_double getBz_ext() { return Bz_ext; }
+
+    arr4_double getpXXsn() { return pXXsn; }
+    arr4_double getpXYsn() { return pXYsn; }
+    arr4_double getpXZsn() { return pXZsn; }
+    arr4_double getpYYsn() { return pYYsn; }
+    arr4_double getpYZsn() { return pYZsn; }
+    arr4_double getpZZsn() { return pZZsn; }
+
+    double getJx(int X, int Y, int Z) const { return Jx.get(X,Y,Z);}
+    double getJy(int X, int Y, int Z) const { return Jy.get(X,Y,Z);}
+    double getJz(int X, int Y, int Z) const { return Jz.get(X,Y,Z);}
+    arr3_double getJx() { return Jx; }
+    arr3_double getJy() { return Jy; }
+    arr3_double getJz() { return Jz; }
+    arr4_double getJxs() { return Jxs; }
+    arr4_double getJys() { return Jys; }
+    arr4_double getJzs() { return Jzs; }
+
+    double getJxs(int X,int Y,int Z,int is)const{return Jxs.get(is,X,Y,Z);}
+    double getJys(int X,int Y,int Z,int is)const{return Jys.get(is,X,Y,Z);}
+    double getJzs(int X,int Y,int Z,int is)const{return Jzs.get(is,X,Y,Z);}
+
+    /*** accessors that require computing ***/
+
+    // get current for species in all cells except ghost
+    //
+    void getJxsc(arr3_double& arr, Grid3DCU *grid, int is);
+    void getJysc(arr3_double& arr, Grid3DCU *grid, int is);
+    void getJzsc(arr3_double& arr, Grid3DCU *grid, int is);
+
     /*! get the electric field energy */
     double getEenergy();
     /*! get the magnetic field energy */
     double getBenergy();
 
-  /*! fetch array for summing moments of thread i */
-  Moments& fetch_momentsArray(int i){
-    assert_le(0,i);
-    assert_le(i,sizeMomentsArray);
-    return *momentsArray[i];
-  }
+    /*! fetch array for summing moments of thread i (valid i: 0 <= i < sizeMomentsArray) */
+    Moments& fetch_momentsArray(int i){
+      assert_le(0,i);
+      assert_lt(i,sizeMomentsArray); // strict bound: i == sizeMomentsArray would index past the end
+      return *momentsArray[i];
+    }
 
     /*! print electromagnetic fields info */
     void print(void) const;
@@ -350,61 +327,61 @@ class EMfields3D                // :public Field
     double L_square;
 
     /*! PHI: electric potential (indexX, indexY, indexZ), defined on central points between nodes */
-    double ***PHI;
-    /*! Ex: electric field X-component (indexX, indexY, indexZ), defined on nodes */
-    double ***Ex;
-    /*! Exth: implicit electric field X-component (indexX, indexY, indexZ), defined on nodes */
-    double ***Exth;
-    /*! Ey: electric field Y-component (indexX, indexY, indexZ), defined on nodes */
-    double ***Ey;
-    /*! Eyth: implicit electric field Y-component (indexX, indexY, indexZ), defined on nodes */
-    double ***Eyth;
-    /*! Ez: electric field Z-component (indexX, indexY, indexZ, #species), defined on nodes */
-    double ***Ez;
-    /*! Ezth: implicit electric field Z-component (indexX, indexY, indexZ), defined on nodes */
-    double ***Ezth;
-    /*! Bxc: magnetic field X-component (indexX, indexY, indexZ), defined on central points between nodes */
-    double ***Bxc;
-    /*! Byc: magnetic field Y-component (indexX, indexY, indexZ), defined on central points between nodes */
-    double ***Byc;
-    /*! Bzc: magnetic field Z-component (indexX, indexY, indexZ), defined on central points between nodes */
-    double ***Bzc;
-    /*! Bxn: magnetic field X-component (indexX, indexY, indexZ), defined on nodes */
-    double ***Bxn;
-    /*! Byn: magnetic field Y-component (indexX, indexY, indexZ), defined on nodes */
-    double ***Byn;
-    /*! Bzn: magnetic field Z-component (indexX, indexY, indexZ), defined on nodes */
-    double ***Bzn;
+    array3_double PHI;
+
+    // Electric field components defined on nodes
+    //
+    array3_double Ex;
+    array3_double Ey;
+    array3_double Ez;
+
+    // implicit electric field components defined on nodes
+    //
+    array3_double Exth;
+    array3_double Eyth;
+    array3_double Ezth;
+
+    // magnetic field components defined on central points between nodes
+    //
+    array3_double Bxc;
+    array3_double Byc;
+    array3_double Bzc;
+
+    // magnetic field components defined on nodes
+    //
+    array3_double Bxn;
+    array3_double Byn;
+    array3_double Bzn;
 
     // *************************************
     // TEMPORARY ARRAY
     // ************************************
     /*!some temporary arrays (for calculate hat functions) */
-    double ***tempXC;
-    double ***tempYC;
-    double ***tempZC;
-    double ***tempXN;
-    double ***tempYN;
-    double ***tempZN;
+    array3_double tempXC;
+    array3_double tempYC;
+    array3_double tempZC;
+    array3_double tempXN;
+    array3_double tempYN;
+    array3_double tempZN;
     /*! other temporary arrays (in MaxwellSource) */
-    double ***tempC;
-    double ***tempX;
-    double ***tempY;
-    double ***tempZ;
-    double ***temp2X;
-    double ***temp2Y;
-    double ***temp2Z;
+    array3_double tempC;
+    array3_double tempX;
+    array3_double tempY;
+    array3_double tempZ;
+    array3_double temp2X;
+    array3_double temp2Y;
+    array3_double temp2Z;
     /*! and some for MaxwellImage */
-    double ***imageX;
-    double ***imageY;
-    double ***imageZ;
-    double ***Dx;
-    double ***Dy;
-    double ***Dz;
-    double ***vectX;
-    double ***vectY;
-    double ***vectZ;
-    double ***divC;
+    array3_double imageX;
+    array3_double imageY;
+    array3_double imageZ;
+    array3_double Dx;
+    array3_double Dy;
+    array3_double Dz;
+    array3_double vectX;
+    array3_double vectY;
+    array3_double vectZ;
+    array3_double divC;
     /* temporary arrays for summing moments */
     int sizeMomentsArray;
     Moments **momentsArray;
@@ -415,87 +392,78 @@ class EMfields3D                // :public Field
     // *******************************************************************************
 
     /*! Charge density, defined on central points of the cell */
-    double ***rhoc;
+    array3_double rhoc;
     /*! Charge density, defined on nodes */
-    double ***rhon;
+    array3_double rhon;
     /*! Implicit charge density, defined on central points of the cell */
-    double ***rhoh;
+    array3_double rhoh;
     /*! SPECIES: charge density for each species, defined on nodes */
-    double ****rhons;
+    array4_double rhons;
     /*! SPECIES: charge density for each species, defined on central points of the cell */
-    double ****rhocs;
-    /*! Current density component-X, defined on nodes */
-    double ***Jx;
-    /*! Current density component-Y, defined on nodes */
-    double ***Jy;
-    /*! Current density component-Z, defined on nodes */
-    double ***Jz;
-    /*! Implicit current density X-component, defined on nodes */
-    double ***Jxh;
-    /*! Implicit current density Y-component, defined on nodes */
-    double ***Jyh;
-    /*! Implicit current density Z-component, defined on nodes */
-    double ***Jzh;
-    /*! SPECIES: current density component-X for species, defined on nodes */
-    double ****Jxs;
-    /*! SPECIES: current density component-Y for species, defined on nodes */
-    double ****Jys;
-    /*! SPECIES: current density component-Z for species, defined on nodes */
-    double ****Jzs;
-    /*! External magnetic field component-X, defined on nodes */
-    double***  Bx_ext;
-    /*! External magnetic field component-Y, defined on nodes */
-    double***  By_ext;
-    /*! External magnetic field component-Z, defined on nodes */
-    double***  Bz_ext;
-    /*! External current field component-X, defined on nodes */
-    double***  Jx_ext;
-    /*! External current field component-Y, defined on nodes */
-    double***  Jy_ext;
-    /*! External current field component-Z, defined on nodes */
-    double***  Jz_ext;
-
-    /*! SPECIES: pressure tensor component-XX, defined on nodes */
-    double ****pXXsn;
-    /*! SPECIES: pressure tensor component-XY, defined on nodes */
-    double ****pXYsn;
-    /*! SPECIES: pressure tensor component-XZ, defined on nodes */
-    double ****pXZsn;
-    /*! SPECIES: pressure tensor component-XZ, defined on nodes */
-    double ****pYYsn;
-    /*! SPECIES: pressure tensor component-YZ, defined on nodes */
-    double ****pYZsn;
-    /*! SPECIES: pressure tensor component-ZZ, defined on nodes */
-    double ****pZZsn;
-
-
-    /*! Field Boundary Condition 0 = Dirichlet Boundary Condition: specifies the value to take on the boundary of the domain 1 = Neumann Boundary Condition: specifies the value of derivative to take on the boundary of the domain 2 = Periodic condition */
-
-    /*! Boundary Condition Electrostatic Potential: FaceXright */
+    array4_double rhocs;
+
+    // current density defined on nodes
+    //
+    array3_double Jx;
+    array3_double Jy;
+    array3_double Jz;
+
+    // implicit current density defined on nodes
+    //
+    array3_double Jxh;
+    array3_double Jyh;
+    array3_double Jzh;
+
+    // species-specific current densities defined on nodes
+    //
+    array4_double Jxs;
+    array4_double Jys;
+    array4_double Jzs;
+
+    // external magnetic field components defined on nodes
+    //
+    array3_double   Bx_ext;
+    array3_double   By_ext;
+    array3_double   Bz_ext;
+
+    // external current, defined on nodes
+    array3_double   Jx_ext;
+    array3_double   Jy_ext;
+    array3_double   Jz_ext;
+
+    // pressure tensor components, defined on nodes
+    array4_double pXXsn;
+    array4_double pXYsn;
+    array4_double pXZsn;
+    array4_double pYYsn;
+    array4_double pYZsn;
+    array4_double pZZsn;
+
+    /*! Field Boundary Condition
+      0 = Dirichlet Boundary Condition: specifies the
+          value on the boundary of the domain
+      1 = Neumann Boundary Condition: specifies the value of
+          derivative on the boundary of the domain
+      2 = Periodic boundary condition */
+
+    // boundary conditions for electrostatic potential
+    //
     int bcPHIfaceXright;
-    /*! Boundary Condition Electrostatic Potential:FaceXleft */
     int bcPHIfaceXleft;
-    /*! Boundary Condition Electrostatic Potential:FaceYright */
     int bcPHIfaceYright;
-    /*! Boundary Condition Electrostatic Potential:FaceYleft */
     int bcPHIfaceYleft;
-    /*! Boundary Condition Electrostatic Potential:FaceZright */
     int bcPHIfaceZright;
-    /*! Boundary Condition Electrostatic Potential:FaceZleft */
     int bcPHIfaceZleft;
 
     /*! Boundary condition for electric field 0 = perfect conductor 1 = magnetic mirror */
-    /*! Boundary Condition EM Field: FaceXright */
+    //
+    // boundary conditions for EM field
+    //
     int bcEMfaceXright;
-    /*! Boundary Condition EM Field: FaceXleft */
     int bcEMfaceXleft;
-    /*! Boundary Condition EM Field: FaceYright */
     int bcEMfaceYright;
-    /*! Boundary Condition EM Field: FaceYleft */
     int bcEMfaceYleft;
-    /*! Boundary Condition EM Field: FaceZright */
     int bcEMfaceZright;
-    /*! Boundary Condition EM Field: FaceZleft */
     int bcEMfaceZleft;
 
 
@@ -529,10 +497,13 @@ class EMfields3D                // :public Field
     injInfoFields* get_InfoFieldsRear();
     injInfoFields* get_InfoFieldsRight();
 
-    void BoundaryConditionsB(double ***vectorX, double ***vectorY, double ***vectorZ,int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct);
-    void BoundaryConditionsE(double ***vectorX, double ***vectorY, double ***vectorZ,int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct);
-    void BoundaryConditionsEImage(double ***imageX, double ***imageY, double ***imageZ,double ***vectorX, double ***vectorY, double ***vectorZ,int nx, int ny, int nz, VirtualTopology3D *vct,Grid *grid);
-
+    void BoundaryConditionsB(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ,
+      int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct);
+    void BoundaryConditionsE(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ,
+      int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct);
+    void BoundaryConditionsEImage(arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
+      const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ,
+      int nx, int ny, int nz, VirtualTopology3D *vct,Grid *grid);
 };
 
 inline void EMfields3D::addRho(double weight[][2][2], int X, int Y, int Z, int is) {
diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index 9d2e2f01..c8cd883d 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -42,44 +42,76 @@ class Grid3DCU                  // :public Grid
   /** print grid info */
   void print(VirtualTopology3D * ptVCT);
   /** calculate a derivative along a direction on nodes */
-  void derivN(double ***derN, double ****scFieldC, int ns, int dir);
+  void derivN(arr3_double& derN,
+    const_arr4_double& scFieldC, int ns, int dir);
   /** calculate gradient on nodes, given a scalar field defined on central points  */
-  void gradC2N(double ***gradXN, double ***gradYN, double ***gradZN, double ***scFieldC);
+  void gradC2N(arr3_double& gradXN, arr3_double& gradYN, arr3_double& gradZN,
+    const_arr3_double& scFieldC);
   /** calculate gradient on nodes, given a scalar field defined on central points  */
-  void gradN2C(double ***gradXC, double ***gradYC, double ***gradZC, double ***scFieldN);
+  void gradN2C(arr3_double& gradXC, arr3_double& gradYC, arr3_double& gradZC,
+    const_arr3_double& scFieldN);
   /** calculate divergence on central points, given a vector field defined on nodes  */
-  void divN2C(double ***divC, double ***vecFieldXN, double ***vecFieldYN, double ***vecFieldZN);
+  void divN2C(arr3_double& divC,
+    const_arr3_double& vecFieldXN,
+    const_arr3_double& vecFieldYN,
+    const_arr3_double& vecFieldZN);
   /** calculate divergence on nodes, given a vector field defined on central points  */
-  void divC2N(double ***divN, double ***vecFieldXC, double ***vecFieldYC, double ***vecFieldZC);
+  void divC2N(arr3_double& divN,
+    const_arr3_double& vecFieldXC,
+    const_arr3_double& vecFieldYC,
+    const_arr3_double& vecFieldZC);
   /** calculate curl on nodes, given a vector field defined on central points  */
-  void curlC2N(double ***curlXN, double ***curlYN, double ***curlZN, double ***vecFieldXC, double ***vecFieldYC, double ***vecFieldZC);
+  void curlC2N(arr3_double& curlXN, arr3_double& curlYN,
+    arr3_double& curlZN,
+    const_arr3_double& vecFieldXC,
+    const_arr3_double& vecFieldYC,
+    const_arr3_double& vecFieldZC);
   /** calculate curl on central points, given a vector field defined on nodes  */
-  void curlN2C(double ***curlXC, double ***curlYC, double ***curlZC, double ***vecFieldXN, double ***vecFieldYN, double ***vecFieldZN);
+  void curlN2C(arr3_double& curlXC, arr3_double& curlYC, arr3_double& curlZC,
+    const_arr3_double& vecFieldXN,
+    const_arr3_double& vecFieldYN,
+    const_arr3_double& vecFieldZN);
 
   /** calculate divergence on central points, given a Tensor field defined on nodes  */
-  void divSymmTensorN2C(double ***divCX, double ***divCY, double ***divCZ, double ****pXX, double ****pXY, double ****pXZ, double ****pYY, double ****pYZ, double ****pZZ, int ns);
+  void divSymmTensorN2C(arr3_double& divCX, arr3_double& divCY, arr3_double& divCZ,
+    const_arr4_double& pXX,
+    const_arr4_double& pXY,
+    const_arr4_double& pXZ,
+    const_arr4_double& pYY,
+    const_arr4_double& pYZ,
+    const_arr4_double& pZZ, int ns);
 
   /** calculate laplacian on nodes, given a scalar field defined on nodes */
-  void lapN2N(double ***lapN, double ***scFieldN, VirtualTopology3D * vct);
+  void lapN2N(arr3_double& lapN,
+    const_arr3_double& scFieldN, VirtualTopology3D * vct);
   /** calculate laplacian on central points, given a scalar field defined on central points for Poisson */
-  void lapC2Cpoisson(double ***lapC, double ***scFieldC, VirtualTopology3D * vct);
+  void lapC2Cpoisson(arr3_double& lapC,
+    arr3_double& scFieldC, VirtualTopology3D * vct);
   /** calculate laplacian on central points, given a scalar field defined on central points */
-  void lapC2C(double ***lapC, double ***scFieldC, VirtualTopology3D * vct);
+  void lapC2C(arr3_double& lapC,
+    const_arr3_double& scFieldC, VirtualTopology3D * vct);
 
   /** calculate divergence on boundaries */
-  void divBCleft(double ***divBC, double ***vectorX, double ***vectorY, double ***vectorZ, int leftActiveNode, int dirDER);
+  void divBCleft(arr3_double& divBC,
+    const_arr3_double& vectorX,
+    const_arr3_double& vectorY,
+    const_arr3_double& vectorZ, int leftActiveNode, int dirDER);
   /** calculate divergence on boundaries */
-  void divBCright(double ***divBC, double ***vectorX, double ***vectorY, double ***vectorZ, int rightActiveNode, int dirDER);
+  void divBCright(arr3_double& divBC,
+    const_arr3_double& vectorX,
+    const_arr3_double& vectorY,
+    const_arr3_double& vectorZ, int rightActiveNode, int dirDER);
   /** calculate derivative on boundaries */
-  void derBC(double ***derBC, double ***vector, int leftActiveNode, int dirDER);
+  void derBC(arr3_double& derBC,
+    const_arr3_double& vector, int leftActiveNode, int dirDER);
 
 
   /** interpolate on nodes from central points */
-  void interpC2N(double ***vecFieldN, double ***vecFieldC);
+  void interpC2N(arr3_double& vecFieldN, const_arr3_double& vecFieldC);
   /** interpolate on central points from nodes */
-  void interpN2C(double ***vecFieldC, double ***vecFieldN);
+  void interpN2C(arr3_double& vecFieldC, const_arr3_double& vecFieldN);
   /** interpolate on central points from nodes */
-  void interpN2C(double ****vecFieldC, int ns, double ****vecFieldN);
+  void interpN2C(arr4_double& vecFieldC, int ns, const_arr4_double& vecFieldN);
 
   // /////////// PRIVATE VARIABLES //////////////
 private:
diff --git a/include/Moments.h b/include/Moments.h
index 53fe942f..981ee15b 100644
--- a/include/Moments.h
+++ b/include/Moments.h
@@ -6,20 +6,20 @@
 // 
 class Moments {
   private:
-    doubleArr3 rho;
+    arr3_double rho;
 
     /** current density, defined on nodes */
-    doubleArr3 Jx;
-    doubleArr3 Jy;
-    doubleArr3 Jz;
+    arr3_double Jx;
+    arr3_double Jy;
+    arr3_double Jz;
 
     /** pressure tensor components, defined on nodes */
-    doubleArr3 pXX;
-    doubleArr3 pXY;
-    doubleArr3 pXZ;
-    doubleArr3 pYY;
-    doubleArr3 pYZ;
-    doubleArr3 pZZ;
+    arr3_double pXX;
+    arr3_double pXY;
+    arr3_double pXZ;
+    arr3_double pYY;
+    arr3_double pYZ;
+    arr3_double pZZ;
     int nx;
     int ny;
     int nz;
@@ -39,16 +39,16 @@ class Moments {
     double get_pYZ(int i, int j, int k) const { return pYZ.get(i,j,k); }
     double get_pZZ(int i, int j, int k) const { return pZZ.get(i,j,k); }
     // fetch accessors (write access)
-    doubleArr3& fetch_rho() { return rho; }
-    doubleArr3& fetch_Jx () { return Jx ; }
-    doubleArr3& fetch_Jy () { return Jy ; }
-    doubleArr3& fetch_Jz () { return Jz ; }
-    doubleArr3& fetch_Pxx() { return pXX; }
-    doubleArr3& fetch_Pxy() { return pXY; }
-    doubleArr3& fetch_Pxz() { return pXZ; }
-    doubleArr3& fetch_Pyy() { return pYY; }
-    doubleArr3& fetch_Pyz() { return pYZ; }
-    doubleArr3& fetch_Pzz() { return pZZ; }
+    arr3_double& fetch_rho() { return rho; }
+    arr3_double& fetch_Jx () { return Jx ; }
+    arr3_double& fetch_Jy () { return Jy ; }
+    arr3_double& fetch_Jz () { return Jz ; }
+    arr3_double& fetch_Pxx() { return pXX; }
+    arr3_double& fetch_Pxy() { return pXY; }
+    arr3_double& fetch_Pxz() { return pXZ; }
+    arr3_double& fetch_Pyy() { return pYY; }
+    arr3_double& fetch_Pyz() { return pYZ; }
+    arr3_double& fetch_Pzz() { return pZZ; }
   public:
     Moments(int nxn, int nyn, int nzn) :
       nx(nxn),
diff --git a/include/PSKhdf5adaptor.h b/include/PSKhdf5adaptor.h
index 88b3712a..b9faf69c 100644
--- a/include/PSKhdf5adaptor.h
+++ b/include/PSKhdf5adaptor.h
@@ -9,6 +9,7 @@
 
 #include "hdf5.h"
 #include "hdf5_hl.h"
+#include "arraysfwd.h"
 
 namespace PSK {
 
@@ -50,12 +51,12 @@ namespace PSK {
     void write(const std::string & objname, double d);
     void write(const std::string & objname, const Dimens dimens, const double *d_array);
     void write(const std::string & objname, const Dimens dimens, const std::vector < double >&d_array);
-    void write(const std::string & objname, const Dimens dimens, double ***d_array);
-    void write(const std::string & objname, const Dimens dimens, const int i, double ****d_array);
+    void write(const std::string & objname, const Dimens dimens, const_arr3_double d_array);
+    void write(const std::string & objname, const Dimens dimens, const int i, const_arr4_double d_array);
 
     void write(const std::string & objname, const Dimens dimens, double **d_array);
 
-    void write(const std::string & objname, const Dimens dimens, const int i, double ***d_array);
+    void write(const std::string & objname, const Dimens dimens, const int i, const_arr3_double d_array);
 
   };
 
diff --git a/include/TransArraySpace3D.h b/include/TransArraySpace3D.h
index 43483fa4..c83710f3 100644
--- a/include/TransArraySpace3D.h
+++ b/include/TransArraySpace3D.h
@@ -11,7 +11,7 @@ developers           : Stefano Markidis, Giovanni Lapenta
 #define TransArraySpace3D_H
 
 /** method to convert a 1D field in a 3D field not considering guard cells*/
-inline void solver2phys(double ***vectPhys, double *vectSolver, int nx, int ny, int nz) {
+inline void solver2phys(arr3_double& vectPhys, double *vectSolver, int nx, int ny, int nz) {
   for (register int i = 1; i < nx - 1; i++)
     for (register int j = 1; j < ny - 1; j++)
       for (register int k = 1; k < nz - 1; k++)
@@ -19,7 +19,7 @@ inline void solver2phys(double ***vectPhys, double *vectSolver, int nx, int ny,
 
 }
 /** method to convert a 1D field in a 3D field not considering guard cells*/
-inline void solver2phys(double ***vectPhys1, double ***vectPhys2, double ***vectPhys3, double *vectSolver, int nx, int ny, int nz) {
+inline void solver2phys(arr3_double& vectPhys1, arr3_double& vectPhys2, arr3_double& vectPhys3, double *vectSolver, int nx, int ny, int nz) {
   for (register int i = 1; i < nx - 1; i++)
     for (register int j = 1; j < ny - 1; j++)
       for (register int k = 1; k < nz - 1; k++) {
@@ -29,23 +29,20 @@ inline void solver2phys(double ***vectPhys1, double ***vectPhys2, double ***vect
       }
 }
 /** method to convert a 3D field in a 1D field not considering guard cells*/
-inline void phys2solver(double *vectSolver, double ***vectPhys, int nx, int ny, int nz) {
+inline void phys2solver(double *vectSolver, const arr3_double& vectPhys, int nx, int ny, int nz) {
   for (register int i = 1; i < nx - 1; i++)
     for (register int j = 1; j < ny - 1; j++)
       for (register int k = 1; k < nz - 1; k++)
-        *vectSolver++ = vectPhys[i][j][k];
-
-
+        *vectSolver++ = vectPhys.get(i,j,k);
 }
 /** method to convert a 3D field in a 1D field not considering guard cells*/
-inline void phys2solver(double *vectSolver, double ***vectPhys1, double ***vectPhys2, double ***vectPhys3, int nx, int ny, int nz) {
+inline void phys2solver(double *vectSolver, const arr3_double& vectPhys1, const arr3_double& vectPhys2, const arr3_double& vectPhys3, int nx, int ny, int nz) {
   for (register int i = 1; i < nx - 1; i++)
     for (register int j = 1; j < ny - 1; j++)
       for (register int k = 1; k < nz - 1; k++) {
-        *vectSolver++ = vectPhys1[i][j][k];
-        *vectSolver++ = vectPhys2[i][j][k];
-        *vectSolver++ = vectPhys3[i][j][k];
+        *vectSolver++ = vectPhys1.get(i,j,k);
+        *vectSolver++ = vectPhys2.get(i,j,k);
+        *vectSolver++ = vectPhys3.get(i,j,k);
       }
-
 }
 #endif
diff --git a/include/arraysfwd.h b/include/arraysfwd.h
new file mode 100644
index 00000000..41fdbc19
--- /dev/null
+++ b/include/arraysfwd.h
@@ -0,0 +1,52 @@
+/* forward declaration for array classes */
+#ifndef arraysfwd_h
+#define arraysfwd_h
+
+namespace iPic3D
+{
+  template <class T>
+  class const_array_ref3;
+  template <class T>
+  class const_array_ref4;
+  template <class T>
+  class array_ref1;
+  template <class T>
+  class array_ref2;
+  template <class T>
+  class array_ref3;
+  template <class T>
+  class array_ref4;
+  template <class T>
+  class array1;
+  template <class T>
+  class array2;
+  template <class T>
+  class array3;
+  template <class T>
+  class array4;
+}
+
+// These aliases are defined for the following flexibilization purposes:
+// - to avoid filling the code with template brackets
+//   (i.e., to minimize explicitly template-dependent code).
+// - so that they can be redefined according to the user's
+//   preferred array implementation.
+//
+//typedef array_ref1<int> intArr1;
+//typedef array_ref2<int> intArr2;
+//typedef array_ref3<int> intArr3;
+//typedef array_ref4<int> intArr4;
+//typedef const_array_ref1<double> arr1_double;
+//typedef const_array_ref2<double> arr2_double;
+//
+typedef iPic3D::const_array_ref3<double> const_arr3_double;
+typedef iPic3D::const_array_ref4<double> const_arr4_double;
+typedef iPic3D::array_ref1<double> arr1_double;
+typedef iPic3D::array_ref2<double> arr2_double;
+typedef iPic3D::array_ref3<double> arr3_double;
+typedef iPic3D::array_ref4<double> arr4_double;
+typedef iPic3D::array1<double> array1_double;
+typedef iPic3D::array2<double> array2_double;
+typedef iPic3D::array3<double> array3_double;
+typedef iPic3D::array4<double> array4_double;
+#endif
diff --git a/include/phdf5.h b/include/phdf5.h
index a6465e98..39dba569 100644
--- a/include/phdf5.h
+++ b/include/phdf5.h
@@ -9,6 +9,7 @@ using namespace std;
 #include "mpi.h"
 #include "hdf5.h"
 #include "hdf5_hl.h"
+#include "arraysfwd.h"
 
 class PHDF5fileClass{
 
@@ -21,9 +22,9 @@ class PHDF5fileClass{
     void CreatePHDF5file(double *L, int *dglob, int *dlocl, bool bp);
     void ClosePHDF5file();
     void OpenPHDF5file();
-    void ReadPHDF5dataset_double(string dataset, double ***data);
+    void ReadPHDF5dataset_double(string dataset, arr3_double& data);
     void ReadPHDF5param();
-    int  WritePHDF5dataset(string grpname, string datasetname, double ***data, int nx, int ny, int nz);
+    int  WritePHDF5dataset(string grpname, string datasetname, const_arr3_double& data, int nx, int ny, int nz);
 
     int  getPHDF5ndim();
     int  getPHDF5ncx();
diff --git a/inputoutput/Collective.cpp b/inputoutput/Collective.cpp
index 9f11e3de..140e1647 100644
--- a/inputoutput/Collective.cpp
+++ b/inputoutput/Collective.cpp
@@ -1,6 +1,7 @@
 
 #include <mpi.h>
 #include "Collective.h"
+#include "debug.h"
 
 /*! Read the input file from text file and put the data in a collective wrapper: if it's a restart read from input file basic sim data and load particles and EM field from restart file */
 void Collective::ReadInput(string inputfile) {
diff --git a/inputoutput/WriteOutputParallel.cpp b/inputoutput/WriteOutputParallel.cpp
index 6de0d2ea..c024a069 100644
--- a/inputoutput/WriteOutputParallel.cpp
+++ b/inputoutput/WriteOutputParallel.cpp
@@ -41,17 +41,22 @@ void WriteOutputParallel(Grid3DCU *grid, EMfields3D *EMf, CollectiveIO *col, VCt
   /* Write the Electric field */
   /* ------------------------ */
 
+  array3_double arr3(nxc-2,nyc-2,nzc-2);
+
   grpname = "Fields";
   dtaname = "Ex";
-  outputfile.WritePHDF5dataset(grpname, dtaname, EMf->getExc(grid), nxc-2, nyc-2, nzc-2);
+  EMf->getExc(arr3,grid);
+  outputfile.WritePHDF5dataset(grpname, dtaname, arr3, nxc-2, nyc-2, nzc-2);
 
   grpname = "Fields";
   dtaname = "Ey";
-  outputfile.WritePHDF5dataset(grpname, dtaname, EMf->getEyc(grid), nxc-2, nyc-2, nzc-2);
+  EMf->getEyc(arr3,grid);
+  outputfile.WritePHDF5dataset(grpname, dtaname, arr3, nxc-2, nyc-2, nzc-2);
 
   grpname = "Fields";
   dtaname = "Ez";
-  outputfile.WritePHDF5dataset(grpname, dtaname, EMf->getEzc(grid), nxc-2, nyc-2, nzc-2);
+  EMf->getEzc(arr3,grid);
+  outputfile.WritePHDF5dataset(grpname, dtaname, arr3, nxc-2, nyc-2, nzc-2);
 
   /* ------------------------ */
   /* Write the Magnetic field */
@@ -59,15 +64,18 @@ void WriteOutputParallel(Grid3DCU *grid, EMfields3D *EMf, CollectiveIO *col, VCt
 
   grpname = "Fields";
   dtaname = "Bx";
-  outputfile.WritePHDF5dataset(grpname, dtaname, EMf->getBxc(), nxc-2, nyc-2, nzc-2);
+  EMf->getBxc(arr3);
+  outputfile.WritePHDF5dataset(grpname, dtaname, arr3, nxc-2, nyc-2, nzc-2);
 
   grpname = "Fields";
   dtaname = "By";
-  outputfile.WritePHDF5dataset(grpname, dtaname, EMf->getByc(), nxc-2, nyc-2, nzc-2);
+  EMf->getByc(arr3);
+  outputfile.WritePHDF5dataset(grpname, dtaname, arr3, nxc-2, nyc-2, nzc-2);
 
   grpname = "Fields";
   dtaname = "Bz";
-  outputfile.WritePHDF5dataset(grpname, dtaname, EMf->getBzc(), nxc-2, nyc-2, nzc-2);
+  EMf->getBzc(arr3);
+  outputfile.WritePHDF5dataset(grpname, dtaname, arr3, nxc-2, nyc-2, nzc-2);
 
   /* ----------------------------------------------- */
   /* Write the Charge Density field for each species */
@@ -80,7 +88,9 @@ void WriteOutputParallel(Grid3DCU *grid, EMfields3D *EMf, CollectiveIO *col, VCt
 
     grpname = "Fields";
     dtaname = "Rho_" + snmbr.str();
-    outputfile.WritePHDF5dataset(grpname, dtaname, EMf->getRHOcs(grid, is), nxc-2, nyc-2, nzc-2);
+    EMf->getRHOcs(arr3,grid, is);
+    EMf->getRHOcs(arr3, grid, is);
+    outputfile.WritePHDF5dataset(grpname, dtaname, arr3, nxc-2, nyc-2, nzc-2);
   }
 
   /* ---------------------------------------- */
@@ -94,15 +104,18 @@ void WriteOutputParallel(Grid3DCU *grid, EMfields3D *EMf, CollectiveIO *col, VCt
 
     grpname = "Fields";
     dtaname = "Jx_" + snmbr.str();
-    outputfile.WritePHDF5dataset(grpname, dtaname, EMf->getJxsc(grid, is), nxc-2, nyc-2, nzc-2);
+    EMf->getJxsc(arr3, grid, is);
+    outputfile.WritePHDF5dataset(grpname, dtaname, arr3, nxc-2, nyc-2, nzc-2);
 
     grpname = "Fields";
     dtaname = "Jy_" + snmbr.str();
-    outputfile.WritePHDF5dataset(grpname, dtaname, EMf->getJysc(grid, is), nxc-2, nyc-2, nzc-2);
+    EMf->getJysc(arr3, grid, is);
+    outputfile.WritePHDF5dataset(grpname, dtaname, arr3, nxc-2, nyc-2, nzc-2);
 
     grpname = "Fields";
     dtaname = "Jz_" + snmbr.str();
-    outputfile.WritePHDF5dataset(grpname, dtaname, EMf->getJzsc(grid, is), nxc-2, nyc-2, nzc-2);
+    EMf->getJzsc(arr3, grid, is);
+    outputfile.WritePHDF5dataset(grpname, dtaname, arr3, nxc-2, nyc-2, nzc-2);
   }
 
   outputfile.ClosePHDF5file();
diff --git a/inputoutput/phdf5.cpp b/inputoutput/phdf5.cpp
index 3eb39397..5b8368ab 100644
--- a/inputoutput/phdf5.cpp
+++ b/inputoutput/phdf5.cpp
@@ -3,6 +3,7 @@
 #include "phdf5.h"
 #include "ipicdefs.h"
 #include "errors.h"
+#include "Alloc.h"
 
 PHDF5fileClass::PHDF5fileClass(string filestr, int nd, int *coord, MPI_Comm mpicomm){
 
@@ -111,7 +112,7 @@ void PHDF5fileClass::ClosePHDF5file(){
 
 }
 
-int PHDF5fileClass::WritePHDF5dataset(string grpname, string datasetname, double ***data, int nx, int ny, int nz){
+int PHDF5fileClass::WritePHDF5dataset(string grpname, string datasetname, const_arr3_double& data, int nx, int ny, int nz){
 
   /* -------------------------- */
   /* Local variables and arrays */
@@ -265,7 +266,7 @@ void PHDF5fileClass::ReadPHDF5param(){
 
 }
 
-void PHDF5fileClass::ReadPHDF5dataset_double(string datasetname, double ***data){
+void PHDF5fileClass::ReadPHDF5dataset_double(string datasetname, arr3_double& data){
 
   herr_t  status;
   double *filedata;
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index b05ad584..b0cb1426 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -2,6 +2,7 @@
 #include "iPic3D.h"
 #include "TimeTasks.h"
 #include "ipicdefs.h"
+#include "debug.h"
 
 using namespace iPic3D;
 MPIdata* iPic3D::c_Solver::mpi=0;
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 7a4d01d0..469fff44 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -316,16 +316,12 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
     cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
   }
   double start_mover_PC = MPI_Wtime();
-  double ***Ex = EMf->getEx();
-  double ***Ey = EMf->getEy();
-  double ***Ez = EMf->getEz();
-  double ***Bx = EMf->getBx();
-  double ***By = EMf->getBy();
-  double ***Bz = EMf->getBz();
-
-  double ***Bx_ext = EMf->getBx_ext();
-  double ***By_ext = EMf->getBy_ext();
-  double ***Bz_ext = EMf->getBz_ext();
+  const_arr3_double Ex = EMf->getEx();
+  const_arr3_double Ey = EMf->getEy();
+  const_arr3_double Ez = EMf->getEz();
+  const_arr3_double Bx = EMf->getBx();
+  const_arr3_double By = EMf->getBy();
+  const_arr3_double Bz = EMf->getBz();
 
   const double dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
   const double inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
@@ -423,32 +419,32 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       const double weight110 = xi[1] * eta[1] * zeta[0] * invVOL;
       const double weight111 = xi[1] * eta[1] * zeta[1] * invVOL;
       // 
-      Bxl += weight000 * Bx[ix][iy][iz]             + Bx_ext[ix][iy][iz];
-      Bxl += weight001 * Bx[ix][iy][iz - 1]         + Bx_ext[ix][iy][iz-1];
-      Bxl += weight010 * Bx[ix][iy - 1][iz]         + Bx_ext[ix][iy-1][iz];
-      Bxl += weight011 * Bx[ix][iy - 1][iz - 1]     + Bx_ext[ix][iy-1][iz-1];
-      Bxl += weight100 * Bx[ix - 1][iy][iz]         + Bx_ext[ix-1][iy][iz];
-      Bxl += weight101 * Bx[ix - 1][iy][iz - 1]     + Bx_ext[ix-1][iy][iz-1];
-      Bxl += weight110 * Bx[ix - 1][iy - 1][iz]     + Bx_ext[ix-1][iy-1][iz];
-      Bxl += weight111 * Bx[ix - 1][iy - 1][iz - 1] + Bx_ext[ix-1][iy-1][iz-1];
+      Bxl += weight000 * Bx[ix][iy][iz];
+      Bxl += weight001 * Bx[ix][iy][iz - 1];
+      Bxl += weight010 * Bx[ix][iy - 1][iz];
+      Bxl += weight011 * Bx[ix][iy - 1][iz - 1];
+      Bxl += weight100 * Bx[ix - 1][iy][iz];
+      Bxl += weight101 * Bx[ix - 1][iy][iz - 1];
+      Bxl += weight110 * Bx[ix - 1][iy - 1][iz];
+      Bxl += weight111 * Bx[ix - 1][iy - 1][iz - 1];
       // 
-      Byl += weight000 * By[ix][iy][iz]             + By_ext[ix][iy][iz];
-      Byl += weight001 * By[ix][iy][iz - 1]         + By_ext[ix][iy][iz-1];
-      Byl += weight010 * By[ix][iy - 1][iz]         + By_ext[ix][iy-1][iz];
-      Byl += weight011 * By[ix][iy - 1][iz - 1]     + By_ext[ix][iy-1][iz-1];
-      Byl += weight100 * By[ix - 1][iy][iz]         + By_ext[ix-1][iy][iz];
-      Byl += weight101 * By[ix - 1][iy][iz - 1]     + By_ext[ix-1][iy][iz-1];
-      Byl += weight110 * By[ix - 1][iy - 1][iz]     + By_ext[ix-1][iy-1][iz];
-      Byl += weight111 * By[ix - 1][iy - 1][iz - 1] + By_ext[ix-1][iy-1][iz-1];
+      Byl += weight000 * By[ix][iy][iz];
+      Byl += weight001 * By[ix][iy][iz - 1];
+      Byl += weight010 * By[ix][iy - 1][iz];
+      Byl += weight011 * By[ix][iy - 1][iz - 1];
+      Byl += weight100 * By[ix - 1][iy][iz];
+      Byl += weight101 * By[ix - 1][iy][iz - 1];
+      Byl += weight110 * By[ix - 1][iy - 1][iz];
+      Byl += weight111 * By[ix - 1][iy - 1][iz - 1];
       // 
-      Bzl += weight000 * Bz[ix][iy][iz]             + Bz_ext[ix][iy][iz];
-      Bzl += weight001 * Bz[ix][iy][iz - 1]         + Bz_ext[ix][iy][iz-1];
-      Bzl += weight010 * Bz[ix][iy - 1][iz]         + Bz_ext[ix][iy-1][iz];
-      Bzl += weight011 * Bz[ix][iy - 1][iz - 1]     + Bz_ext[ix][iy-1][iz-1];
-      Bzl += weight100 * Bz[ix - 1][iy][iz]         + Bz_ext[ix-1][iy][iz];
-      Bzl += weight101 * Bz[ix - 1][iy][iz - 1]     + Bz_ext[ix-1][iy][iz-1];
-      Bzl += weight110 * Bz[ix - 1][iy - 1][iz]     + Bz_ext[ix-1][iy-1][iz];
-      Bzl += weight111 * Bz[ix - 1][iy - 1][iz - 1] + Bz_ext[ix-1][iy-1][iz-1];
+      Bzl += weight000 * Bz[ix][iy][iz];
+      Bzl += weight001 * Bz[ix][iy][iz - 1];
+      Bzl += weight010 * Bz[ix][iy - 1][iz];
+      Bzl += weight011 * Bz[ix][iy - 1][iz - 1];
+      Bzl += weight100 * Bz[ix - 1][iy][iz];
+      Bzl += weight101 * Bz[ix - 1][iy][iz - 1];
+      Bzl += weight110 * Bz[ix - 1][iy - 1][iz];
+      Bzl += weight111 * Bz[ix - 1][iy - 1][iz - 1];
       // 
       Exl += weight000 * Ex[ix][iy][iz];
       Exl += weight001 * Ex[ix][iy][iz - 1];
diff --git a/tests/Makefile b/tests/Makefile
index 10559ab9..01a27091 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -6,9 +6,10 @@ OBJECTS = \
   ../utility/asserts.o \
   debug.o
 
-FLAGS = -O3 -DNO_MPI -fno-exceptions #-DCHECK_BOUNDS -ggdb
+ARRAY_FLAGS = #-DCHAINED_ARRAYS #-DFLAT_ARRAYS #-DCHECK_BOUNDS
+FLAGS = -O3 -DNO_MPI -fno-exceptions $(ARRAY_FLAGS) #-ggdb 
 
-COMPILER = g++ #icpc # g++
+COMPILER = c++ #icpc # g++
 
 test: clean_test_arrays test_arrays
 
diff --git a/tests/test_arrays.cpp b/tests/test_arrays.cpp
index 45cdc92c..bfc7aa66 100644
--- a/tests/test_arrays.cpp
+++ b/tests/test_arrays.cpp
@@ -336,9 +336,9 @@ void testArr2_diagonal()
    type** Bold = newArr2(type, dim1, dim2);
    type** Cold = newArr2(type, dim1, dim2);
 
-   Arr2<type> Aarr(dim1, dim2);
-   Arr2<type> Barr(dim1, dim2);
-   Arr2<type> Carr(dim1, dim2);
+   array_ref2<type> Aarr(dim1, dim2);
+   array_ref2<type> Barr(dim1, dim2);
+   array_ref2<type> Carr(dim1, dim2);
 
    printf("Initializing data ...\n");
    for(size_t i=0; i<dim1; i++)
@@ -397,7 +397,7 @@ void testArr2_diagonal()
    {
       Aarr[i][j] = Barr[i][j] * Carr[i][j];
    }
-   printf("%d ms = Total time [i][j] access of Arr2\n", tv_to_ms(stopwatch(LAP)));
+   printf("%d ms = Total time [i][j] access of array_ref2\n", tv_to_ms(stopwatch(LAP)));
 
    for(int t=0; t<ITERS; t++)
    for(size_t i=0; i<dim1; i++)
@@ -406,7 +406,7 @@ void testArr2_diagonal()
       //Aarr(i,j) = Barr(i,j) * Carr(i,j);
       Aarr.fetch(i,j) = Barr.fetch(i,j) * Carr.fetch(i,j);
    }
-   printf("%d ms = Total time (i,j) access of Arr2\n", tv_to_ms(stopwatch(LAP)));
+   printf("%d ms = Total time (i,j) access of array_ref2\n", tv_to_ms(stopwatch(LAP)));
 
    for(size_t i=0; i<dim1; i++)
    for(size_t j=i; j<dim2; j++)
@@ -447,9 +447,9 @@ void testArr2()
    type** Bold = newArr2(type, dim1, dim2);
    type** Cold = newArr2(type, dim1, dim2);
 
-   Arr2<type> Aarr(dim1, dim2);
-   Arr2<type> Barr(dim1, dim2);
-   Arr2<type> Carr(dim1, dim2);
+   array_ref2<type> Aarr(dim1, dim2);
+   array_ref2<type> Barr(dim1, dim2);
+   array_ref2<type> Carr(dim1, dim2);
 
    printf("Initializing data ...\n");
    for(size_t i=0; i<dim1; i++)
@@ -506,7 +506,7 @@ void testArr2()
    {
       Aarr[i][j] = Barr[i][j] * Carr[i][j];
    }
-   printf("%d ms = Total time [i][j] access of Arr2\n", tv_to_ms(stopwatch(LAP)));
+   printf("%d ms = Total time [i][j] access of array_ref2\n", tv_to_ms(stopwatch(LAP)));
 
    for(int t=0; t<ITERS; t++)
    for(size_t i=0; i<dim1; i++)
@@ -514,7 +514,7 @@ void testArr2()
    {
       Aarr.fetch(i,j) = Barr.get(i,j) * Carr.get(i,j);
    }
-   printf("%d ms = Total time (i,j) access of Arr2\n", tv_to_ms(stopwatch(LAP)));
+   printf("%d ms = Total time (i,j) access of array_ref2\n", tv_to_ms(stopwatch(LAP)));
 
    for(size_t i=0; i<dim1; i++)
    for(size_t j=0; j<dim2; j++)
@@ -542,6 +542,20 @@ for(size_t k=0; k<dim3; k++) \
 } \
 printf("%d ms = Total time " #arg2 "\n", tv_to_ms(stopwatch(LAP)));
 
+template <class type>
+void set_prod3(array_ref3<type> Aarr,const_arr3<type> Barr,array_ref3<type> Carr,int ITERS, size_t dim1,size_t dim2,size_t dim3)
+{
+   for(int t=0; t<ITERS; t++)
+   for(size_t i=0; i<dim1; i++)
+   for(size_t j=0; j<dim2; j++)
+   for(size_t k=0; k<dim3; k++)
+   {
+      //Aarr[i][j][k] = Barr[i][j][k] * Carr[i][j][k];
+      Aarr.fetch(i,j,k) = Barr.get(i,j,k) * Carr.get(i,j,k);
+   }
+   printf("%d ms = Total time [i][j][k] access of array_ref3\n", tv_to_ms(stopwatch(LAP)));
+}
+
 template <class type>
 void testArr3()
 {
@@ -562,12 +576,12 @@ void testArr3()
    type*** Bold = newArr3(type, dim1, dim2, dim3);
    type*** Cold = newArr3(type, dim1, dim2, dim3);
 
-   //Array3<type> Aarr(dim1, dim2, dim3);
-   //Array3<type> Barr(dim1, dim2, dim3);
-   //Array3<type> Carr(dim1, dim2, dim3);
-   Arr3<type> Aarr(dim1, dim2, dim3);
-   Arr3<type> Barr(dim1, dim2, dim3);
-   Arr3<type> Carr(dim1, dim2, dim3);
+   //array3<type> Aarr(dim1, dim2, dim3);
+   //array3<type> Barr(dim1, dim2, dim3);
+   //array3<type> Carr(dim1, dim2, dim3);
+   array_ref3<type> Aarr(dim1, dim2, dim3);
+   array_ref3<type> Barr(dim1, dim2, dim3);
+   array_ref3<type> Carr(dim1, dim2, dim3);
 
    printf("Initializing data ...\n");
    for(size_t i=0; i<dim1; i++)
@@ -615,14 +629,7 @@ void testArr3()
    }
    printf("%d ms = Total time (i,j,k) Vicenc array\n", tv_to_ms(stopwatch(LAP)));
 
-   for(int t=0; t<ITERS; t++)
-   for(size_t i=0; i<dim1; i++)
-   for(size_t j=0; j<dim2; j++)
-   for(size_t k=0; k<dim3; k++)
-   {
-      Aarr[i][j][k] = Barr[i][j][k] * Carr[i][j][k];
-   }
-   printf("%d ms = Total time [i][j][k] access of Arr3\n", tv_to_ms(stopwatch(LAP)));
+   set_prod3(Aarr,Barr,Carr,ITERS,dim1,dim2,dim3);
 
    for(int t=0; t<ITERS; t++)
    for(size_t i=0; i<dim1; i++)
@@ -631,7 +638,7 @@ void testArr3()
    {
       Aarr.fetch(i,j,k) = Barr.get(i,j,k) * Carr.get(i,j,k);
    }
-   printf("%d ms = Total time (i,j,k) access of Arr3\n", tv_to_ms(stopwatch(LAP)));
+   printf("%d ms = Total time (i,j,k) access of array_ref3\n", tv_to_ms(stopwatch(LAP)));
 
    for(size_t i=0; i<dim1; i++)
    for(size_t j=0; j<dim2; j++)
@@ -662,7 +669,7 @@ void testArr4()
    // For some bizarre reason, if I comment out the code for the
    // "fbr" and "fpa" arrays below then icpc on knc2 is somehow
    // able to figure out that each iteration does the same thing
-   // in the case of Arr4, but not in the case of the chained
+   // in the case of array_ref4, but not in the case of the chained
    // pointer or fixed-dimension arrays.  Why not?  And why
    // does this optimization occur for four-dimensional arrays
    // and not for 3- or 2-dimensional arrays?  And why is this
@@ -682,21 +689,21 @@ void testArr4()
    type**** Bold = newArr4(type, dim1, dim2, dim3, dim4);
    type**** Cold = newArr4(type, dim1, dim2, dim3, dim4);
 
-   //Array4<type> Afbr(dim1, dim2, dim3, dim4);
-   //Array4<type> Bfbr(dim1, dim2, dim3, dim4);
-   //Array4<type> Cfbr(dim1, dim2, dim3, dim4);
+   //array4<type> Afbr(dim1, dim2, dim3, dim4);
+   //array4<type> Bfbr(dim1, dim2, dim3, dim4);
+   //array4<type> Cfbr(dim1, dim2, dim3, dim4);
 
-   //Array4<type> Afpa(dim1, dim2, dim3, dim4);
-   //Array4<type> Bfpa(dim1, dim2, dim3, dim4);
-   //Array4<type> Cfpa(dim1, dim2, dim3, dim4);
+   //array4<type> Afpa(dim1, dim2, dim3, dim4);
+   //array4<type> Bfpa(dim1, dim2, dim3, dim4);
+   //array4<type> Cfpa(dim1, dim2, dim3, dim4);
 
-   Arr4<type> Abra(dim1, dim2, dim3, dim4);
-   Arr4<type> Bbra(dim1, dim2, dim3, dim4);
-   Arr4<type> Cbra(dim1, dim2, dim3, dim4);
+   array_ref4<type> Abra(dim1, dim2, dim3, dim4);
+   array_ref4<type> Bbra(dim1, dim2, dim3, dim4);
+   array_ref4<type> Cbra(dim1, dim2, dim3, dim4);
 
-   Arr4<type> Apar(dim1, dim2, dim3, dim4);
-   Arr4<type> Bpar(dim1, dim2, dim3, dim4);
-   Arr4<type> Cpar(dim1, dim2, dim3, dim4);
+   array_ref4<type> Apar(dim1, dim2, dim3, dim4);
+   array_ref4<type> Bpar(dim1, dim2, dim3, dim4);
+   array_ref4<type> Cpar(dim1, dim2, dim3, dim4);
 
    printf("Initializing data ...\n");
    for(size_t i=0; i<dim1; i++)
@@ -750,7 +757,7 @@ void testArr4()
    //{
    //   Afbr.fetch(i,j,k,l) = Bfbr.get(i,j,k,l) * Cfbr.get(i,j,k,l);
    //}
-   //printf("%d us = Total time (i,j,k,l) access of Array4\n", tv_to_us(stopwatch(LAP)));
+   //printf("%d us = Total time (i,j,k,l) access of array4\n", tv_to_us(stopwatch(LAP)));
 
    //for(int t=0; t<ITERS; t++)
    //for(size_t i=0; i<dim1; i++)
@@ -760,7 +767,7 @@ void testArr4()
    //{
    //   Afpa.fetch(i,j,k,l) = Bfpa.get(i,j,k,l) * Cfpa.get(i,j,k,l);
    //}
-   //printf("%d us = Total time (i,j,k,l) access of Array4\n", tv_to_us(stopwatch(LAP)));
+   //printf("%d us = Total time (i,j,k,l) access of array4\n", tv_to_us(stopwatch(LAP)));
 
    for(int t=0; t<ITERS; t++)
    for(size_t i=0; i<dim1; i++)
@@ -770,7 +777,7 @@ void testArr4()
    {
       Abra[i][j][k][l] = Bbra[i][j][k][l] * Cbra[i][j][k][l];
    }
-   printf("%d us = Total time [i][j][k][l] access of Arr4\n", tv_to_us(stopwatch(LAP)));
+   printf("%d us = Total time [i][j][k][l] access of array_ref4\n", tv_to_us(stopwatch(LAP)));
 
    for(int t=0; t<ITERS; t++)
    for(size_t i=0; i<dim1; i++)
@@ -780,7 +787,7 @@ void testArr4()
    {
       Apar.fetch(i,j,k,l) = Bpar.get(i,j,k,l) * Cpar.get(i,j,k,l);
    }
-   printf("%d us = Total time (i,j,k,l) access of Arr4\n", tv_to_us(stopwatch(LAP)));
+   printf("%d us = Total time (i,j,k,l) access of array_ref4\n", tv_to_us(stopwatch(LAP)));
 
    for(int t=0; t<ITERS; t++)
    for(size_t i=0; i<dim1; i++)
@@ -808,20 +815,20 @@ void testArr4()
 
 int main()
 {
-  //printf("=== testing Arr2<int> (diagonal) ===\n");
+  //printf("=== testing array_ref2<int> (diagonal) ===\n");
   //testArr2_diagonal<int>();
-  //printf("=== testing Arr2<double> (diagonal) ===\n");
+  //printf("=== testing array_ref2<double> (diagonal) ===\n");
   //testArr2_diagonal<double>();
-  printf("=== testing Arr2<int> ===\n");
+  printf("=== testing array_ref2<int> ===\n");
   testArr2<int>();
-  printf("=== testing Arr2<double> ===\n");
+  printf("=== testing array_ref2<double> ===\n");
   testArr2<double>();
-  printf("=== testing Arr3<int> ===\n");
+  printf("=== testing array_ref3<int> ===\n");
   testArr3<int>();
-  printf("=== testing Arr3<double> ===\n");
+  printf("=== testing array_ref3<double> ===\n");
   testArr3<double>();
-  printf("=== testing Arr4<int> ===\n");
+  printf("=== testing array_ref4<int> ===\n");
   testArr4<int>();
-  printf("=== testing Arr4<double> ===\n");
+  printf("=== testing array_ref4<double> ===\n");
   testArr4<double>();
 }

From f2fe840be04b02cfcbb0d53adffc9dffdc1954ee Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 9 Aug 2013 17:35:24 +0200
Subject: [PATCH 028/118] eliminated unnecessary reference for arrays

---
 communication/ComNodes3D.cpp | 26 +++++-----
 fields/EMfields3D.cpp        | 54 ++++++++++----------
 grids/Grid3DCU.cpp           | 34 ++++++-------
 include/Basic.h              | 66 ++++++++++++------------
 include/ComNodes3D.h         | 26 +++++-----
 include/EMfields3D.h         | 56 ++++++++++-----------
 include/Grid3DCU.h           | 98 ++++++++++++++++++------------------
 include/Moments.h            | 20 ++++----
 include/TransArraySpace3D.h  |  8 +--
 include/phdf5.h              |  4 +-
 inputoutput/phdf5.cpp        |  4 +-
 11 files changed, 198 insertions(+), 198 deletions(-)

diff --git a/communication/ComNodes3D.cpp b/communication/ComNodes3D.cpp
index 5d6f4424..73906404 100644
--- a/communication/ComNodes3D.cpp
+++ b/communication/ComNodes3D.cpp
@@ -5,7 +5,7 @@
 #include "Alloc.h"
 
 /** communicate ghost cells (FOR NODES) */
-void communicateNode(int nx, int ny, int nz, arr3_double& _vector, VirtualTopology3D * vct) {
+void communicateNode(int nx, int ny, int nz, arr3_double _vector, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ***vector=_vector.fetch_arr3();
 
@@ -109,7 +109,7 @@ void communicateNode(int nx, int ny, int nz, arr3_double& _vector, VirtualTopolo
   timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR NODES) */
-void communicateNodeBC(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateNodeBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ***vector = _vector.fetch_arr3();
   // allocate 6 ghost cell Faces
@@ -215,7 +215,7 @@ void communicateNodeBC(int nx, int ny, int nz, arr3_double& _vector, int bcFaceX
   timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR NODES) with particles BC*/
-void communicateNodeBC_P(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateNodeBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
@@ -322,7 +322,7 @@ void communicateNodeBC_P(int nx, int ny, int nz, arr3_double& _vector, int bcFac
 }
 
 /** SPECIES: communicate ghost cells */
-void communicateNode(int nx, int ny, int nz, arr4_double& _vector, int ns, VirtualTopology3D * vct) {
+void communicateNode(int nx, int ny, int nz, arr4_double _vector, int ns, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ****vector = _vector.fetch_arr4();
 
@@ -427,7 +427,7 @@ void communicateNode(int nx, int ny, int nz, arr4_double& _vector, int ns, Virtu
 
 // PARTICLES
 /** SPECIES: communicate ghost cells */
-void communicateNode_P(int nx, int ny, int nz, arr4_double& _vector, int ns, VirtualTopology3D * vct) {
+void communicateNode_P(int nx, int ny, int nz, arr4_double _vector, int ns, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ****vector = _vector.fetch_arr4();
 
@@ -532,7 +532,7 @@ void communicateNode_P(int nx, int ny, int nz, arr4_double& _vector, int ns, Vir
 
 // 
 /** communicate ghost cells (FOR CENTERS) */
-void communicateCenter(int nx, int ny, int nz, arr3_double& _vector, VirtualTopology3D * vct) {
+void communicateCenter(int nx, int ny, int nz, arr3_double _vector, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ***vector = _vector.fetch_arr3();
 
@@ -634,7 +634,7 @@ void communicateCenter(int nx, int ny, int nz, arr3_double& _vector, VirtualTopo
   timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
-void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
@@ -667,7 +667,7 @@ void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double& _vector,
 }
 // particles
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
-void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
@@ -702,7 +702,7 @@ void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double& _vecto
 // 
 
 
-void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
@@ -734,7 +734,7 @@ void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double& _vector, i
   timeTasks.addto_communicate();
 }
 
-void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
@@ -769,7 +769,7 @@ void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double& _vector,
 
 
 /** SPECIES: communicate ghost cells */
-void communicateCenter(int nx, int ny, int nz, arr4_double& _vector, int ns, VirtualTopology3D * vct) {
+void communicateCenter(int nx, int ny, int nz, arr4_double _vector, int ns, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ****vector=_vector.fetch_arr4();
 
@@ -870,7 +870,7 @@ void communicateCenter(int nx, int ny, int nz, arr4_double& _vector, int ns, Vir
   timeTasks.addto_communicate();
 }
 // /////////// communication + BC ////////////////////////////
-void communicateCenterBC(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateCenterBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ***vector=_vector.fetch_arr3();
 
@@ -974,7 +974,7 @@ void communicateCenterBC(int nx, int ny, int nz, arr3_double& _vector, int bcFac
   timeTasks.addto_communicate();
 }
 // /////////// communication + BC ////////////////////////////
-void communicateCenterBC_P(int nx, int ny, int nz, arr3_double& _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
+void communicateCenterBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks.start_communicate();
   double ***vector=_vector.fetch_arr3();
 
diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index c01b149c..93b18100 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -699,7 +699,7 @@ void EMfields3D::MaxwellImage(double *im, double *vector, Grid * grid, VirtualTo
 }
 
 /*! Calculate PI dot (vectX, vectY, vectZ) */
-void EMfields3D::PIdot(arr3_double& PIdotX, arr3_double& PIdotY, arr3_double& PIdotZ, const_arr3_double& vectX, const_arr3_double& vectY, const_arr3_double& vectZ, int ns, Grid * grid) {
+void EMfields3D::PIdot(arr3_double PIdotX, arr3_double PIdotY, arr3_double PIdotZ, const_arr3_double vectX, const_arr3_double vectY, const_arr3_double vectZ, int ns, Grid * grid) {
   double beta, edotb, omcx, omcy, omcz, denom;
   beta = .5 * qom[ns] * dt / c;
   for (int i = 1; i < nxn - 1; i++)
@@ -716,8 +716,8 @@ void EMfields3D::PIdot(arr3_double& PIdotX, arr3_double& PIdotY, arr3_double& PI
       }
 }
 /*! Calculate MU dot (vectX, vectY, vectZ) */
-void EMfields3D::MUdot(arr3_double& MUdotX, arr3_double& MUdotY, arr3_double& MUdotZ,
-  const_arr3_double& vectX, const_arr3_double& vectY, const_arr3_double& vectZ, Grid * grid)
+void EMfields3D::MUdot(arr3_double MUdotX, arr3_double MUdotY, arr3_double MUdotZ,
+  const_arr3_double vectX, const_arr3_double vectY, const_arr3_double vectZ, Grid * grid)
 {
   double beta, edotb, omcx, omcy, omcz, denom;
   for (int i = 1; i < nxn - 1; i++)
@@ -744,7 +744,7 @@ void EMfields3D::MUdot(arr3_double& MUdotX, arr3_double& MUdotY, arr3_double& MU
   }
 }
 /* Interpolation smoothing: Smoothing (vector must already have ghost cells) TO MAKE SMOOTH value as to be different from 1.0 type = 0 --> center based vector ; type = 1 --> node based vector ; */
-void EMfields3D::smooth(double value, arr3_double& vector, int type, Grid * grid, VirtualTopology3D * vct) {
+void EMfields3D::smooth(double value, arr3_double vector, int type, Grid * grid, VirtualTopology3D * vct) {
 
   int nvolte = 6;
   for (int icount = 1; icount < nvolte + 1; icount++) {
@@ -841,7 +841,7 @@ void EMfields3D::smoothE(double value, VirtualTopology3D * vct, Collective *col)
 }
 
 /* SPECIES: Interpolation smoothing TO MAKE SMOOTH value as to be different from 1.0 type = 0 --> center based vector type = 1 --> node based vector */
-void EMfields3D::smooth(double value, arr4_double& vector, int is, int type, Grid * grid, VirtualTopology3D * vct) {
+void EMfields3D::smooth(double value, arr4_double vector, int is, int type, Grid * grid, VirtualTopology3D * vct) {
   cout << "Smoothing for Species not implemented in 3D" << endl;
 }
 
@@ -2488,8 +2488,8 @@ void EMfields3D::sustensorRightZ(double **susxz, double **susyz, double **suszz)
 }
 
 /*! Perfect conductor boundary conditions: LEFT wall */
-void EMfields3D::perfectConductorLeft(arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
-  const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ,
+void EMfields3D::perfectConductorLeft(arr3_double imageX, arr3_double imageY, arr3_double imageZ,
+  const_arr3_double vectorX, const_arr3_double vectorY, const_arr3_double vectorZ,
   int dir, Grid * grid)
 {
   double** susxy;
@@ -2552,10 +2552,10 @@ void EMfields3D::perfectConductorLeft(arr3_double& imageX, arr3_double& imageY,
 
 /*! Perfect conductor boundary conditions: RIGHT wall */
 void EMfields3D::perfectConductorRight(
-  arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
-  const_arr3_double& vectorX,
-  const_arr3_double& vectorY,
-  const_arr3_double& vectorZ,
+  arr3_double imageX, arr3_double imageY, arr3_double imageZ,
+  const_arr3_double vectorX,
+  const_arr3_double vectorY,
+  const_arr3_double vectorZ,
   int dir, Grid * grid)
 {
   double beta, omcx, omcy, omcz, denom;
@@ -2618,7 +2618,7 @@ void EMfields3D::perfectConductorRight(
 }
 
 /*! Perfect conductor boundary conditions for source: LEFT WALL */
-void EMfields3D::perfectConductorLeftS(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ, int dir) {
+void EMfields3D::perfectConductorLeftS(arr3_double vectorX, arr3_double vectorY, arr3_double vectorZ, int dir) {
 
   double ebc[3];
 
@@ -2664,7 +2664,7 @@ void EMfields3D::perfectConductorLeftS(arr3_double& vectorX, arr3_double& vector
 }
 
 /*! Perfect conductor boundary conditions for source: RIGHT WALL */
-void EMfields3D::perfectConductorRightS(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ, int dir) {
+void EMfields3D::perfectConductorRightS(arr3_double vectorX, arr3_double vectorY, arr3_double vectorZ, int dir) {
 
   double ebc[3];
 
@@ -2829,8 +2829,8 @@ void EMfields3D::updateInfoFields(Grid *grid,VirtualTopology3D *vct,Collective *
 
 }
 
-void EMfields3D::BoundaryConditionsEImage(arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
-  const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ,
+void EMfields3D::BoundaryConditionsEImage(arr3_double imageX, arr3_double imageY, arr3_double imageZ,
+  const_arr3_double vectorX, const_arr3_double vectorY, const_arr3_double vectorZ,
   int nx, int ny, int nz, VirtualTopology3D *vct,Grid *grid)
 {
 
@@ -2892,7 +2892,7 @@ void EMfields3D::BoundaryConditionsEImage(arr3_double& imageX, arr3_double& imag
 
 }
 
-void EMfields3D::BoundaryConditionsB(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ,int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct){
+void EMfields3D::BoundaryConditionsB(arr3_double vectorX, arr3_double vectorY, arr3_double vectorZ,int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct){
 
   if(vct->getXleft_neighbor()==MPI_PROC_NULL && bcEMfaceXleft ==2) {
     for (int j=0; j < ny;j++)
@@ -2975,7 +2975,7 @@ void EMfields3D::BoundaryConditionsB(arr3_double& vectorX, arr3_double& vectorY,
 
 }
 
-void EMfields3D::BoundaryConditionsE(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ,int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct){
+void EMfields3D::BoundaryConditionsE(arr3_double vectorX, arr3_double vectorY, arr3_double vectorZ,int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct){
 
   if(vct->getXleft_neighbor()==MPI_PROC_NULL && bcEMfaceXleft ==2) {
     for (int j=0; j < ny;j++)
@@ -3058,7 +3058,7 @@ void EMfields3D::BoundaryConditionsE(arr3_double& vectorX, arr3_double& vectorY,
 }
 
 /*! get Electric Field component X array cell without the ghost cells */
-void EMfields3D::getExc(arr3_double& arr, Grid3DCU *grid) {
+void EMfields3D::getExc(arr3_double arr, Grid3DCU *grid) {
 
   array3_double tmp(nxc,nyc,nzc);
   grid->interpN2C(tmp, Ex);
@@ -3069,7 +3069,7 @@ void EMfields3D::getExc(arr3_double& arr, Grid3DCU *grid) {
         arr[i-1][j-1][k-1]=tmp[i][j][k];
 }
 /*! get Electric Field component Y array cell without the ghost cells */
-void EMfields3D::getEyc(arr3_double& arr, Grid3DCU *grid) {
+void EMfields3D::getEyc(arr3_double arr, Grid3DCU *grid) {
 
   array3_double tmp(nxc,nyc,nzc);
   grid->interpN2C(tmp, Ey);
@@ -3080,7 +3080,7 @@ void EMfields3D::getEyc(arr3_double& arr, Grid3DCU *grid) {
         arr[i-1][j-1][k-1]=tmp[i][j][k];
 }
 /*! get Electric Field component Z array cell without the ghost cells */
-void EMfields3D::getEzc(arr3_double& arr, Grid3DCU *grid) {
+void EMfields3D::getEzc(arr3_double arr, Grid3DCU *grid) {
 
   array3_double tmp(nxc,nyc,nzc);
   grid->interpN2C(tmp, Ez);
@@ -3091,28 +3091,28 @@ void EMfields3D::getEzc(arr3_double& arr, Grid3DCU *grid) {
         arr[i-1][j-1][k-1]=tmp[i][j][k];
 }
 /*! get Magnetic Field component X array cell without the ghost cells */
-void EMfields3D::getBxc(arr3_double& arr) {
+void EMfields3D::getBxc(arr3_double arr) {
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=Bxc[i][j][k];
 }
 /*! get Magnetic Field component Y array cell without the ghost cells */
-void EMfields3D::getByc(arr3_double& arr) {
+void EMfields3D::getByc(arr3_double arr) {
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=Byc[i][j][k];
 }
 /*! get Magnetic Field component Z array cell without the ghost cells */
-void EMfields3D::getBzc(arr3_double& arr) {
+void EMfields3D::getBzc(arr3_double arr) {
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
         arr[i-1][j-1][k-1]=Bzc[i][j][k];
 }
 /*! get species density component X array cell without the ghost cells */
-void EMfields3D::getRHOcs(arr3_double& arr, Grid3DCU *grid, int is) {
+void EMfields3D::getRHOcs(arr3_double arr, Grid3DCU *grid, int is) {
 
   array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, rhons);
@@ -3124,7 +3124,7 @@ void EMfields3D::getRHOcs(arr3_double& arr, Grid3DCU *grid, int is) {
 }
 
 /*! get Magnetic Field component X array species is cell without the ghost cells */
-void EMfields3D::getJxsc(arr3_double& arr, Grid3DCU *grid, int is) {
+void EMfields3D::getJxsc(arr3_double arr, Grid3DCU *grid, int is) {
 
   array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, Jxs);
@@ -3136,7 +3136,7 @@ void EMfields3D::getJxsc(arr3_double& arr, Grid3DCU *grid, int is) {
 }
 
 /*! get current component Y array species is cell without the ghost cells */
-void EMfields3D::getJysc(arr3_double& arr, Grid3DCU *grid, int is) {
+void EMfields3D::getJysc(arr3_double arr, Grid3DCU *grid, int is) {
 
   array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, Jys);
@@ -3147,7 +3147,7 @@ void EMfields3D::getJysc(arr3_double& arr, Grid3DCU *grid, int is) {
         arr[i-1][j-1][k-1]=tmp[is][i][j][k];
 }
 /*! get current component Z array species is cell without the ghost cells */
-void EMfields3D::getJzsc(arr3_double& arr, Grid3DCU *grid, int is) {
+void EMfields3D::getJzsc(arr3_double arr, Grid3DCU *grid, int is) {
 
   array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, Jzs);
diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index a533f481..755f3096 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -88,7 +88,7 @@ void Grid3DCU::print(VirtualTopology3D * ptVCT) {
 }
 
 /** calculate gradient on nodes, given a scalar field defined on central points  */
-void Grid3DCU::gradC2N(arr3_double& gradXN, arr3_double& gradYN, arr3_double& gradZN, const_arr3_double& scFieldC) {
+void Grid3DCU::gradC2N(arr3_double gradXN, arr3_double gradYN, arr3_double gradZN, const_arr3_double scFieldC) {
   for (register int i = 1; i < nxn - 1; i++)
     for (register int j = 1; j < nyn - 1; j++)
       for (register int k = 1; k < nzn - 1; k++) {
@@ -99,7 +99,7 @@ void Grid3DCU::gradC2N(arr3_double& gradXN, arr3_double& gradYN, arr3_double& gr
 }
 
 /** calculate gradient on nodes, given a scalar field defined on central points  */
-void Grid3DCU::gradN2C(arr3_double& gradXC, arr3_double& gradYC, arr3_double& gradZC, const_arr3_double& scFieldN) {
+void Grid3DCU::gradN2C(arr3_double gradXC, arr3_double gradYC, arr3_double gradZC, const_arr3_double scFieldN) {
   for (register int i = 1; i < nxc - 1; i++)
     for (register int j = 1; j < nyc - 1; j++)
       for (register int k = 1; k < nzc - 1; k++) {
@@ -110,7 +110,7 @@ void Grid3DCU::gradN2C(arr3_double& gradXC, arr3_double& gradYC, arr3_double& gr
 }
 
 /** calculate divergence on central points, given a vector field defined on nodes  */
-void Grid3DCU::divN2C(arr3_double& divC, const_arr3_double& vecFieldXN, const_arr3_double& vecFieldYN, const_arr3_double& vecFieldZN) {
+void Grid3DCU::divN2C(arr3_double divC, const_arr3_double vecFieldXN, const_arr3_double vecFieldYN, const_arr3_double vecFieldZN) {
   double compX;
   double compY;
   double compZ;
@@ -125,7 +125,7 @@ void Grid3DCU::divN2C(arr3_double& divC, const_arr3_double& vecFieldXN, const_ar
 }
 
 /** calculate divergence on central points, given a Tensor field defined on nodes  */
-void Grid3DCU::divSymmTensorN2C(arr3_double& divCX, arr3_double& divCY, arr3_double& divCZ, const_arr4_double& pXX, const_arr4_double& pXY, const_arr4_double& pXZ, const_arr4_double& pYY, const_arr4_double& pYZ, const_arr4_double& pZZ, int ns) {
+void Grid3DCU::divSymmTensorN2C(arr3_double divCX, arr3_double divCY, arr3_double divCZ, const_arr4_double pXX, const_arr4_double pXY, const_arr4_double pXZ, const_arr4_double pYY, const_arr4_double pYZ, const_arr4_double pZZ, int ns) {
   double comp1X, comp2X, comp3X;
   double comp1Y, comp2Y, comp3Y;
   double comp1Z, comp2Z, comp3Z;
@@ -148,7 +148,7 @@ void Grid3DCU::divSymmTensorN2C(arr3_double& divCX, arr3_double& divCY, arr3_dou
 }
 
 /** calculate divergence on nodes, given a vector field defined on central points  */
-void Grid3DCU::divC2N(arr3_double& divN, const_arr3_double& vecFieldXC, const_arr3_double& vecFieldYC, const_arr3_double& vecFieldZC) {
+void Grid3DCU::divC2N(arr3_double divN, const_arr3_double vecFieldXC, const_arr3_double vecFieldYC, const_arr3_double vecFieldZC) {
   double compX;
   double compY;
   double compZ;
@@ -163,7 +163,7 @@ void Grid3DCU::divC2N(arr3_double& divN, const_arr3_double& vecFieldXC, const_ar
 }
 
 /** calculate curl on nodes, given a vector field defined on central points  */
-void Grid3DCU::curlC2N(arr3_double& curlXN, arr3_double& curlYN, arr3_double& curlZN, const_arr3_double& vecFieldXC, const_arr3_double& vecFieldYC, const_arr3_double& vecFieldZC) {
+void Grid3DCU::curlC2N(arr3_double curlXN, arr3_double curlYN, arr3_double curlZN, const_arr3_double vecFieldXC, const_arr3_double vecFieldYC, const_arr3_double vecFieldZC) {
   double compZDY, compYDZ;
   double compXDZ, compZDX;
   double compYDX, compXDY;
@@ -187,8 +187,8 @@ void Grid3DCU::curlC2N(arr3_double& curlXN, arr3_double& curlYN, arr3_double& cu
 }
 
 /** calculate curl on central points, given a vector field defined on nodes  */
-void Grid3DCU::curlN2C(arr3_double& curlXC, arr3_double& curlYC, arr3_double& curlZC,
-  const_arr3_double& vecFieldXN, const_arr3_double& vecFieldYN, const_arr3_double& vecFieldZN)
+void Grid3DCU::curlN2C(arr3_double curlXC, arr3_double curlYC, arr3_double curlZC,
+  const_arr3_double vecFieldXN, const_arr3_double vecFieldYN, const_arr3_double vecFieldZN)
 {
   double compZDY, compYDZ;
   double compXDZ, compZDX;
@@ -217,7 +217,7 @@ void Grid3DCU::curlN2C(arr3_double& curlXC, arr3_double& curlYC, arr3_double& cu
 }
 
 /** calculate laplacian on nodes, given a scalar field defined on nodes */
-void Grid3DCU::lapN2N(arr3_double& lapN, const_arr3_double& scFieldN, VirtualTopology3D * vct) {
+void Grid3DCU::lapN2N(arr3_double lapN, const_arr3_double scFieldN, VirtualTopology3D * vct) {
   // calculate laplacian as divercence of gradient
   // allocate 3 gradients: defined on central points
   array3_double gradXC(nxc, nyc, nzc);
@@ -233,7 +233,7 @@ void Grid3DCU::lapN2N(arr3_double& lapN, const_arr3_double& scFieldN, VirtualTop
 }
 
 /** calculate laplacian on central points, given a scalar field defined on central points */
-void Grid3DCU::lapC2C(arr3_double& lapC, const_arr3_double& scFieldC, VirtualTopology3D * vct) {
+void Grid3DCU::lapC2C(arr3_double lapC, const_arr3_double scFieldC, VirtualTopology3D * vct) {
   // calculate laplacian as divercence of gradient
   // allocate 3 gradients: defined on nodes
   array3_double gradXN(nxn, nyn, nzn);
@@ -269,7 +269,7 @@ void Grid3DCU::lapC2C(arr3_double& lapC, const_arr3_double& scFieldC, VirtualTop
 }
 
 /** calculate laplacian on central points, given a scalar field defined on central points for Poisson */
-void Grid3DCU::lapC2Cpoisson(arr3_double& lapC, arr3_double& scFieldC, VirtualTopology3D * vct) {
+void Grid3DCU::lapC2Cpoisson(arr3_double lapC, arr3_double scFieldC, VirtualTopology3D * vct) {
   // communicate first the scFieldC
   communicateCenterBoxStencilBC(nxc, nyc, nzc, scFieldC, 1, 1, 1, 1, 1, 1, vct);
   for (register int i = 1; i < nxc - 1; i++)
@@ -279,7 +279,7 @@ void Grid3DCU::lapC2Cpoisson(arr3_double& lapC, arr3_double& scFieldC, VirtualTo
 }
 
 /** calculate divergence on  boundaries */
-void Grid3DCU::divBCleft(arr3_double& divBC, const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ, int leftActiveNode, int dirDER) {
+void Grid3DCU::divBCleft(arr3_double divBC, const_arr3_double vectorX, const_arr3_double vectorY, const_arr3_double vectorZ, int leftActiveNode, int dirDER) {
   double compX, compY, compZ;
   switch (dirDER) {
     case 0:                    // DIVERGENCE DIRECTION X
@@ -316,7 +316,7 @@ void Grid3DCU::divBCleft(arr3_double& divBC, const_arr3_double& vectorX, const_a
 }
 
 /** calculate divergence on  boundaries */
-void Grid3DCU::divBCright(arr3_double& divBC, const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ, int rightActiveNode, int dirDER) {
+void Grid3DCU::divBCright(arr3_double divBC, const_arr3_double vectorX, const_arr3_double vectorY, const_arr3_double vectorZ, int rightActiveNode, int dirDER) {
   double compX, compY, compZ;
 
 
@@ -355,7 +355,7 @@ void Grid3DCU::divBCright(arr3_double& divBC, const_arr3_double& vectorX, const_
 }
 
 /** calculate derivative on left boundary */
-void Grid3DCU::derBC(arr3_double& derBC, const_arr3_double& vector, int leftActiveNode, int dirDER) {
+void Grid3DCU::derBC(arr3_double derBC, const_arr3_double vector, int leftActiveNode, int dirDER) {
   switch (dirDER) {
     case 0:                    // DERIVATIVE DIRECTION X
       for (register int j = 1; j < nyc - 1; j++)
@@ -378,7 +378,7 @@ void Grid3DCU::derBC(arr3_double& derBC, const_arr3_double& vector, int leftActi
 }
 
 /** interpolate on nodes from central points: do this for the magnetic field*/
-void Grid3DCU::interpC2N(arr3_double& vecFieldN, const_arr3_double& vecFieldC) {
+void Grid3DCU::interpC2N(arr3_double vecFieldN, const_arr3_double vecFieldC) {
   for (register int i = 1; i < nxn - 1; i++)
     for (register int j = 1; j < nyn - 1; j++)
       for (register int k = 1; k < nzn - 1; k++)
@@ -386,7 +386,7 @@ void Grid3DCU::interpC2N(arr3_double& vecFieldN, const_arr3_double& vecFieldC) {
 }
 
 /** interpolate on central points from nodes */
-void Grid3DCU::interpN2C(arr3_double& vecFieldC, const_arr3_double& vecFieldN) {
+void Grid3DCU::interpN2C(arr3_double vecFieldC, const_arr3_double vecFieldN) {
   for (register int i = 1; i < nxc - 1; i++)
     for (register int j = 1; j < nyc - 1; j++)
       for (register int k = 1; k < nzc - 1; k++)
@@ -394,7 +394,7 @@ void Grid3DCU::interpN2C(arr3_double& vecFieldC, const_arr3_double& vecFieldN) {
 }
 
 /** interpolate on central points from nodes */
-void Grid3DCU::interpN2C(arr4_double& vecFieldC, int ns, const_arr4_double& vecFieldN) {
+void Grid3DCU::interpN2C(arr4_double vecFieldC, int ns, const_arr4_double vecFieldN) {
   for (register int i = 1; i < nxc - 1; i++)
     for (register int j = 1; j < nyc - 1; j++)
       for (register int k = 1; k < nzc - 1; k++)
diff --git a/include/Basic.h b/include/Basic.h
index f444b216..2dd7da6c 100644
--- a/include/Basic.h
+++ b/include/Basic.h
@@ -64,7 +64,7 @@ inline double norm2(double **vect, int nx, int ny) {
   return (result);
 }
 /** method to calculate the square norm of a vector */
-inline double norm2(const arr3_double& vect, int nx, int ny) {
+inline double norm2(const arr3_double vect, int nx, int ny) {
   double result = 0;
   for (int i = 0; i < nx; i++)
     for (int j = 0; j < ny; j++)
@@ -82,7 +82,7 @@ inline double norm2(double *vect, int nx) {
 
 
 /** method to calculate the parallel dot product */
-inline double norm2P(const arr3_double& vect, int nx, int ny, int nz) {
+inline double norm2P(const arr3_double vect, int nx, int ny, int nz) {
   double result = 0;
   double local_result = 0;
   for (int i = 0; i < nx; i++)
@@ -128,7 +128,7 @@ inline void sum(double *vect1, double *vect2, int n) {
 
 }
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int nz) {
+inline void sum(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -136,14 +136,14 @@ inline void sum(arr3_double& vect1, const arr3_double& vect2, int nx, int ny, in
 }
 
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
+inline void sum(arr3_double vect1, const arr3_double vect2, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect1.fetch(i,j,0) += vect2.get(i,j,0);
 }
 
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(arr3_double& vect1, const arr4_double& vect2, int nx, int ny, int nz, int ns) {
+inline void sum(arr3_double vect1, const arr4_double vect2, int nx, int ny, int nz, int ns) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -151,13 +151,13 @@ inline void sum(arr3_double& vect1, const arr4_double& vect2, int nx, int ny, in
 }
 
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(arr3_double& vect1, const arr4_double& vect2, int nx, int ny, int ns) {
+inline void sum(arr3_double vect1, const arr4_double vect2, int nx, int ny, int ns) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect1.fetch(i,j,0) += vect2.get(ns,i,j,0);
 }
 /** method to calculate the subtraction of two vectors vector1 = vector1 - vector2*/
-inline void sub(arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int nz) {
+inline void sub(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -165,7 +165,7 @@ inline void sub(arr3_double& vect1, const arr3_double& vect2, int nx, int ny, in
 }
 
 /** method to calculate the subtraction of two vectors vector1 = vector1 - vector2*/
-inline void sub(arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
+inline void sub(arr3_double vect1, const arr3_double vect2, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect1.fetch(i,j,0) -= vect2.get(i,j,0);
@@ -173,7 +173,7 @@ inline void sub(arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
 
 
 /** method to sum 4 vectors vector1 = alfa*vector1 + beta*vector2 + gamma*vector3 + delta*vector4 */
-inline void sum4(arr3_double& vect1, double alfa, const arr3_double& vect2, double beta, const arr3_double& vect3, double gamma, const arr3_double& vect4, double delta, const arr3_double& vect5, int nx, int ny, int nz) {
+inline void sum4(arr3_double vect1, double alfa, const arr3_double vect2, double beta, const arr3_double vect3, double gamma, const arr3_double vect4, double delta, const arr3_double vect5, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -187,7 +187,7 @@ inline void scale(double *vect, double alfa, int n) {
 }
 
 /** method to calculate the scalar-vector product */
-inline void scale(arr3_double& vect, double alfa, int nx, int ny) {
+inline void scale(arr3_double vect, double alfa, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect.fetch(i,j,0) *= alfa;
@@ -195,7 +195,7 @@ inline void scale(arr3_double& vect, double alfa, int nx, int ny) {
 
 
 /** method to calculate the scalar-vector product */
-inline void scale(arr3_double& vect, double alfa, int nx, int ny, int nz) {
+inline void scale(arr3_double vect, double alfa, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -209,7 +209,7 @@ inline void scale(double vect[][2][2], double alfa, int nx, int ny, int nz) {
         vect[i][j][k] *= alfa;
 }
 /** method to calculate the scalar-vector product */
-inline void scale(arr3_double& vect1, const arr3_double& vect2, double alfa, int nx, int ny, int nz) {
+inline void scale(arr3_double vect1, const arr3_double vect2, double alfa, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -217,7 +217,7 @@ inline void scale(arr3_double& vect1, const arr3_double& vect2, double alfa, int
 }
 
 /** method to calculate the scalar-vector product */
-inline void scale(arr3_double& vect1, const arr3_double& vect2, double alfa, int nx, int ny) {
+inline void scale(arr3_double vect1, const arr3_double vect2, double alfa, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect1.fetch(i,j,0) = vect2.get(i,j,0) * alfa;
@@ -230,7 +230,7 @@ inline void scale(double *vect1, double *vect2, double alfa, int n) {
 }
 
 /** method to calculate vector1 = vector1 + alfa*vector2   */
-inline void addscale(double alfa, arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int nz) {
+inline void addscale(double alfa, arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -245,7 +245,7 @@ inline void addscale(double alfa, double vect1[][2][2], double vect2[][2][2], in
 
 }
 /** method to calculate vector1 = vector1 + alfa*vector2   */
-inline void addscale(double alfa, arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
+inline void addscale(double alfa, arr3_double vect1, const arr3_double vect2, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect1.fetch(i,j,0) += alfa * vect2.get(i,j,0);
@@ -263,7 +263,7 @@ inline void addscale(double alfa, double beta, double *vect1, double *vect2, int
 
 }
 /** method to calculate vector1 = beta*vector1 + alfa*vector2 */
-inline void addscale(double alfa, double beta, arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int nz) {
+inline void addscale(double alfa, double beta, arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
 
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
@@ -273,7 +273,7 @@ inline void addscale(double alfa, double beta, arr3_double& vect1, const arr3_do
 
 }
 /** method to calculate vector1 = beta*vector1 + alfa*vector2 */
-inline void addscale(double alfa, double beta, arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
+inline void addscale(double alfa, double beta, arr3_double vect1, const arr3_double vect2, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect1.fetch(i,j,0) = beta * vect1.get(i,j,0) + alfa * vect2.get(i,j,0);
@@ -282,21 +282,21 @@ inline void addscale(double alfa, double beta, arr3_double& vect1, const arr3_do
 
 
 /** method to calculate vector1 = alfa*vector2 + beta*vector3 */
-inline void scaleandsum(arr3_double& vect1, double alfa, double beta, const arr3_double& vect2, const arr3_double& vect3, int nx, int ny, int nz) {
+inline void scaleandsum(arr3_double vect1, double alfa, double beta, const arr3_double vect2, const arr3_double vect3, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
         vect1.fetch(i,j,k) = alfa * vect2.get(i,j,k) + beta * vect3.get(i,j,k);
 }
 /** method to calculate vector1 = alfa*vector2 + beta*vector3 with vector2 depending on species*/
-inline void scaleandsum(arr3_double& vect1, double alfa, double beta, const arr4_double& vect2, const arr3_double& vect3, int ns, int nx, int ny, int nz) {
+inline void scaleandsum(arr3_double vect1, double alfa, double beta, const arr4_double vect2, const arr3_double vect3, int ns, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
         vect1.fetch(i,j,k) = alfa * vect2.get(ns,i,j,k) + beta * vect3.get(i,j,k);
 }
 /** method to calculate vector1 = alfa*vector2*vector3 with vector2 depending on species*/
-inline void prod(arr3_double& vect1, double alfa, const arr4_double& vect2, int ns, const arr3_double& vect3, int nx, int ny, int nz) {
+inline void prod(arr3_double vect1, double alfa, const arr4_double vect2, int ns, const arr3_double vect3, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -304,21 +304,21 @@ inline void prod(arr3_double& vect1, double alfa, const arr4_double& vect2, int
 
 }
 /** method to calculate vect1 = vect2/alfa */
-inline void div(arr3_double& vect1, double alfa, const arr3_double& vect2, int nx, int ny, int nz) {
+inline void div(arr3_double vect1, double alfa, const arr3_double vect2, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
         vect1.fetch(i,j,k) = vect2.get(i,j,k) / alfa;
 
 }
-inline void prod6(arr3_double& vect1, const arr3_double& vect2, const arr3_double& vect3, const arr3_double& vect4, const arr3_double& vect5, const arr3_double& vect6, const arr3_double& vect7, int nx, int ny, int nz) {
+inline void prod6(arr3_double vect1, const arr3_double vect2, const arr3_double vect3, const arr3_double vect4, const arr3_double vect5, const arr3_double vect6, const arr3_double vect7, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
         vect1.fetch(i,j,k) = vect2.get(i,j,k) * vect3.get(i,j,k) + vect4.get(i,j,k) * vect5.get(i,j,k) + vect6.get(i,j,k) * vect7.get(i,j,k);
 }
 /** method used for calculating PI */
-inline void proddiv(arr3_double& vect1, const arr3_double& vect2, double alfa, const arr3_double& vect3, const arr3_double& vect4, const arr3_double& vect5, const arr3_double& vect6, double beta, const arr3_double& vect7, const arr3_double& vect8, double gamma, const arr3_double& vect9, int nx, int ny, int nz) {
+inline void proddiv(arr3_double vect1, const arr3_double vect2, double alfa, const arr3_double vect3, const arr3_double vect4, const arr3_double vect5, const arr3_double vect6, double beta, const arr3_double vect7, const arr3_double vect8, double gamma, const arr3_double vect9, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -328,7 +328,7 @@ inline void proddiv(arr3_double& vect1, const arr3_double& vect2, double alfa, c
   // ***vect1++ = (***vect2++ + alfa*((***vect3++)*(***vect4++) - (***vect5++)*(***vect6++)) + beta*(***vect7++)*(***vect8++))/(1+gamma*(***vect9++));
 }
 /** method to calculate the opposite of a vector */
-inline void neg(arr3_double& vect, int nx, int ny, int nz) {
+inline void neg(arr3_double vect, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -336,13 +336,13 @@ inline void neg(arr3_double& vect, int nx, int ny, int nz) {
 }
 
 /** method to calculate the opposite of a vector */
-inline void neg(arr3_double& vect, int nx, int ny) {
+inline void neg(arr3_double vect, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect.fetch(i,j,0) = -vect.get(i,j,0);
 }
 /** method to calculate the opposite of a vector */
-inline void neg(arr3_double& vect, int nx) {
+inline void neg(arr3_double vect, int nx) {
   for (register int i = 0; i < nx; i++)
     vect.fetch(i,0,0) = -vect.get(i,0,0);
 }
@@ -354,7 +354,7 @@ inline void neg(double *vect, int n) {
 
 }
 /** method to set equal two vectors */
-inline void eq(arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int nz) {
+inline void eq(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -362,7 +362,7 @@ inline void eq(arr3_double& vect1, const arr3_double& vect2, int nx, int ny, int
 
 }
 /** method to set equal two vectors */
-inline void eq(arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
+inline void eq(arr3_double vect1, const arr3_double vect2, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect1.fetch(i,j,0) = vect2.get(i,j,0);
@@ -370,14 +370,14 @@ inline void eq(arr3_double& vect1, const arr3_double& vect2, int nx, int ny) {
 }
 
 /** method to set equal two vectors */
-inline void eq(arr4_double& vect1, const arr3_double& vect2, int nx, int ny, int is) {
+inline void eq(arr4_double vect1, const arr3_double vect2, int nx, int ny, int is) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect1.fetch(is,i,j,0) = vect2.get(i,j,0);
 
 }
 /** method to set equal two vectors */
-inline void eq(arr4_double& vect1, const arr3_double& vect2, int nx, int ny, int nz, int is) {
+inline void eq(arr4_double vect1, const arr3_double vect2, int nx, int ny, int nz, int is) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -390,7 +390,7 @@ inline void eq(double *vect1, double *vect2, int n) {
     vect1[i] = vect2[i];
 }
 /** method to set a vector to a Value */
-inline void eqValue(double value, arr3_double& vect, int nx, int ny, int nz) {
+inline void eqValue(double value, arr3_double vect, int nx, int ny, int nz) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++)
@@ -405,14 +405,14 @@ inline void eqValue(double value, double vect[][2][2], int nx, int ny, int nz) {
 
 }
 /** method to set a vector to a Value */
-inline void eqValue(double value, arr3_double& vect, int nx, int ny) {
+inline void eqValue(double value, arr3_double vect, int nx, int ny) {
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       vect.fetch(i,j,0) = value;
 
 }
 /** method to set a vector to a Value */
-inline void eqValue(double value, arr3_double& vect, int nx) {
+inline void eqValue(double value, arr3_double vect, int nx) {
   for (register int i = 0; i < nx; i++)
     vect.fetch(i,0,0) = value;
 
diff --git a/include/ComNodes3D.h b/include/ComNodes3D.h
index 7360a02e..c7e86731 100644
--- a/include/ComNodes3D.h
+++ b/include/ComNodes3D.h
@@ -20,45 +20,45 @@ developers           : Stefano Markidis, Giovanni Lapenta
 #include "BcFields3D.h"
 
 /** communicate ghost cells (FOR NODES) */
-void communicateNode(int nx, int ny, int nz, arr3_double& vector, VirtualTopology3D * vct);
+void communicateNode(int nx, int ny, int nz, arr3_double vector, VirtualTopology3D * vct);
 
 /** communicate ghost cells (FOR NODES) */
-void communicateNodeBC(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateNodeBC(int nx, int ny, int nz, arr3_double vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 /** communicate ghost cells (FOR NODES) with particles BC*/
-void communicateNodeBC_P(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateNodeBC_P(int nx, int ny, int nz, arr3_double vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 /** SPECIES: communicate ghost cells */
-void communicateNode(int nx, int ny, int nz, arr4_double& vector, int ns, VirtualTopology3D * vct);
+void communicateNode(int nx, int ny, int nz, arr4_double vector, int ns, VirtualTopology3D * vct);
 
 // PARTICLES
 /** SPECIES: communicate ghost cells */
-void communicateNode_P(int nx, int ny, int nz, arr4_double& vector, int ns, VirtualTopology3D * vct);
+void communicateNode_P(int nx, int ny, int nz, arr4_double vector, int ns, VirtualTopology3D * vct);
 
 // 
 /** communicate ghost cells (FOR CENTERS) */
-void communicateCenter(int nx, int ny, int nz, arr3_double& vector, VirtualTopology3D * vct);
+void communicateCenter(int nx, int ny, int nz, arr3_double vector, VirtualTopology3D * vct);
 
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
-void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 // particles
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
-void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 // 
 
-void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
-void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 /** SPECIES: communicate ghost cells */
-void communicateCenter(int nx, int ny, int nz, arr4_double& vector, int ns, VirtualTopology3D * vct);
+void communicateCenter(int nx, int ny, int nz, arr4_double vector, int ns, VirtualTopology3D * vct);
 
 // /////////// communication + BC ////////////////////////////
-void communicateCenterBC(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateCenterBC(int nx, int ny, int nz, arr3_double vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 // /////////// communication + BC ////////////////////////////
-void communicateCenterBC_P(int nx, int ny, int nz, arr3_double& vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
+void communicateCenterBC_P(int nx, int ny, int nz, arr3_double vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct);
 
 #endif
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 9a086d1f..335a45ea 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -91,11 +91,11 @@ class EMfields3D                // :public Field
     void fixBforcefree(Grid * grid, VirtualTopology3D * vct);
 
     /*! Calculate the three components of Pi(implicit pressure) cross image vector */
-    void PIdot(arr3_double& PIdotX, arr3_double& PIdotY, arr3_double& PIdotZ,
-      const_arr3_double& vectX, const_arr3_double& vectY, const_arr3_double& vectZ, int ns, Grid * grid);
+    void PIdot(arr3_double PIdotX, arr3_double PIdotY, arr3_double PIdotZ,
+      const_arr3_double vectX, const_arr3_double vectY, const_arr3_double vectZ, int ns, Grid * grid);
     /*! Calculate the three components of mu (implicit permeattivity) cross image vector */
-    void MUdot(arr3_double& MUdotX, arr3_double& MUdotY, arr3_double& MUdotZ,
-      const_arr3_double& vectX, const_arr3_double& vectY, const_arr3_double& vectZ, Grid * grid);
+    void MUdot(arr3_double MUdotX, arr3_double MUdotY, arr3_double MUdotZ,
+      const_arr3_double vectX, const_arr3_double vectY, const_arr3_double vectZ, Grid * grid);
     /*! Calculate rho hat, Jx hat, Jy hat, Jz hat */
     void calculateHatFunctions(Grid * grid, VirtualTopology3D * vct);
 
@@ -109,9 +109,9 @@ class EMfields3D                // :public Field
     /*! Sum current over different species */
     void sumOverSpeciesJ();
     /*! Smoothing after the interpolation* */
-    void smooth(double value, arr3_double& vector, int type, Grid * grid, VirtualTopology3D * vct);
+    void smooth(double value, arr3_double vector, int type, Grid * grid, VirtualTopology3D * vct);
     /*! SPECIES: Smoothing after the interpolation for species fields* */
-    void smooth(double value, arr4_double& vector, int is, int type, Grid * grid, VirtualTopology3D * vct);
+    void smooth(double value, arr4_double vector, int is, int type, Grid * grid, VirtualTopology3D * vct);
     /*! smooth the electric field */
     void smoothE(double value, VirtualTopology3D * vct, Collective *col);
 
@@ -147,20 +147,20 @@ class EMfields3D                // :public Field
 
 
     /*! Perfect conductor boundary conditions LEFT wall */
-    void perfectConductorLeft(arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
-      const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ,
+    void perfectConductorLeft(arr3_double imageX, arr3_double imageY, arr3_double imageZ,
+      const_arr3_double vectorX, const_arr3_double vectorY, const_arr3_double vectorZ,
       int dir, Grid * grid);
     /*! Perfect conductor boundary conditions RIGHT wall */
     void perfectConductorRight(
-      arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
-      const_arr3_double& vectorX,
-      const_arr3_double& vectorY,
-      const_arr3_double& vectorZ,
+      arr3_double imageX, arr3_double imageY, arr3_double imageZ,
+      const_arr3_double vectorX,
+      const_arr3_double vectorY,
+      const_arr3_double vectorZ,
       int dir, Grid * grid);
     /*! Perfect conductor boundary conditions for source LEFT wall */
-    void perfectConductorLeftS(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ, int dir);
+    void perfectConductorLeftS(arr3_double vectorX, arr3_double vectorY, arr3_double vectorZ, int dir);
     /*! Perfect conductor boundary conditions for source RIGHT wall */
-    void perfectConductorRightS(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ, int dir);
+    void perfectConductorRightS(arr3_double vectorX, arr3_double vectorY, arr3_double vectorZ, int dir);
 
     /*! Calculate the sysceptibility tensor on the boundary */
     void sustensorRightX(double **susxx, double **susyx, double **suszx);
@@ -193,12 +193,12 @@ class EMfields3D                // :public Field
 
     // field components without ghost cells
     //
-    void getExc(arr3_double& arr, Grid3DCU *grid);
-    void getEyc(arr3_double& arr, Grid3DCU *grid);
-    void getEzc(arr3_double& arr, Grid3DCU *grid);
-    void getBxc(arr3_double& arr);
-    void getByc(arr3_double& arr);
-    void getBzc(arr3_double& arr);
+    void getExc(arr3_double arr, Grid3DCU *grid);
+    void getEyc(arr3_double arr, Grid3DCU *grid);
+    void getEzc(arr3_double arr, Grid3DCU *grid);
+    void getBxc(arr3_double arr);
+    void getByc(arr3_double arr);
+    void getBzc(arr3_double arr);
 
     arr3_double getRHOc() { return rhoc; }
     arr3_double getRHOn() { return rhon; }
@@ -211,7 +211,7 @@ class EMfields3D                // :public Field
     double getRHOns(int X,int Y,int Z,int is)const{return rhons.get(is,X,Y,Z);}
     arr4_double getRHOns(){return rhons;}
     /* density on cells without ghost cells */
-    void getRHOcs(arr3_double& arr, Grid3DCU *grid, int is);
+    void getRHOcs(arr3_double arr, Grid3DCU *grid, int is);
 
     double getBx_ext(int X, int Y, int Z) const{return Bx_ext.get(X,Y,Z);}
     double getBy_ext(int X, int Y, int Z) const{return By_ext.get(X,Y,Z);}
@@ -246,9 +246,9 @@ class EMfields3D                // :public Field
 
     // get current for species in all cells except ghost
     //
-    void getJxsc(arr3_double& arr, Grid3DCU *grid, int is);
-    void getJysc(arr3_double& arr, Grid3DCU *grid, int is);
-    void getJzsc(arr3_double& arr, Grid3DCU *grid, int is);
+    void getJxsc(arr3_double arr, Grid3DCU *grid, int is);
+    void getJysc(arr3_double arr, Grid3DCU *grid, int is);
+    void getJzsc(arr3_double arr, Grid3DCU *grid, int is);
 
     /*! get the electric field energy */
     double getEenergy();
@@ -497,12 +497,12 @@ class EMfields3D                // :public Field
     injInfoFields* get_InfoFieldsRear();
     injInfoFields* get_InfoFieldsRight();
 
-    void BoundaryConditionsB(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ,
+    void BoundaryConditionsB(arr3_double vectorX, arr3_double vectorY, arr3_double vectorZ,
       int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct);
-    void BoundaryConditionsE(arr3_double& vectorX, arr3_double& vectorY, arr3_double& vectorZ,
+    void BoundaryConditionsE(arr3_double vectorX, arr3_double vectorY, arr3_double vectorZ,
       int nx, int ny, int nz,Grid *grid, VirtualTopology3D *vct);
-    void BoundaryConditionsEImage(arr3_double& imageX, arr3_double& imageY, arr3_double& imageZ,
-      const_arr3_double& vectorX, const_arr3_double& vectorY, const_arr3_double& vectorZ,
+    void BoundaryConditionsEImage(arr3_double imageX, arr3_double imageY, arr3_double imageZ,
+      const_arr3_double vectorX, const_arr3_double vectorY, const_arr3_double vectorZ,
       int nx, int ny, int nz, VirtualTopology3D *vct,Grid *grid);
 };
 
diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index c8cd883d..2754ec95 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -42,76 +42,76 @@ class Grid3DCU                  // :public Grid
   /** print grid info */
   void print(VirtualTopology3D * ptVCT);
   /** calculate a derivative along a direction on nodes */
-  void derivN(arr3_double& derN,
-    const_arr4_double& scFieldC, int ns, int dir);
+  void derivN(arr3_double derN,
+    const_arr4_double scFieldC, int ns, int dir);
   /** calculate gradient on nodes, given a scalar field defined on central points  */
-  void gradC2N(arr3_double& gradXN, arr3_double& gradYN, arr3_double& gradZN,
-    const_arr3_double& scFieldC);
+  void gradC2N(arr3_double gradXN, arr3_double gradYN, arr3_double gradZN,
+    const_arr3_double scFieldC);
   /** calculate gradient on nodes, given a scalar field defined on central points  */
-  void gradN2C(arr3_double& gradXC, arr3_double& gradYC, arr3_double& gradZC,
-    const_arr3_double& scFieldN);
+  void gradN2C(arr3_double gradXC, arr3_double gradYC, arr3_double gradZC,
+    const_arr3_double scFieldN);
   /** calculate divergence on central points, given a vector field defined on nodes  */
-  void divN2C(arr3_double& divC,
-    const_arr3_double& vecFieldXN,
-    const_arr3_double& vecFieldYN,
-    const_arr3_double& vecFieldZN);
+  void divN2C(arr3_double divC,
+    const_arr3_double vecFieldXN,
+    const_arr3_double vecFieldYN,
+    const_arr3_double vecFieldZN);
   /** calculate divergence on nodes, given a vector field defined on central points  */
-  void divC2N(arr3_double& divN,
-    const_arr3_double& vecFieldXC,
-    const_arr3_double& vecFieldYC,
-    const_arr3_double& vecFieldZC);
+  void divC2N(arr3_double divN,
+    const_arr3_double vecFieldXC,
+    const_arr3_double vecFieldYC,
+    const_arr3_double vecFieldZC);
   /** calculate curl on nodes, given a vector field defined on central points  */
-  void curlC2N(arr3_double& curlXN, arr3_double& curlYN,
-    arr3_double& curlZN,
-    const_arr3_double& vecFieldXC,
-    const_arr3_double& vecFieldYC,
-    const_arr3_double& vecFieldZC);
+  void curlC2N(arr3_double curlXN, arr3_double curlYN,
+    arr3_double curlZN,
+    const_arr3_double vecFieldXC,
+    const_arr3_double vecFieldYC,
+    const_arr3_double vecFieldZC);
   /** calculate curl on central points, given a vector field defined on nodes  */
-  void curlN2C(arr3_double& curlXC, arr3_double& curlYC, arr3_double& curlZC,
-    const_arr3_double& vecFieldXN,
-    const_arr3_double& vecFieldYN,
-    const_arr3_double& vecFieldZN);
+  void curlN2C(arr3_double curlXC, arr3_double curlYC, arr3_double curlZC,
+    const_arr3_double vecFieldXN,
+    const_arr3_double vecFieldYN,
+    const_arr3_double vecFieldZN);
 
   /** calculate divergence on central points, given a Tensor field defined on nodes  */
-  void divSymmTensorN2C(arr3_double& divCX, arr3_double& divCY, arr3_double& divCZ,
-    const_arr4_double& pXX,
-    const_arr4_double& pXY,
-    const_arr4_double& pXZ,
-    const_arr4_double& pYY,
-    const_arr4_double& pYZ,
-    const_arr4_double& pZZ, int ns);
+  void divSymmTensorN2C(arr3_double divCX, arr3_double divCY, arr3_double divCZ,
+    const_arr4_double pXX,
+    const_arr4_double pXY,
+    const_arr4_double pXZ,
+    const_arr4_double pYY,
+    const_arr4_double pYZ,
+    const_arr4_double pZZ, int ns);
 
   /** calculate laplacian on nodes, given a scalar field defined on nodes */
-  void lapN2N(arr3_double& lapN,
-    const_arr3_double& scFieldN, VirtualTopology3D * vct);
+  void lapN2N(arr3_double lapN,
+    const_arr3_double scFieldN, VirtualTopology3D * vct);
   /** calculate laplacian on central points, given a scalar field defined on central points for Poisson */
-  void lapC2Cpoisson(arr3_double& lapC,
-    arr3_double& scFieldC, VirtualTopology3D * vct);
+  void lapC2Cpoisson(arr3_double lapC,
+    arr3_double scFieldC, VirtualTopology3D * vct);
   /** calculate laplacian on central points, given a scalar field defined on central points */
-  void lapC2C(arr3_double& lapC,
-    const_arr3_double& scFieldC, VirtualTopology3D * vct);
+  void lapC2C(arr3_double lapC,
+    const_arr3_double scFieldC, VirtualTopology3D * vct);
 
   /** calculate divergence on boundaries */
-  void divBCleft(arr3_double& divBC,
-    const_arr3_double& vectorX,
-    const_arr3_double& vectorY,
-    const_arr3_double& vectorZ, int leftActiveNode, int dirDER);
+  void divBCleft(arr3_double divBC,
+    const_arr3_double vectorX,
+    const_arr3_double vectorY,
+    const_arr3_double vectorZ, int leftActiveNode, int dirDER);
   /** calculate divergence on boundaries */
-  void divBCright(arr3_double& divBC,
-    const_arr3_double& vectorX,
-    const_arr3_double& vectorY,
-    const_arr3_double& vectorZ, int rightActiveNode, int dirDER);
+  void divBCright(arr3_double divBC,
+    const_arr3_double vectorX,
+    const_arr3_double vectorY,
+    const_arr3_double vectorZ, int rightActiveNode, int dirDER);
   /** calculate derivative on boundaries */
-  void derBC(arr3_double& derBC,
-    const_arr3_double& vector, int leftActiveNode, int dirDER);
+  void derBC(arr3_double derBC,
+    const_arr3_double vector, int leftActiveNode, int dirDER);
 
 
   /** interpolate on nodes from central points */
-  void interpC2N(arr3_double& vecFieldN, const_arr3_double& vecFieldC);
+  void interpC2N(arr3_double vecFieldN, const_arr3_double vecFieldC);
   /** interpolate on central points from nodes */
-  void interpN2C(arr3_double& vecFieldC, const_arr3_double& vecFieldN);
+  void interpN2C(arr3_double vecFieldC, const_arr3_double vecFieldN);
   /** interpolate on central points from nodes */
-  void interpN2C(arr4_double& vecFieldC, int ns, const_arr4_double& vecFieldN);
+  void interpN2C(arr4_double vecFieldC, int ns, const_arr4_double vecFieldN);
 
   // /////////// PRIVATE VARIABLES //////////////
 private:
diff --git a/include/Moments.h b/include/Moments.h
index 981ee15b..fd28e169 100644
--- a/include/Moments.h
+++ b/include/Moments.h
@@ -39,16 +39,16 @@ class Moments {
     double get_pYZ(int i, int j, int k) const { return pYZ.get(i,j,k); }
     double get_pZZ(int i, int j, int k) const { return pZZ.get(i,j,k); }
     // fetch accessors (write access)
-    arr3_double& fetch_rho() { return rho; }
-    arr3_double& fetch_Jx () { return Jx ; }
-    arr3_double& fetch_Jy () { return Jy ; }
-    arr3_double& fetch_Jz () { return Jz ; }
-    arr3_double& fetch_Pxx() { return pXX; }
-    arr3_double& fetch_Pxy() { return pXY; }
-    arr3_double& fetch_Pxz() { return pXZ; }
-    arr3_double& fetch_Pyy() { return pYY; }
-    arr3_double& fetch_Pyz() { return pYZ; }
-    arr3_double& fetch_Pzz() { return pZZ; }
+    arr3_double fetch_rho() { return rho; }
+    arr3_double fetch_Jx () { return Jx ; }
+    arr3_double fetch_Jy () { return Jy ; }
+    arr3_double fetch_Jz () { return Jz ; }
+    arr3_double fetch_Pxx() { return pXX; }
+    arr3_double fetch_Pxy() { return pXY; }
+    arr3_double fetch_Pxz() { return pXZ; }
+    arr3_double fetch_Pyy() { return pYY; }
+    arr3_double fetch_Pyz() { return pYZ; }
+    arr3_double fetch_Pzz() { return pZZ; }
   public:
     Moments(int nxn, int nyn, int nzn) :
       nx(nxn),
diff --git a/include/TransArraySpace3D.h b/include/TransArraySpace3D.h
index c83710f3..c05a1bdf 100644
--- a/include/TransArraySpace3D.h
+++ b/include/TransArraySpace3D.h
@@ -11,7 +11,7 @@ developers           : Stefano Markidis, Giovanni Lapenta
 #define TransArraySpace3D_H
 
 /** method to convert a 1D field in a 3D field not considering guard cells*/
-inline void solver2phys(arr3_double& vectPhys, double *vectSolver, int nx, int ny, int nz) {
+inline void solver2phys(arr3_double vectPhys, double *vectSolver, int nx, int ny, int nz) {
   for (register int i = 1; i < nx - 1; i++)
     for (register int j = 1; j < ny - 1; j++)
       for (register int k = 1; k < nz - 1; k++)
@@ -19,7 +19,7 @@ inline void solver2phys(arr3_double& vectPhys, double *vectSolver, int nx, int n
 
 }
 /** method to convert a 1D field in a 3D field not considering guard cells*/
-inline void solver2phys(arr3_double& vectPhys1, arr3_double& vectPhys2, arr3_double& vectPhys3, double *vectSolver, int nx, int ny, int nz) {
+inline void solver2phys(arr3_double vectPhys1, arr3_double vectPhys2, arr3_double vectPhys3, double *vectSolver, int nx, int ny, int nz) {
   for (register int i = 1; i < nx - 1; i++)
     for (register int j = 1; j < ny - 1; j++)
       for (register int k = 1; k < nz - 1; k++) {
@@ -29,14 +29,14 @@ inline void solver2phys(arr3_double& vectPhys1, arr3_double& vectPhys2, arr3_dou
       }
 }
 /** method to convert a 3D field in a 1D field not considering guard cells*/
-inline void phys2solver(double *vectSolver, const arr3_double& vectPhys, int nx, int ny, int nz) {
+inline void phys2solver(double *vectSolver, const arr3_double vectPhys, int nx, int ny, int nz) {
   for (register int i = 1; i < nx - 1; i++)
     for (register int j = 1; j < ny - 1; j++)
       for (register int k = 1; k < nz - 1; k++)
         *vectSolver++ = vectPhys.get(i,j,k);
 }
 /** method to convert a 3D field in a 1D field not considering guard cells*/
-inline void phys2solver(double *vectSolver, const arr3_double& vectPhys1, const arr3_double& vectPhys2, const arr3_double& vectPhys3, int nx, int ny, int nz) {
+inline void phys2solver(double *vectSolver, const arr3_double vectPhys1, const arr3_double vectPhys2, const arr3_double vectPhys3, int nx, int ny, int nz) {
   for (register int i = 1; i < nx - 1; i++)
     for (register int j = 1; j < ny - 1; j++)
       for (register int k = 1; k < nz - 1; k++) {
diff --git a/include/phdf5.h b/include/phdf5.h
index 39dba569..462fe746 100644
--- a/include/phdf5.h
+++ b/include/phdf5.h
@@ -22,9 +22,9 @@ class PHDF5fileClass{
     void CreatePHDF5file(double *L, int *dglob, int *dlocl, bool bp);
     void ClosePHDF5file();
     void OpenPHDF5file();
-    void ReadPHDF5dataset_double(string dataset, arr3_double& data);
+    void ReadPHDF5dataset_double(string dataset, arr3_double data);
     void ReadPHDF5param();
-    int  WritePHDF5dataset(string grpname, string datasetname, const_arr3_double& data, int nx, int ny, int nz);
+    int  WritePHDF5dataset(string grpname, string datasetname, const_arr3_double data, int nx, int ny, int nz);
 
     int  getPHDF5ndim();
     int  getPHDF5ncx();
diff --git a/inputoutput/phdf5.cpp b/inputoutput/phdf5.cpp
index 5b8368ab..1496aa59 100644
--- a/inputoutput/phdf5.cpp
+++ b/inputoutput/phdf5.cpp
@@ -112,7 +112,7 @@ void PHDF5fileClass::ClosePHDF5file(){
 
 }
 
-int PHDF5fileClass::WritePHDF5dataset(string grpname, string datasetname, const_arr3_double& data, int nx, int ny, int nz){
+int PHDF5fileClass::WritePHDF5dataset(string grpname, string datasetname, const_arr3_double data, int nx, int ny, int nz){
 
   /* -------------------------- */
   /* Local variables and arrays */
@@ -266,7 +266,7 @@ void PHDF5fileClass::ReadPHDF5param(){
 
 }
 
-void PHDF5fileClass::ReadPHDF5dataset_double(string datasetname, arr3_double& data){
+void PHDF5fileClass::ReadPHDF5dataset_double(string datasetname, arr3_double data){
 
   herr_t  status;
   double *filedata;

From 967631ae81cd5c9f25bdcc23f73628af92ea427c Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 9 Aug 2013 17:56:18 +0200
Subject: [PATCH 029/118] committing settings for large number of particles
 (for MIC)

---
 inputfiles/GEM.inp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/inputfiles/GEM.inp b/inputfiles/GEM.inp
index b268cac7..ba6173c2 100644
--- a/inputfiles/GEM.inp
+++ b/inputfiles/GEM.inp
@@ -47,8 +47,8 @@ y_center =   1.                  # Ly = simulation box length - y direction in m
 z_center =   1.                  # Lz = simulation box length - z direction in m  
 L_square =   .1
 
-nxc =  64                   # nxc = number of cells - x direction        
-nyc =  64                   # nyc = number of cells - y direction
+nxc = 128                   # nxc = number of cells - x direction        
+nyc = 128                   # nyc = number of cells - y direction
 nzc =  1                   # nzc = number of cells - z direction        
 
 # %%%%%%%%%%%%%% PARTICLES %%%%%%%%%%%%%%%%%
@@ -64,11 +64,11 @@ rhoINJECT =  1.0   1.0
 # TrackParticleID[species] = 1=true, 0=false --> Assign ID to particles 
 TrackParticleID = 0	0
 # npcelx = number of particles per cell - Direction X 
-npcelx =   3	3
+npcelx =   8	8
 # npcely = number of particles per cell - Direction Y 
-npcely =   3	3
+npcely =   8	8
 # npcelz = number of particles per cell - Direction Z 
-npcelz =   3	3
+npcelz =   8	8
 # qom = charge to mass ratio for different species 
 qom =  -256.0	1.0
 # uth = thermal velocity for different species - Direction X  

From dc4a2dcb81a21b44de4b6eddb365cc74bce7b06c Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 12 Aug 2013 15:16:36 +0200
Subject: [PATCH 030/118] using arr4_double(nxn,nyn,nzn,10) instance to sum
 moments

---
 fields/EMfields3D.cpp | 339 ++++++++++++++++++++----------------------
 include/Alloc.h       |   8 +
 include/EMfields3D.h  |   5 +-
 3 files changed, 172 insertions(+), 180 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 93b18100..7d752707 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -191,11 +191,10 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) :
   injFieldsRear   = new injInfoFields(nxn, nyn, nzn);
 
   sizeMomentsArray = omp_thread_count();
-  momentsArray = new Moments*[sizeMomentsArray];
+  momentsArray = (arr4_double**) malloc(sizeof(void*)*sizeMomentsArray);
   for(int i=0;i<sizeMomentsArray;i++)
   {
-    momentsArray[i] = new Moments(nxn,nyn,nzn);
-    //momentsArray[i]->init(nxn,nyn,nzn);
+    momentsArray[i] = new arr4_double(nxn,nyn,nzn,10);
   }
 }
 
@@ -220,16 +219,6 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   double const*const q = pcls.getQall();
   //
   const int is = pcls.get_ns();
-  double* rhons1d = &rhons[is][0][0][0];
-  double* Jxs1d   = &Jxs  [is][0][0][0];
-  double* Jys1d   = &Jys  [is][0][0][0];
-  double* Jzs1d   = &Jzs  [is][0][0][0];
-  double* pXXsn1d = &pXXsn[is][0][0][0];
-  double* pXYsn1d = &pXYsn[is][0][0][0];
-  double* pXZsn1d = &pXZsn[is][0][0][0];
-  double* pYYsn1d = &pYYsn[is][0][0][0];
-  double* pYZsn1d = &pYZsn[is][0][0][0];
-  double* pZZsn1d = &pZZsn[is][0][0][0];
   //
   const long long nop_ll = pcls.getNOP();
   const int nop = pcls.getNOP();
@@ -242,19 +231,21 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   #pragma omp parallel
   {
     int thread_num = omp_get_thread_num();
-    Moments& speciesMoments = fetch_momentsArray(thread_num);
-    speciesMoments.set_to_zero();
+    arr4_double moments = fetch_momentsArray(thread_num);
+    moments.setall(0.);
+    //Moments& speciesMoments = fetch_momentsArray(thread_num);
+    //speciesMoments.set_to_zero();
     //
-    arr3_double rho = speciesMoments.fetch_rho();
-    arr3_double Jx  = speciesMoments.fetch_Jx();
-    arr3_double Jy  = speciesMoments.fetch_Jy();
-    arr3_double Jz  = speciesMoments.fetch_Jz();
-    arr3_double Pxx = speciesMoments.fetch_Pxx();
-    arr3_double Pxy = speciesMoments.fetch_Pxy();
-    arr3_double Pxz = speciesMoments.fetch_Pxz();
-    arr3_double Pyy = speciesMoments.fetch_Pyy();
-    arr3_double Pyz = speciesMoments.fetch_Pyz();
-    arr3_double Pzz = speciesMoments.fetch_Pzz();
+    //arr3_double rho = speciesMoments.fetch_rho();
+    //arr3_double Jx  = speciesMoments.fetch_Jx();
+    //arr3_double Jy  = speciesMoments.fetch_Jy();
+    //arr3_double Jz  = speciesMoments.fetch_Jz();
+    //arr3_double Pxx = speciesMoments.fetch_Pxx();
+    //arr3_double Pxy = speciesMoments.fetch_Pxy();
+    //arr3_double Pxz = speciesMoments.fetch_Pxz();
+    //arr3_double Pyy = speciesMoments.fetch_Pyy();
+    //arr3_double Pyz = speciesMoments.fetch_Pyz();
+    //arr3_double Pzz = speciesMoments.fetch_Pzz();
     // The following loop is expensive, so it is wise to assume that the
     // compiler is stupid.  Therefore we should on the one hand
     // expand things out and on the other hand avoid repeating computations.
@@ -266,12 +257,24 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
       const double ui=u[i];
       const double vi=v[i];
       const double wi=w[i];
-      const double uui=ui*ui;
-      const double uvi=ui*vi;
-      const double uwi=ui*wi;
-      const double vvi=vi*vi;
-      const double vwi=vi*wi;
-      const double wwi=wi*wi;
+      //const double uui=ui*ui;
+      //const double uvi=ui*vi;
+      //const double uwi=ui*wi;
+      //const double vvi=vi*vi;
+      //const double vwi=vi*wi;
+      //const double wwi=wi*wi;
+      double velmoments[10];
+      velmoments[0] = 1.;
+      velmoments[1] = ui;
+      velmoments[2] = vi;
+      velmoments[3] = wi;
+      velmoments[4] = ui*ui;
+      velmoments[5] = ui*vi;
+      velmoments[6] = ui*wi;
+      velmoments[7] = vi*vi;
+      velmoments[8] = vi*wi;
+      velmoments[9] = wi*wi;
+
       //
       // compute the weights to distribute the moments
       //
@@ -299,156 +302,136 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
       const double weight101 = qi * xi1 * eta0 * zeta1 * invVOL;
       const double weight110 = qi * xi1 * eta1 * zeta0 * invVOL;
       const double weight111 = qi * xi1 * eta1 * zeta1 * invVOL;
+
+      moments[ix  ][iy  ][iz  ][0] += velmoments[0]*weight000;
+      moments[ix  ][iy  ][iz  ][1] += velmoments[1]*weight000;
+      moments[ix  ][iy  ][iz  ][2] += velmoments[2]*weight000;
+      moments[ix  ][iy  ][iz  ][3] += velmoments[3]*weight000;
+      moments[ix  ][iy  ][iz  ][4] += velmoments[4]*weight000;
+      moments[ix  ][iy  ][iz  ][5] += velmoments[5]*weight000;
+      moments[ix  ][iy  ][iz  ][6] += velmoments[6]*weight000;
+      moments[ix  ][iy  ][iz  ][7] += velmoments[7]*weight000;
+      moments[ix  ][iy  ][iz  ][8] += velmoments[8]*weight000;
+      moments[ix  ][iy  ][iz  ][9] += velmoments[9]*weight000;
+      //
+      moments[ix  ][iy  ][iz-1][0] += velmoments[0]*weight001;
+      moments[ix  ][iy  ][iz-1][1] += velmoments[1]*weight001;
+      moments[ix  ][iy  ][iz-1][2] += velmoments[2]*weight001;
+      moments[ix  ][iy  ][iz-1][3] += velmoments[3]*weight001;
+      moments[ix  ][iy  ][iz-1][4] += velmoments[4]*weight001;
+      moments[ix  ][iy  ][iz-1][5] += velmoments[5]*weight001;
+      moments[ix  ][iy  ][iz-1][6] += velmoments[6]*weight001;
+      moments[ix  ][iy  ][iz-1][7] += velmoments[7]*weight001;
+      moments[ix  ][iy  ][iz-1][8] += velmoments[8]*weight001;
+      moments[ix  ][iy  ][iz-1][9] += velmoments[9]*weight001;
+      //
+      moments[ix  ][iy-1][iz  ][0] += velmoments[0]*weight010;
+      moments[ix  ][iy-1][iz  ][1] += velmoments[1]*weight010;
+      moments[ix  ][iy-1][iz  ][2] += velmoments[2]*weight010;
+      moments[ix  ][iy-1][iz  ][3] += velmoments[3]*weight010;
+      moments[ix  ][iy-1][iz  ][4] += velmoments[4]*weight010;
+      moments[ix  ][iy-1][iz  ][5] += velmoments[5]*weight010;
+      moments[ix  ][iy-1][iz  ][6] += velmoments[6]*weight010;
+      moments[ix  ][iy-1][iz  ][7] += velmoments[7]*weight010;
+      moments[ix  ][iy-1][iz  ][8] += velmoments[8]*weight010;
+      moments[ix  ][iy-1][iz  ][9] += velmoments[9]*weight010;
+      //
+      moments[ix  ][iy-1][iz-1][0] += velmoments[0]*weight011;
+      moments[ix  ][iy-1][iz-1][1] += velmoments[1]*weight011;
+      moments[ix  ][iy-1][iz-1][2] += velmoments[2]*weight011;
+      moments[ix  ][iy-1][iz-1][3] += velmoments[3]*weight011;
+      moments[ix  ][iy-1][iz-1][4] += velmoments[4]*weight011;
+      moments[ix  ][iy-1][iz-1][5] += velmoments[5]*weight011;
+      moments[ix  ][iy-1][iz-1][6] += velmoments[6]*weight011;
+      moments[ix  ][iy-1][iz-1][7] += velmoments[7]*weight011;
+      moments[ix  ][iy-1][iz-1][8] += velmoments[8]*weight011;
+      moments[ix  ][iy-1][iz-1][9] += velmoments[9]*weight011;
+      //
+      moments[ix-1][iy-1][iz  ][0] += velmoments[0]*weight100;
+      moments[ix-1][iy-1][iz  ][1] += velmoments[1]*weight100;
+      moments[ix-1][iy-1][iz  ][2] += velmoments[2]*weight100;
+      moments[ix-1][iy-1][iz  ][3] += velmoments[3]*weight100;
+      moments[ix-1][iy-1][iz  ][4] += velmoments[4]*weight100;
+      moments[ix-1][iy-1][iz  ][5] += velmoments[5]*weight100;
+      moments[ix-1][iy-1][iz  ][6] += velmoments[6]*weight100;
+      moments[ix-1][iy-1][iz  ][7] += velmoments[7]*weight100;
+      moments[ix-1][iy-1][iz  ][8] += velmoments[8]*weight100;
+      moments[ix-1][iy-1][iz  ][9] += velmoments[9]*weight100;
       //
-      // use the weight to distribute the moments
+      moments[ix-1][iy  ][iz-1][0] += velmoments[0]*weight101;
+      moments[ix-1][iy  ][iz-1][1] += velmoments[1]*weight101;
+      moments[ix-1][iy  ][iz-1][2] += velmoments[2]*weight101;
+      moments[ix-1][iy  ][iz-1][3] += velmoments[3]*weight101;
+      moments[ix-1][iy  ][iz-1][4] += velmoments[4]*weight101;
+      moments[ix-1][iy  ][iz-1][5] += velmoments[5]*weight101;
+      moments[ix-1][iy  ][iz-1][6] += velmoments[6]*weight101;
+      moments[ix-1][iy  ][iz-1][7] += velmoments[7]*weight101;
+      moments[ix-1][iy  ][iz-1][8] += velmoments[8]*weight101;
+      moments[ix-1][iy  ][iz-1][9] += velmoments[9]*weight101;
       //
-      // add charge density
-      //speciesMoments.addRho(weight, ix, iy, iz);
-      rho[ix  ][iy  ][iz  ] += weight000;
-      rho[ix  ][iy  ][iz-1] += weight001;
-      rho[ix  ][iy-1][iz  ] += weight010;
-      rho[ix  ][iy-1][iz-1] += weight011;
-      rho[ix-1][iy  ][iz  ] += weight100;
-      rho[ix-1][iy  ][iz-1] += weight101;
-      rho[ix-1][iy-1][iz  ] += weight110;
-      rho[ix-1][iy-1][iz-1] += weight111;
-      // add current density - X
-      //speciesMoments.addJx(temp, ix, iy, iz);
-      Jx[ix  ][iy  ][iz  ] += ui*weight000;
-      Jx[ix  ][iy  ][iz-1] += ui*weight001;
-      Jx[ix  ][iy-1][iz  ] += ui*weight010;
-      Jx[ix  ][iy-1][iz-1] += ui*weight011;
-      Jx[ix-1][iy  ][iz  ] += ui*weight100;
-      Jx[ix-1][iy  ][iz-1] += ui*weight101;
-      Jx[ix-1][iy-1][iz  ] += ui*weight110;
-      Jx[ix-1][iy-1][iz-1] += ui*weight111;
-      // add current density - Y
-      //speciesMoments.addJy(temp, ix, iy, iz);
-      Jy[ix  ][iy  ][iz  ] += vi*weight000;
-      Jy[ix  ][iy  ][iz-1] += vi*weight001;
-      Jy[ix  ][iy-1][iz  ] += vi*weight010;
-      Jy[ix  ][iy-1][iz-1] += vi*weight011;
-      Jy[ix-1][iy  ][iz  ] += vi*weight100;
-      Jy[ix-1][iy  ][iz-1] += vi*weight101;
-      Jy[ix-1][iy-1][iz  ] += vi*weight110;
-      Jy[ix-1][iy-1][iz-1] += vi*weight111;
-      // add current density - Z
-      //speciesMoments.addJz(temp, ix, iy, iz);
-      Jz[ix  ][iy  ][iz  ] += wi*weight000;
-      Jz[ix  ][iy  ][iz-1] += wi*weight001;
-      Jz[ix  ][iy-1][iz  ] += wi*weight010;
-      Jz[ix  ][iy-1][iz-1] += wi*weight011;
-      Jz[ix-1][iy  ][iz  ] += wi*weight100;
-      Jz[ix-1][iy  ][iz-1] += wi*weight101;
-      Jz[ix-1][iy-1][iz  ] += wi*weight110;
-      Jz[ix-1][iy-1][iz-1] += wi*weight111;
-      // Pxx - add pressure tensor
-      //speciesMoments.addPxx(temp, ix, iy, iz);
-      Pxx[ix  ][iy  ][iz  ] += uui*weight000;
-      Pxx[ix  ][iy  ][iz-1] += uui*weight001;
-      Pxx[ix  ][iy-1][iz  ] += uui*weight010;
-      Pxx[ix  ][iy-1][iz-1] += uui*weight011;
-      Pxx[ix-1][iy  ][iz  ] += uui*weight100;
-      Pxx[ix-1][iy  ][iz-1] += uui*weight101;
-      Pxx[ix-1][iy-1][iz  ] += uui*weight110;
-      Pxx[ix-1][iy-1][iz-1] += uui*weight111;
-      // Pxy - add pressure tensor
-      //speciesMoments.addPxy(temp, ix, iy, iz);
-      Pxy[ix  ][iy  ][iz  ] += uvi*weight000;
-      Pxy[ix  ][iy  ][iz-1] += uvi*weight001;
-      Pxy[ix  ][iy-1][iz  ] += uvi*weight010;
-      Pxy[ix  ][iy-1][iz-1] += uvi*weight011;
-      Pxy[ix-1][iy  ][iz  ] += uvi*weight100;
-      Pxy[ix-1][iy  ][iz-1] += uvi*weight101;
-      Pxy[ix-1][iy-1][iz  ] += uvi*weight110;
-      Pxy[ix-1][iy-1][iz-1] += uvi*weight111;
-      // Pxz - add pressure tensor
-      //speciesMoments.addPxz(temp, ix, iy, iz);
-      Pxz[ix  ][iy  ][iz  ] += uwi*weight000;
-      Pxz[ix  ][iy  ][iz-1] += uwi*weight001;
-      Pxz[ix  ][iy-1][iz  ] += uwi*weight010;
-      Pxz[ix  ][iy-1][iz-1] += uwi*weight011;
-      Pxz[ix-1][iy  ][iz  ] += uwi*weight100;
-      Pxz[ix-1][iy  ][iz-1] += uwi*weight101;
-      Pxz[ix-1][iy-1][iz  ] += uwi*weight110;
-      Pxz[ix-1][iy-1][iz-1] += uwi*weight111;
-      // Pyy - add pressure tensor
-      //speciesMoments.addPyy(temp, ix, iy, iz);
-      Pyy[ix  ][iy  ][iz  ] += vvi*weight000;
-      Pyy[ix  ][iy  ][iz-1] += vvi*weight001;
-      Pyy[ix  ][iy-1][iz  ] += vvi*weight010;
-      Pyy[ix  ][iy-1][iz-1] += vvi*weight011;
-      Pyy[ix-1][iy  ][iz  ] += vvi*weight100;
-      Pyy[ix-1][iy  ][iz-1] += vvi*weight101;
-      Pyy[ix-1][iy-1][iz  ] += vvi*weight110;
-      Pyy[ix-1][iy-1][iz-1] += vvi*weight111;
-      // Pyz - add pressure tensor
-      //speciesMoments.addPyz(temp, ix, iy, iz);
-      Pyz[ix  ][iy  ][iz  ] += vwi*weight000;
-      Pyz[ix  ][iy  ][iz-1] += vwi*weight001;
-      Pyz[ix  ][iy-1][iz  ] += vwi*weight010;
-      Pyz[ix  ][iy-1][iz-1] += vwi*weight011;
-      Pyz[ix-1][iy  ][iz  ] += vwi*weight100;
-      Pyz[ix-1][iy  ][iz-1] += vwi*weight101;
-      Pyz[ix-1][iy-1][iz  ] += vwi*weight110;
-      Pyz[ix-1][iy-1][iz-1] += vwi*weight111;
-      // Pzz - add pressure tensor
-      //speciesMoments.addPzz(temp, ix, iy, iz);
-      Pzz[ix  ][iy  ][iz  ] += wwi*weight000;
-      Pzz[ix  ][iy  ][iz-1] += wwi*weight001;
-      Pzz[ix  ][iy-1][iz  ] += wwi*weight010;
-      Pzz[ix  ][iy-1][iz-1] += wwi*weight011;
-      Pzz[ix-1][iy  ][iz  ] += wwi*weight100;
-      Pzz[ix-1][iy  ][iz-1] += wwi*weight101;
-      Pzz[ix-1][iy-1][iz  ] += wwi*weight110;
-      Pzz[ix-1][iy-1][iz-1] += wwi*weight111;
+      moments[ix-1][iy-1][iz  ][0] += velmoments[0]*weight110;
+      moments[ix-1][iy-1][iz  ][1] += velmoments[1]*weight110;
+      moments[ix-1][iy-1][iz  ][2] += velmoments[2]*weight110;
+      moments[ix-1][iy-1][iz  ][3] += velmoments[3]*weight110;
+      moments[ix-1][iy-1][iz  ][4] += velmoments[4]*weight110;
+      moments[ix-1][iy-1][iz  ][5] += velmoments[5]*weight110;
+      moments[ix-1][iy-1][iz  ][6] += velmoments[6]*weight110;
+      moments[ix-1][iy-1][iz  ][7] += velmoments[7]*weight110;
+      moments[ix-1][iy-1][iz  ][8] += velmoments[8]*weight110;
+      moments[ix-1][iy-1][iz  ][9] += velmoments[9]*weight110;
+      //
+      moments[ix-1][iy-1][iz-1][0] += velmoments[0]*weight111;
+      moments[ix-1][iy-1][iz-1][1] += velmoments[1]*weight111;
+      moments[ix-1][iy-1][iz-1][2] += velmoments[2]*weight111;
+      moments[ix-1][iy-1][iz-1][3] += velmoments[3]*weight111;
+      moments[ix-1][iy-1][iz-1][4] += velmoments[4]*weight111;
+      moments[ix-1][iy-1][iz-1][5] += velmoments[5]*weight111;
+      moments[ix-1][iy-1][iz-1][6] += velmoments[6]*weight111;
+      moments[ix-1][iy-1][iz-1][7] += velmoments[7]*weight111;
+      moments[ix-1][iy-1][iz-1][8] += velmoments[8]*weight111;
+      moments[ix-1][iy-1][iz-1][9] += velmoments[9]*weight111;
+      //
+      //for(int jx=0;jx<2;jx++)
+      //for(int jy=0;jy<2;jy++)
+      //for(int jz=0;jz<2;jz++)
+      //for(int m=0;m<10;m++)
+      //{
+      //  moments[ix-jx][iy-jy][iz-jz][m] += velmoments[m]*weight[jx][jy][jz];
+      //}
     }
-    // The following way is an equivalent reduction but less
-    // efficient for a large number of threads.
-    //
-    //#pragma omp critical
-    //addToSpeciesMoments(speciesMoments,is);
-    //
-    // Instead we split up the reduction tasks.
-    //
-    // One-dimensional array access is presumably
-    // more efficient on poor compilers.
-    //
-    double* rho1d = &rho[0][0][0];
-    double* Jx1d  = &Jx [0][0][0];
-    double* Jy1d  = &Jy [0][0][0];
-    double* Jz1d  = &Jz [0][0][0];
-    double* Pxx1d = &Pxx[0][0][0];
-    double* Pxy1d = &Pxy[0][0][0];
-    double* Pxz1d = &Pxz[0][0][0];
-    double* Pyy1d = &Pyy[0][0][0];
-    double* Pyz1d = &Pyz[0][0][0];
-    double* Pzz1d = &Pzz[0][0][0];
+    // We split up the reduction tasks.
     //
-    assert_eq(speciesMoments.get_nx(), nxn);
-    assert_eq(speciesMoments.get_ny(), nyn);
-    assert_eq(speciesMoments.get_nz(), nzn);
     const int numel = nxn*nyn*nzn;
-    #pragma omp critical
-    for(int i=0;i<numel;i++) rhons1d[i] += invVOL*rho1d[i];
-    #pragma omp critical
-    for(int i=0;i<numel;i++) Jxs1d  [i] += invVOL*Jx1d [i];
-    #pragma omp critical
-    for(int i=0;i<numel;i++) Jys1d  [i] += invVOL*Jy1d [i];
-    #pragma omp critical
-    for(int i=0;i<numel;i++) Jzs1d  [i] += invVOL*Jz1d [i];
-    #pragma omp critical
-    for(int i=0;i<numel;i++) pXXsn1d[i] += invVOL*Pxx1d[i];
-    #pragma omp critical
-    for(int i=0;i<numel;i++) pXYsn1d[i] += invVOL*Pxy1d[i];
-    #pragma omp critical
-    for(int i=0;i<numel;i++) pXZsn1d[i] += invVOL*Pxz1d[i];
-    #pragma omp critical
-    for(int i=0;i<numel;i++) pYYsn1d[i] += invVOL*Pyy1d[i];
-    #pragma omp critical
-    for(int i=0;i<numel;i++) pYZsn1d[i] += invVOL*Pyz1d[i];
-    #pragma omp critical
-    for(int i=0;i<numel;i++) pZZsn1d[i] += invVOL*Pzz1d[i];
+    #pragma omp critical collapse(3)
+    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+      rhons[is][i][j][k] += invVOL*moments[i][j][k][0];
+    #pragma omp critical collapse(3)
+    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+      Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1];
+    #pragma omp critical collapse(3)
+    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+      Jys  [is][i][j][k] += invVOL*moments[i][j][k][2];
+    #pragma omp critical collapse(3)
+    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+      Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3];
+    #pragma omp critical collapse(3)
+    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+      pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4];
+    #pragma omp critical collapse(3)
+    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+      pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5];
+    #pragma omp critical collapse(3)
+    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+      pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6];
+    #pragma omp critical collapse(3)
+    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+      pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7];
+    #pragma omp critical collapse(3)
+    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+      pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8];
+    #pragma omp critical collapse(3)
+    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+      pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9];
   }
   communicateGhostP2G(is, 0, 0, 0, 0, vct);
 }
@@ -3207,7 +3190,7 @@ EMfields3D::~EMfields3D() {
   delete injFieldsRear;
   for(int i=0;i<sizeMomentsArray;i++)
   {
-    delete momentsArray[i];
+    momentsArray[i]->free();
   }
-  delete [] momentsArray;
+  free(momentsArray);
 }
diff --git a/include/Alloc.h b/include/Alloc.h
index 5c3c5b5d..46c68fd9 100644
--- a/include/Alloc.h
+++ b/include/Alloc.h
@@ -608,6 +608,13 @@ namespace iPic3D
       void set(size_t n4,size_t n3,size_t n2,size_t n1, type value)
         { check_idx_bounds(n4,n3,n2,n1); arr4[n4][n3][n2][n1] = value; }
     #endif
+    protected:
+      void setall(double val)
+      {
+        #pragma omp for
+        for(int i=0;i<size;i++)
+          arr[i]=val;
+      }
   };
   
   template <class type>
@@ -644,6 +651,7 @@ namespace iPic3D
         { const_array_ref4<type>::set(n4,n3,n2,n1, value); }
       void free(){ delArray4<type>((type****)arr4); }
       type**** fetch_arr4(){ return (type****) arr4; }
+      void setall(double val) { const_array_ref4<type>::setall(val); }
       //bool verify_dims(size_t s4, size_t s3, size_t s2, size_t s1){
       //  if(s4==S4 && s3==S3 && s2==S2 && s1==S1) return true;
       //  Wprintf("%d==%d && %d==%d && %d==%d && %d==%d failed",
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 335a45ea..d0653090 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -256,7 +256,7 @@ class EMfields3D                // :public Field
     double getBenergy();
 
     /*! fetch array for summing moments of thread i */
-    Moments& fetch_momentsArray(int i){
+    arr4_double fetch_momentsArray(int i){
       assert_le(0,i);
       assert_le(i,sizeMomentsArray);
       return *momentsArray[i];
@@ -384,7 +384,8 @@ class EMfields3D                // :public Field
     array3_double divC;
     /* temporary arrays for summing moments */
     int sizeMomentsArray;
-    Moments **momentsArray;
+    //Moments **momentsArray;
+    arr4_double** momentsArray;
 
 
     // *******************************************************************************

From 991d97d997e1699b4a677b3c5e0dcf72cec757df Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 12 Aug 2013 20:23:22 +0200
Subject: [PATCH 031/118] fixed bugs introduced in previous commit

---
 fields/EMfields3D.cpp | 561 +++++++++++++++++++++++++++++-------------
 include/Alloc.h       |   4 +-
 include/EMfields3D.h  |  13 +-
 3 files changed, 397 insertions(+), 181 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 7d752707..7cfea8f1 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -5,6 +5,7 @@
 #include "TimeTasks.h"
 #include "Moments.h"
 #include "ompdefs.h"
+#include "debug.h"
 
 /*! constructor */
 //
@@ -191,10 +192,13 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) :
   injFieldsRear   = new injInfoFields(nxn, nyn, nzn);
 
   sizeMomentsArray = omp_thread_count();
-  momentsArray = (arr4_double**) malloc(sizeof(void*)*sizeMomentsArray);
+  momentsArray = new Moments*[sizeMomentsArray];
+  moments10 = (arr4_double**) malloc(sizeof(void*)*sizeMomentsArray);
   for(int i=0;i<sizeMomentsArray;i++)
   {
-    momentsArray[i] = new arr4_double(nxn,nyn,nzn,10);
+    momentsArray[i] = new Moments(nxn,nyn,nzn);
+    //momentsArray[i]->init(nxn,nyn,nzn);
+    moments10[i] = new arr4_double(nxn,nyn,nzn,10);
   }
 }
 
@@ -219,6 +223,20 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   double const*const q = pcls.getQall();
   //
   const int is = pcls.get_ns();
+  bool bmoments10 = true;
+  bool b10moments = false;
+
+  // if b10moments
+  double* rhons1d = &rhons[is][0][0][0];
+  double* Jxs1d   = &Jxs  [is][0][0][0];
+  double* Jys1d   = &Jys  [is][0][0][0];
+  double* Jzs1d   = &Jzs  [is][0][0][0];
+  double* pXXsn1d = &pXXsn[is][0][0][0];
+  double* pXYsn1d = &pXYsn[is][0][0][0];
+  double* pXZsn1d = &pXZsn[is][0][0][0];
+  double* pYYsn1d = &pYYsn[is][0][0][0];
+  double* pYZsn1d = &pYZsn[is][0][0][0];
+  double* pZZsn1d = &pZZsn[is][0][0][0];
   //
   const long long nop_ll = pcls.getNOP();
   const int nop = pcls.getNOP();
@@ -231,21 +249,21 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   #pragma omp parallel
   {
     int thread_num = omp_get_thread_num();
-    arr4_double moments = fetch_momentsArray(thread_num);
+    Moments& speciesMoments = fetch_momentsArray(thread_num);
+    speciesMoments.set_to_zero();
+    arr4_double moments = fetch_moments10(thread_num);
     moments.setall(0.);
-    //Moments& speciesMoments = fetch_momentsArray(thread_num);
-    //speciesMoments.set_to_zero();
     //
-    //arr3_double rho = speciesMoments.fetch_rho();
-    //arr3_double Jx  = speciesMoments.fetch_Jx();
-    //arr3_double Jy  = speciesMoments.fetch_Jy();
-    //arr3_double Jz  = speciesMoments.fetch_Jz();
-    //arr3_double Pxx = speciesMoments.fetch_Pxx();
-    //arr3_double Pxy = speciesMoments.fetch_Pxy();
-    //arr3_double Pxz = speciesMoments.fetch_Pxz();
-    //arr3_double Pyy = speciesMoments.fetch_Pyy();
-    //arr3_double Pyz = speciesMoments.fetch_Pyz();
-    //arr3_double Pzz = speciesMoments.fetch_Pzz();
+    arr3_double rho = speciesMoments.fetch_rho();
+    arr3_double Jx  = speciesMoments.fetch_Jx();
+    arr3_double Jy  = speciesMoments.fetch_Jy();
+    arr3_double Jz  = speciesMoments.fetch_Jz();
+    arr3_double Pxx = speciesMoments.fetch_Pxx();
+    arr3_double Pxy = speciesMoments.fetch_Pxy();
+    arr3_double Pxz = speciesMoments.fetch_Pxz();
+    arr3_double Pyy = speciesMoments.fetch_Pyy();
+    arr3_double Pyz = speciesMoments.fetch_Pyz();
+    arr3_double Pzz = speciesMoments.fetch_Pzz();
     // The following loop is expensive, so it is wise to assume that the
     // compiler is stupid.  Therefore we should on the one hand
     // expand things out and on the other hand avoid repeating computations.
@@ -257,23 +275,23 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
       const double ui=u[i];
       const double vi=v[i];
       const double wi=w[i];
-      //const double uui=ui*ui;
-      //const double uvi=ui*vi;
-      //const double uwi=ui*wi;
-      //const double vvi=vi*vi;
-      //const double vwi=vi*wi;
-      //const double wwi=wi*wi;
+      const double uui=ui*ui;
+      const double uvi=ui*vi;
+      const double uwi=ui*wi;
+      const double vvi=vi*vi;
+      const double vwi=vi*wi;
+      const double wwi=wi*wi;
       double velmoments[10];
       velmoments[0] = 1.;
       velmoments[1] = ui;
       velmoments[2] = vi;
       velmoments[3] = wi;
-      velmoments[4] = ui*ui;
-      velmoments[5] = ui*vi;
-      velmoments[6] = ui*wi;
-      velmoments[7] = vi*vi;
-      velmoments[8] = vi*wi;
-      velmoments[9] = wi*wi;
+      velmoments[4] = uui;
+      velmoments[5] = uvi;
+      velmoments[6] = uwi;
+      velmoments[7] = vvi;
+      velmoments[8] = vwi;
+      velmoments[9] = wwi;
 
       //
       // compute the weights to distribute the moments
@@ -303,135 +321,326 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
       const double weight110 = qi * xi1 * eta1 * zeta0 * invVOL;
       const double weight111 = qi * xi1 * eta1 * zeta1 * invVOL;
 
-      moments[ix  ][iy  ][iz  ][0] += velmoments[0]*weight000;
-      moments[ix  ][iy  ][iz  ][1] += velmoments[1]*weight000;
-      moments[ix  ][iy  ][iz  ][2] += velmoments[2]*weight000;
-      moments[ix  ][iy  ][iz  ][3] += velmoments[3]*weight000;
-      moments[ix  ][iy  ][iz  ][4] += velmoments[4]*weight000;
-      moments[ix  ][iy  ][iz  ][5] += velmoments[5]*weight000;
-      moments[ix  ][iy  ][iz  ][6] += velmoments[6]*weight000;
-      moments[ix  ][iy  ][iz  ][7] += velmoments[7]*weight000;
-      moments[ix  ][iy  ][iz  ][8] += velmoments[8]*weight000;
-      moments[ix  ][iy  ][iz  ][9] += velmoments[9]*weight000;
-      //
-      moments[ix  ][iy  ][iz-1][0] += velmoments[0]*weight001;
-      moments[ix  ][iy  ][iz-1][1] += velmoments[1]*weight001;
-      moments[ix  ][iy  ][iz-1][2] += velmoments[2]*weight001;
-      moments[ix  ][iy  ][iz-1][3] += velmoments[3]*weight001;
-      moments[ix  ][iy  ][iz-1][4] += velmoments[4]*weight001;
-      moments[ix  ][iy  ][iz-1][5] += velmoments[5]*weight001;
-      moments[ix  ][iy  ][iz-1][6] += velmoments[6]*weight001;
-      moments[ix  ][iy  ][iz-1][7] += velmoments[7]*weight001;
-      moments[ix  ][iy  ][iz-1][8] += velmoments[8]*weight001;
-      moments[ix  ][iy  ][iz-1][9] += velmoments[9]*weight001;
-      //
-      moments[ix  ][iy-1][iz  ][0] += velmoments[0]*weight010;
-      moments[ix  ][iy-1][iz  ][1] += velmoments[1]*weight010;
-      moments[ix  ][iy-1][iz  ][2] += velmoments[2]*weight010;
-      moments[ix  ][iy-1][iz  ][3] += velmoments[3]*weight010;
-      moments[ix  ][iy-1][iz  ][4] += velmoments[4]*weight010;
-      moments[ix  ][iy-1][iz  ][5] += velmoments[5]*weight010;
-      moments[ix  ][iy-1][iz  ][6] += velmoments[6]*weight010;
-      moments[ix  ][iy-1][iz  ][7] += velmoments[7]*weight010;
-      moments[ix  ][iy-1][iz  ][8] += velmoments[8]*weight010;
-      moments[ix  ][iy-1][iz  ][9] += velmoments[9]*weight010;
-      //
-      moments[ix  ][iy-1][iz-1][0] += velmoments[0]*weight011;
-      moments[ix  ][iy-1][iz-1][1] += velmoments[1]*weight011;
-      moments[ix  ][iy-1][iz-1][2] += velmoments[2]*weight011;
-      moments[ix  ][iy-1][iz-1][3] += velmoments[3]*weight011;
-      moments[ix  ][iy-1][iz-1][4] += velmoments[4]*weight011;
-      moments[ix  ][iy-1][iz-1][5] += velmoments[5]*weight011;
-      moments[ix  ][iy-1][iz-1][6] += velmoments[6]*weight011;
-      moments[ix  ][iy-1][iz-1][7] += velmoments[7]*weight011;
-      moments[ix  ][iy-1][iz-1][8] += velmoments[8]*weight011;
-      moments[ix  ][iy-1][iz-1][9] += velmoments[9]*weight011;
-      //
-      moments[ix-1][iy-1][iz  ][0] += velmoments[0]*weight100;
-      moments[ix-1][iy-1][iz  ][1] += velmoments[1]*weight100;
-      moments[ix-1][iy-1][iz  ][2] += velmoments[2]*weight100;
-      moments[ix-1][iy-1][iz  ][3] += velmoments[3]*weight100;
-      moments[ix-1][iy-1][iz  ][4] += velmoments[4]*weight100;
-      moments[ix-1][iy-1][iz  ][5] += velmoments[5]*weight100;
-      moments[ix-1][iy-1][iz  ][6] += velmoments[6]*weight100;
-      moments[ix-1][iy-1][iz  ][7] += velmoments[7]*weight100;
-      moments[ix-1][iy-1][iz  ][8] += velmoments[8]*weight100;
-      moments[ix-1][iy-1][iz  ][9] += velmoments[9]*weight100;
-      //
-      moments[ix-1][iy  ][iz-1][0] += velmoments[0]*weight101;
-      moments[ix-1][iy  ][iz-1][1] += velmoments[1]*weight101;
-      moments[ix-1][iy  ][iz-1][2] += velmoments[2]*weight101;
-      moments[ix-1][iy  ][iz-1][3] += velmoments[3]*weight101;
-      moments[ix-1][iy  ][iz-1][4] += velmoments[4]*weight101;
-      moments[ix-1][iy  ][iz-1][5] += velmoments[5]*weight101;
-      moments[ix-1][iy  ][iz-1][6] += velmoments[6]*weight101;
-      moments[ix-1][iy  ][iz-1][7] += velmoments[7]*weight101;
-      moments[ix-1][iy  ][iz-1][8] += velmoments[8]*weight101;
-      moments[ix-1][iy  ][iz-1][9] += velmoments[9]*weight101;
-      //
-      moments[ix-1][iy-1][iz  ][0] += velmoments[0]*weight110;
-      moments[ix-1][iy-1][iz  ][1] += velmoments[1]*weight110;
-      moments[ix-1][iy-1][iz  ][2] += velmoments[2]*weight110;
-      moments[ix-1][iy-1][iz  ][3] += velmoments[3]*weight110;
-      moments[ix-1][iy-1][iz  ][4] += velmoments[4]*weight110;
-      moments[ix-1][iy-1][iz  ][5] += velmoments[5]*weight110;
-      moments[ix-1][iy-1][iz  ][6] += velmoments[6]*weight110;
-      moments[ix-1][iy-1][iz  ][7] += velmoments[7]*weight110;
-      moments[ix-1][iy-1][iz  ][8] += velmoments[8]*weight110;
-      moments[ix-1][iy-1][iz  ][9] += velmoments[9]*weight110;
+      if(bmoments10)
+      {
+        moments[ix  ][iy  ][iz  ][0] += velmoments[0]*weight000;
+        moments[ix  ][iy  ][iz  ][1] += velmoments[1]*weight000;
+        moments[ix  ][iy  ][iz  ][2] += velmoments[2]*weight000;
+        moments[ix  ][iy  ][iz  ][3] += velmoments[3]*weight000;
+        moments[ix  ][iy  ][iz  ][4] += velmoments[4]*weight000;
+        moments[ix  ][iy  ][iz  ][5] += velmoments[5]*weight000;
+        moments[ix  ][iy  ][iz  ][6] += velmoments[6]*weight000;
+        moments[ix  ][iy  ][iz  ][7] += velmoments[7]*weight000;
+        moments[ix  ][iy  ][iz  ][8] += velmoments[8]*weight000;
+        moments[ix  ][iy  ][iz  ][9] += velmoments[9]*weight000;
+
+        moments[ix  ][iy  ][iz-1][0] += velmoments[0]*weight001;
+        moments[ix  ][iy  ][iz-1][1] += velmoments[1]*weight001;
+        moments[ix  ][iy  ][iz-1][2] += velmoments[2]*weight001;
+        moments[ix  ][iy  ][iz-1][3] += velmoments[3]*weight001;
+        moments[ix  ][iy  ][iz-1][4] += velmoments[4]*weight001;
+        moments[ix  ][iy  ][iz-1][5] += velmoments[5]*weight001;
+        moments[ix  ][iy  ][iz-1][6] += velmoments[6]*weight001;
+        moments[ix  ][iy  ][iz-1][7] += velmoments[7]*weight001;
+        moments[ix  ][iy  ][iz-1][8] += velmoments[8]*weight001;
+        moments[ix  ][iy  ][iz-1][9] += velmoments[9]*weight001;
+
+        moments[ix  ][iy-1][iz  ][0] += velmoments[0]*weight010;
+        moments[ix  ][iy-1][iz  ][1] += velmoments[1]*weight010;
+        moments[ix  ][iy-1][iz  ][2] += velmoments[2]*weight010;
+        moments[ix  ][iy-1][iz  ][3] += velmoments[3]*weight010;
+        moments[ix  ][iy-1][iz  ][4] += velmoments[4]*weight010;
+        moments[ix  ][iy-1][iz  ][5] += velmoments[5]*weight010;
+        moments[ix  ][iy-1][iz  ][6] += velmoments[6]*weight010;
+        moments[ix  ][iy-1][iz  ][7] += velmoments[7]*weight010;
+        moments[ix  ][iy-1][iz  ][8] += velmoments[8]*weight010;
+        moments[ix  ][iy-1][iz  ][9] += velmoments[9]*weight010;
+
+        moments[ix  ][iy-1][iz-1][0] += velmoments[0]*weight011;
+        moments[ix  ][iy-1][iz-1][1] += velmoments[1]*weight011;
+        moments[ix  ][iy-1][iz-1][2] += velmoments[2]*weight011;
+        moments[ix  ][iy-1][iz-1][3] += velmoments[3]*weight011;
+        moments[ix  ][iy-1][iz-1][4] += velmoments[4]*weight011;
+        moments[ix  ][iy-1][iz-1][5] += velmoments[5]*weight011;
+        moments[ix  ][iy-1][iz-1][6] += velmoments[6]*weight011;
+        moments[ix  ][iy-1][iz-1][7] += velmoments[7]*weight011;
+        moments[ix  ][iy-1][iz-1][8] += velmoments[8]*weight011;
+        moments[ix  ][iy-1][iz-1][9] += velmoments[9]*weight011;
+
+        moments[ix-1][iy  ][iz  ][0] += velmoments[0]*weight100;
+        moments[ix-1][iy  ][iz  ][1] += velmoments[1]*weight100;
+        moments[ix-1][iy  ][iz  ][2] += velmoments[2]*weight100;
+        moments[ix-1][iy  ][iz  ][3] += velmoments[3]*weight100;
+        moments[ix-1][iy  ][iz  ][4] += velmoments[4]*weight100;
+        moments[ix-1][iy  ][iz  ][5] += velmoments[5]*weight100;
+        moments[ix-1][iy  ][iz  ][6] += velmoments[6]*weight100;
+        moments[ix-1][iy  ][iz  ][7] += velmoments[7]*weight100;
+        moments[ix-1][iy  ][iz  ][8] += velmoments[8]*weight100;
+        moments[ix-1][iy  ][iz  ][9] += velmoments[9]*weight100;
+
+        moments[ix-1][iy  ][iz-1][0] += velmoments[0]*weight101;
+        moments[ix-1][iy  ][iz-1][1] += velmoments[1]*weight101;
+        moments[ix-1][iy  ][iz-1][2] += velmoments[2]*weight101;
+        moments[ix-1][iy  ][iz-1][3] += velmoments[3]*weight101;
+        moments[ix-1][iy  ][iz-1][4] += velmoments[4]*weight101;
+        moments[ix-1][iy  ][iz-1][5] += velmoments[5]*weight101;
+        moments[ix-1][iy  ][iz-1][6] += velmoments[6]*weight101;
+        moments[ix-1][iy  ][iz-1][7] += velmoments[7]*weight101;
+        moments[ix-1][iy  ][iz-1][8] += velmoments[8]*weight101;
+        moments[ix-1][iy  ][iz-1][9] += velmoments[9]*weight101;
+
+        moments[ix-1][iy-1][iz  ][0] += velmoments[0]*weight110;
+        moments[ix-1][iy-1][iz  ][1] += velmoments[1]*weight110;
+        moments[ix-1][iy-1][iz  ][2] += velmoments[2]*weight110;
+        moments[ix-1][iy-1][iz  ][3] += velmoments[3]*weight110;
+        moments[ix-1][iy-1][iz  ][4] += velmoments[4]*weight110;
+        moments[ix-1][iy-1][iz  ][5] += velmoments[5]*weight110;
+        moments[ix-1][iy-1][iz  ][6] += velmoments[6]*weight110;
+        moments[ix-1][iy-1][iz  ][7] += velmoments[7]*weight110;
+        moments[ix-1][iy-1][iz  ][8] += velmoments[8]*weight110;
+        moments[ix-1][iy-1][iz  ][9] += velmoments[9]*weight110;
+
+        moments[ix-1][iy-1][iz-1][0] += velmoments[0]*weight111;
+        moments[ix-1][iy-1][iz-1][1] += velmoments[1]*weight111;
+        moments[ix-1][iy-1][iz-1][2] += velmoments[2]*weight111;
+        moments[ix-1][iy-1][iz-1][3] += velmoments[3]*weight111;
+        moments[ix-1][iy-1][iz-1][4] += velmoments[4]*weight111;
+        moments[ix-1][iy-1][iz-1][5] += velmoments[5]*weight111;
+        moments[ix-1][iy-1][iz-1][6] += velmoments[6]*weight111;
+        moments[ix-1][iy-1][iz-1][7] += velmoments[7]*weight111;
+        moments[ix-1][iy-1][iz-1][8] += velmoments[8]*weight111;
+        moments[ix-1][iy-1][iz-1][9] += velmoments[9]*weight111;
+
+        //double weight[2][2][2];
+        //weight[0][0][0]=weight000;
+        //weight[0][0][1]=weight001;
+        //weight[0][1][0]=weight010;
+        //weight[0][1][1]=weight011;
+        //weight[1][0][0]=weight100;
+        //weight[1][0][1]=weight101;
+        //weight[1][1][0]=weight110;
+        //weight[1][1][1]=weight111;
+        ////
+        //for(int jx=0;jx<2;jx++)
+        //for(int jy=0;jy<2;jy++)
+        //for(int jz=0;jz<2;jz++)
+        //for(int m=0;m<10;m++)
+        //{
+        //  moments[ix-jx][iy-jy][iz-jz][m] += velmoments[m]*weight[jx][jy][jz];
+        //}
+      }
+
+      if(b10moments)
+      {
+        // use the weight to distribute the moments
+        //
+        // add charge density
+        //speciesMoments.addRho(weight, ix, iy, iz);
+        rho[ix  ][iy  ][iz  ] += weight000;
+        rho[ix  ][iy  ][iz-1] += weight001;
+        rho[ix  ][iy-1][iz  ] += weight010;
+        rho[ix  ][iy-1][iz-1] += weight011;
+        rho[ix-1][iy  ][iz  ] += weight100;
+        rho[ix-1][iy  ][iz-1] += weight101;
+        rho[ix-1][iy-1][iz  ] += weight110;
+        rho[ix-1][iy-1][iz-1] += weight111;
+        // add current density - X
+        //speciesMoments.addJx(temp, ix, iy, iz);
+        Jx[ix  ][iy  ][iz  ] += ui*weight000;
+        Jx[ix  ][iy  ][iz-1] += ui*weight001;
+        Jx[ix  ][iy-1][iz  ] += ui*weight010;
+        Jx[ix  ][iy-1][iz-1] += ui*weight011;
+        Jx[ix-1][iy  ][iz  ] += ui*weight100;
+        Jx[ix-1][iy  ][iz-1] += ui*weight101;
+        Jx[ix-1][iy-1][iz  ] += ui*weight110;
+        Jx[ix-1][iy-1][iz-1] += ui*weight111;
+        // add current density - Y
+        //speciesMoments.addJy(temp, ix, iy, iz);
+        Jy[ix  ][iy  ][iz  ] += vi*weight000;
+        Jy[ix  ][iy  ][iz-1] += vi*weight001;
+        Jy[ix  ][iy-1][iz  ] += vi*weight010;
+        Jy[ix  ][iy-1][iz-1] += vi*weight011;
+        Jy[ix-1][iy  ][iz  ] += vi*weight100;
+        Jy[ix-1][iy  ][iz-1] += vi*weight101;
+        Jy[ix-1][iy-1][iz  ] += vi*weight110;
+        Jy[ix-1][iy-1][iz-1] += vi*weight111;
+        // add current density - Z
+        //speciesMoments.addJz(temp, ix, iy, iz);
+        Jz[ix  ][iy  ][iz  ] += wi*weight000;
+        Jz[ix  ][iy  ][iz-1] += wi*weight001;
+        Jz[ix  ][iy-1][iz  ] += wi*weight010;
+        Jz[ix  ][iy-1][iz-1] += wi*weight011;
+        Jz[ix-1][iy  ][iz  ] += wi*weight100;
+        Jz[ix-1][iy  ][iz-1] += wi*weight101;
+        Jz[ix-1][iy-1][iz  ] += wi*weight110;
+        Jz[ix-1][iy-1][iz-1] += wi*weight111;
+        // Pxx - add pressure tensor
+        //speciesMoments.addPxx(temp, ix, iy, iz);
+        Pxx[ix  ][iy  ][iz  ] += uui*weight000;
+        Pxx[ix  ][iy  ][iz-1] += uui*weight001;
+        Pxx[ix  ][iy-1][iz  ] += uui*weight010;
+        Pxx[ix  ][iy-1][iz-1] += uui*weight011;
+        Pxx[ix-1][iy  ][iz  ] += uui*weight100;
+        Pxx[ix-1][iy  ][iz-1] += uui*weight101;
+        Pxx[ix-1][iy-1][iz  ] += uui*weight110;
+        Pxx[ix-1][iy-1][iz-1] += uui*weight111;
+        // Pxy - add pressure tensor
+        //speciesMoments.addPxy(temp, ix, iy, iz);
+        Pxy[ix  ][iy  ][iz  ] += uvi*weight000;
+        Pxy[ix  ][iy  ][iz-1] += uvi*weight001;
+        Pxy[ix  ][iy-1][iz  ] += uvi*weight010;
+        Pxy[ix  ][iy-1][iz-1] += uvi*weight011;
+        Pxy[ix-1][iy  ][iz  ] += uvi*weight100;
+        Pxy[ix-1][iy  ][iz-1] += uvi*weight101;
+        Pxy[ix-1][iy-1][iz  ] += uvi*weight110;
+        Pxy[ix-1][iy-1][iz-1] += uvi*weight111;
+        // Pxz - add pressure tensor
+        //speciesMoments.addPxz(temp, ix, iy, iz);
+        Pxz[ix  ][iy  ][iz  ] += uwi*weight000;
+        Pxz[ix  ][iy  ][iz-1] += uwi*weight001;
+        Pxz[ix  ][iy-1][iz  ] += uwi*weight010;
+        Pxz[ix  ][iy-1][iz-1] += uwi*weight011;
+        Pxz[ix-1][iy  ][iz  ] += uwi*weight100;
+        Pxz[ix-1][iy  ][iz-1] += uwi*weight101;
+        Pxz[ix-1][iy-1][iz  ] += uwi*weight110;
+        Pxz[ix-1][iy-1][iz-1] += uwi*weight111;
+        // Pyy - add pressure tensor
+        //speciesMoments.addPyy(temp, ix, iy, iz);
+        Pyy[ix  ][iy  ][iz  ] += vvi*weight000;
+        Pyy[ix  ][iy  ][iz-1] += vvi*weight001;
+        Pyy[ix  ][iy-1][iz  ] += vvi*weight010;
+        Pyy[ix  ][iy-1][iz-1] += vvi*weight011;
+        Pyy[ix-1][iy  ][iz  ] += vvi*weight100;
+        Pyy[ix-1][iy  ][iz-1] += vvi*weight101;
+        Pyy[ix-1][iy-1][iz  ] += vvi*weight110;
+        Pyy[ix-1][iy-1][iz-1] += vvi*weight111;
+        // Pyz - add pressure tensor
+        //speciesMoments.addPyz(temp, ix, iy, iz);
+        Pyz[ix  ][iy  ][iz  ] += vwi*weight000;
+        Pyz[ix  ][iy  ][iz-1] += vwi*weight001;
+        Pyz[ix  ][iy-1][iz  ] += vwi*weight010;
+        Pyz[ix  ][iy-1][iz-1] += vwi*weight011;
+        Pyz[ix-1][iy  ][iz  ] += vwi*weight100;
+        Pyz[ix-1][iy  ][iz-1] += vwi*weight101;
+        Pyz[ix-1][iy-1][iz  ] += vwi*weight110;
+        Pyz[ix-1][iy-1][iz-1] += vwi*weight111;
+        // Pzz - add pressure tensor
+        //speciesMoments.addPzz(temp, ix, iy, iz);
+        Pzz[ix  ][iy  ][iz  ] += wwi*weight000;
+        Pzz[ix  ][iy  ][iz-1] += wwi*weight001;
+        Pzz[ix  ][iy-1][iz  ] += wwi*weight010;
+        Pzz[ix  ][iy-1][iz-1] += wwi*weight011;
+        Pzz[ix-1][iy  ][iz  ] += wwi*weight100;
+        Pzz[ix-1][iy  ][iz-1] += wwi*weight101;
+        Pzz[ix-1][iy-1][iz  ] += wwi*weight110;
+        Pzz[ix-1][iy-1][iz-1] += wwi*weight111;
+      }
+
+      if(b10moments && bmoments10)
+      {
+        // check work
+        for(int jx=0;jx<2;jx++)
+        for(int jy=0;jy<2;jy++)
+        for(int jz=0;jz<2;jz++)
+        {
+          //dprintf("gothere");
+          //dprintf("%24.16f == rho[ix-jx][iy-jy][iz-jz]", rho[ix-jx][iy-jy][iz-jz]);
+          //dprintf("%24.16f == moments[ix-jx][iy-jy][iz-jz][0]", moments[ix-jx][iy-jy][iz-jz][0]);
+          assert_eq(rho[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][0]);
+          //dprintf("gothere");
+          assert_eq(Jx [ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][1]);
+          assert_eq(Jy [ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][2]);
+          assert_eq(Jz [ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][3]);
+          assert_eq(Pxx[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][4]);
+          assert_eq(Pxy[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][5]);
+          assert_eq(Pxz[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][6]);
+          assert_eq(Pyy[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][7]);
+          assert_eq(Pyz[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][8]);
+          assert_eq(Pzz[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][9]);
+        }
+      }
+    }
+    // split up the reduction tasks.
+    //
+    if(b10moments)
+    {
       //
-      moments[ix-1][iy-1][iz-1][0] += velmoments[0]*weight111;
-      moments[ix-1][iy-1][iz-1][1] += velmoments[1]*weight111;
-      moments[ix-1][iy-1][iz-1][2] += velmoments[2]*weight111;
-      moments[ix-1][iy-1][iz-1][3] += velmoments[3]*weight111;
-      moments[ix-1][iy-1][iz-1][4] += velmoments[4]*weight111;
-      moments[ix-1][iy-1][iz-1][5] += velmoments[5]*weight111;
-      moments[ix-1][iy-1][iz-1][6] += velmoments[6]*weight111;
-      moments[ix-1][iy-1][iz-1][7] += velmoments[7]*weight111;
-      moments[ix-1][iy-1][iz-1][8] += velmoments[8]*weight111;
-      moments[ix-1][iy-1][iz-1][9] += velmoments[9]*weight111;
+      // One-dimensional array access is presumably
+      // more efficient on poor compilers.
+      double* rho1d = &rho[0][0][0];
+      double* Jx1d  = &Jx [0][0][0];
+      double* Jy1d  = &Jy [0][0][0];
+      double* Jz1d  = &Jz [0][0][0];
+      double* Pxx1d = &Pxx[0][0][0];
+      double* Pxy1d = &Pxy[0][0][0];
+      double* Pxz1d = &Pxz[0][0][0];
+      double* Pyy1d = &Pyy[0][0][0];
+      double* Pyz1d = &Pyz[0][0][0];
+      double* Pzz1d = &Pzz[0][0][0];
+      ////
+      assert_eq(speciesMoments.get_nx(), nxn);
+      assert_eq(speciesMoments.get_ny(), nyn);
+      assert_eq(speciesMoments.get_nz(), nzn);
+      const int numel = nxn*nyn*nzn;
+      #pragma omp critical
+      for(int i=0;i<numel;i++) rhons1d[i] += invVOL*rho1d[i];
+      #pragma omp critical
+      for(int i=0;i<numel;i++) Jxs1d  [i] += invVOL*Jx1d [i];
+      #pragma omp critical
+      for(int i=0;i<numel;i++) Jys1d  [i] += invVOL*Jy1d [i];
+      #pragma omp critical
+      for(int i=0;i<numel;i++) Jzs1d  [i] += invVOL*Jz1d [i];
+      #pragma omp critical
+      for(int i=0;i<numel;i++) pXXsn1d[i] += invVOL*Pxx1d[i];
+      #pragma omp critical
+      for(int i=0;i<numel;i++) pXYsn1d[i] += invVOL*Pxy1d[i];
+      #pragma omp critical
+      for(int i=0;i<numel;i++) pXZsn1d[i] += invVOL*Pxz1d[i];
+      #pragma omp critical
+      for(int i=0;i<numel;i++) pYYsn1d[i] += invVOL*Pyy1d[i];
+      #pragma omp critical
+      for(int i=0;i<numel;i++) pYZsn1d[i] += invVOL*Pyz1d[i];
+      #pragma omp critical
+      for(int i=0;i<numel;i++) pZZsn1d[i] += invVOL*Pzz1d[i];
+    }
+    else if(bmoments10)
+    {
       //
-      //for(int jx=0;jx<2;jx++)
-      //for(int jy=0;jy<2;jy++)
-      //for(int jz=0;jz<2;jz++)
-      //for(int m=0;m<10;m++)
-      //{
-      //  moments[ix-jx][iy-jy][iz-jz][m] += velmoments[m]*weight[jx][jy][jz];
-      //}
+      #pragma omp critical
+      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { rhons[is][i][j][k] += invVOL*moments[i][j][k][0]; }
+      #pragma omp critical
+      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1]; }
+      #pragma omp critical
+      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { Jys  [is][i][j][k] += invVOL*moments[i][j][k][2]; }
+      #pragma omp critical
+      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3]; }
+      #pragma omp critical
+      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4]; }
+      #pragma omp critical
+      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5]; }
+      #pragma omp critical
+      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6]; }
+      #pragma omp critical
+      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7]; }
+      #pragma omp critical
+      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8]; }
+      #pragma omp critical
+      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }
+    }
+    else
+    {
+      eprintf("reduction impossible without data!");
     }
-    // We split up the reduction tasks.
-    //
-    const int numel = nxn*nyn*nzn;
-    #pragma omp critical collapse(3)
-    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-      rhons[is][i][j][k] += invVOL*moments[i][j][k][0];
-    #pragma omp critical collapse(3)
-    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-      Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1];
-    #pragma omp critical collapse(3)
-    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-      Jys  [is][i][j][k] += invVOL*moments[i][j][k][2];
-    #pragma omp critical collapse(3)
-    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-      Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3];
-    #pragma omp critical collapse(3)
-    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-      pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4];
-    #pragma omp critical collapse(3)
-    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-      pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5];
-    #pragma omp critical collapse(3)
-    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-      pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6];
-    #pragma omp critical collapse(3)
-    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-      pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7];
-    #pragma omp critical collapse(3)
-    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-      pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8];
-    #pragma omp critical collapse(3)
-    for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-      pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9];
   }
   communicateGhostP2G(is, 0, 0, 0, 0, vct);
 }
@@ -1365,26 +1574,26 @@ void EMfields3D::communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft,
 }
 
 /* add moments (e.g. from an OpenMP thread) to the accumulated moments */
-void EMfields3D::addToSpeciesMoments(const Moments & in, int is) {
-  assert_eq(in.get_nx(), nxn);
-  assert_eq(in.get_ny(), nyn);
-  assert_eq(in.get_nz(), nzn);
-  for (register int i = 0; i < nxn; i++) {
-    for (register int j = 0; j < nyn; j++)
-      for (register int k = 0; k < nzn; k++) {
-        rhons[is][i][j][k] += invVOL*in.get_rho(i, j, k);
-        Jxs  [is][i][j][k] += invVOL*in.get_Jx(i, j, k);
-        Jys  [is][i][j][k] += invVOL*in.get_Jy(i, j, k);
-        Jzs  [is][i][j][k] += invVOL*in.get_Jz(i, j, k);
-        pXXsn[is][i][j][k] += invVOL*in.get_pXX(i, j, k);
-        pXYsn[is][i][j][k] += invVOL*in.get_pXY(i, j, k);
-        pXZsn[is][i][j][k] += invVOL*in.get_pXZ(i, j, k);
-        pYYsn[is][i][j][k] += invVOL*in.get_pYY(i, j, k);
-        pYZsn[is][i][j][k] += invVOL*in.get_pYZ(i, j, k);
-        pZZsn[is][i][j][k] += invVOL*in.get_pZZ(i, j, k);
-      }
-  }
-}
+//void EMfields3D::addToSpeciesMoments(const Moments & in, int is) {
+//  assert_eq(in.get_nx(), nxn);
+//  assert_eq(in.get_ny(), nyn);
+//  assert_eq(in.get_nz(), nzn);
+//  for (register int i = 0; i < nxn; i++) {
+//    for (register int j = 0; j < nyn; j++)
+//      for (register int k = 0; k < nzn; k++) {
+//        rhons[is][i][j][k] += invVOL*in.get_rho(i, j, k);
+//        Jxs  [is][i][j][k] += invVOL*in.get_Jx(i, j, k);
+//        Jys  [is][i][j][k] += invVOL*in.get_Jy(i, j, k);
+//        Jzs  [is][i][j][k] += invVOL*in.get_Jz(i, j, k);
+//        pXXsn[is][i][j][k] += invVOL*in.get_pXX(i, j, k);
+//        pXYsn[is][i][j][k] += invVOL*in.get_pXY(i, j, k);
+//        pXZsn[is][i][j][k] += invVOL*in.get_pXZ(i, j, k);
+//        pYYsn[is][i][j][k] += invVOL*in.get_pYY(i, j, k);
+//        pYZsn[is][i][j][k] += invVOL*in.get_pYZ(i, j, k);
+//        pZZsn[is][i][j][k] += invVOL*in.get_pZZ(i, j, k);
+//      }
+//  }
+//}
 
 /*! set to 0 all the densities fields */
 void EMfields3D::setZeroDensities() {
@@ -3190,7 +3399,9 @@ EMfields3D::~EMfields3D() {
   delete injFieldsRear;
   for(int i=0;i<sizeMomentsArray;i++)
   {
-    momentsArray[i]->free();
+    delete momentsArray[i];
+    moments10[i]->free();
   }
-  free(momentsArray);
+  delete [] momentsArray;
+  free(moments10);
 }
diff --git a/include/Alloc.h b/include/Alloc.h
index 46c68fd9..8e589321 100644
--- a/include/Alloc.h
+++ b/include/Alloc.h
@@ -609,7 +609,7 @@ namespace iPic3D
         { check_idx_bounds(n4,n3,n2,n1); arr4[n4][n3][n2][n1] = value; }
     #endif
     protected:
-      void setall(double val)
+      void setall(type val)
       {
         #pragma omp for
         for(int i=0;i<size;i++)
@@ -651,7 +651,7 @@ namespace iPic3D
         { const_array_ref4<type>::set(n4,n3,n2,n1, value); }
       void free(){ delArray4<type>((type****)arr4); }
       type**** fetch_arr4(){ return (type****) arr4; }
-      void setall(double val) { const_array_ref4<type>::setall(val); }
+      void setall(type val) { const_array_ref4<type>::setall(val); }
       //bool verify_dims(size_t s4, size_t s3, size_t s2, size_t s1){
       //  if(s4==S4 && s3==S3 && s2==S2 && s1==S1) return true;
       //  Wprintf("%d==%d && %d==%d && %d==%d && %d==%d failed",
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index d0653090..b2105b0b 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -119,7 +119,7 @@ class EMfields3D                // :public Field
     void communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, VirtualTopology3D * vct);
     void sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
     /*! add accumulated moments to the moments for a given species */
-    void addToSpeciesMoments(const Moments & in, int is);
+    //void addToSpeciesMoments(const Moments & in, int is);
     /*! add an amount of charge density to charge density field at node X,Y,Z */
     void addRho(double weight[][2][2], int X, int Y, int Z, int is);
     /*! add an amount of current density - direction X to current density field at node X,Y,Z */
@@ -256,11 +256,16 @@ class EMfields3D                // :public Field
     double getBenergy();
 
     /*! fetch array for summing moments of thread i */
-    arr4_double fetch_momentsArray(int i){
+    Moments& fetch_momentsArray(int i){
       assert_le(0,i);
       assert_le(i,sizeMomentsArray);
       return *momentsArray[i];
     }
+    arr4_double fetch_moments10(int i){ // fetch the i-th moments10 array; valid indices are 0 <= i < sizeMomentsArray
+      assert_le(0,i);
+      assert_lt(i,sizeMomentsArray); // strict bound: the destructor iterates i < sizeMomentsArray, so i == sizeMomentsArray is one past the end
+      return *moments10[i];
+    }
 
     /*! print electromagnetic fields info */
     void print(void) const;
@@ -384,8 +389,8 @@ class EMfields3D                // :public Field
     array3_double divC;
     /* temporary arrays for summing moments */
     int sizeMomentsArray;
-    //Moments **momentsArray;
-    arr4_double** momentsArray;
+    Moments **momentsArray;
+    arr4_double** moments10;
 
 
     // *******************************************************************************

From 9f0e972bceb05c789343c0ba903388e0e58e1b1b Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 12 Aug 2013 21:15:18 +0200
Subject: [PATCH 032/118] turned off use of moments10 introduced in dc4a2dcb81:
 not working with -openmp for some unclear reason that I don't have time to
 isolate.

---
 fields/EMfields3D.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 7cfea8f1..93e4cf9c 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -223,8 +223,8 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   double const*const q = pcls.getQall();
   //
   const int is = pcls.get_ns();
-  bool bmoments10 = true;
-  bool b10moments = false;
+  bool bmoments10 = false;
+  bool b10moments = true; // turn on doing it the old way
 
   // if b10moments
   double* rhons1d = &rhons[is][0][0][0];
@@ -536,6 +536,16 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
         Pzz[ix-1][iy-1][iz-1] += wwi*weight111;
       }
 
+      // why on earth do I observe the following:
+      // * without openmp, b10moments and bmoments10 gives same results,
+      // * b10moments gives same results with and without openmp, and
+      // * bmoments10 gives wrong results when I use openmp.
+      // I'm using Moments class and moments array exactly the same way
+      // as far as openmp is concerned...  To isolate the problem,
+      // gradually morph Moments class until implemented via arr4_double...
+      // Problem in constructor?
+      // 
+      // 
       if(b10moments && bmoments10)
       {
         // check work

From 07a20af3f2092aec1718f95b3ae415ff4952ad34 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 24 Sep 2013 11:42:03 +0200
Subject: [PATCH 033/118] issue #46: implemented cmake support for MIC
 cross-compile (Xeon and Xeon Phi)

---
 CMakeLists.txt                     |  59 ++++++++++--
 cmake/cmake_template.cmake.XeonPhi |  15 +++
 fields/EMfields3D.cpp              | 141 ++++++++++++++++++++++++++++-
 inputfiles/Random.inp              |  56 +++++++-----
 main/iPic3Dlib.cpp                 |   7 ++
 5 files changed, 248 insertions(+), 30 deletions(-)
 create mode 100755 cmake/cmake_template.cmake.XeonPhi

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e94ab5e..fc94308a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,9 @@
-cmake_minimum_required(VERSION 2.8.8)
-
+cmake_minimum_required(VERSION 2.8.8) 
+# compiler set in ../cmake/cmake_template.cmake.XeonPhi
+message ("for Xeon Phi:")
+message ("cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/cmake_template.cmake.XeonPhi")
+#message ("for Xeon:")
+#message ("cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/cmake_template.cmake.Xeon")
 #
 # Project declaration
 #
@@ -13,17 +17,40 @@ project(iPic3D)
 set(EXECUTABLE_OUTPUT_PATH work/${CMAKE_BUILD_TYPE})
 set(LIBRARY_OUTPUT_PATH lib)
 
+#
+# Set compiler flags per system
+#
+if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "k1om") ## Xeon Phi
+   set(CMAKE_CXX_FLAGS "-O3 -openmp -fno-exceptions -vec-report -mmic")
+   #set(CMAKE_CXX_FLAGS "-openmp -g -mmic") # set flags for Xeon Phi, totalview
+elseif (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") ## Xeon
+   set(CMAKE_CXX_COMPILER "icpc")  
+   set(CMAKE_CXX_FLAGS "-O3 -openmp -fno-exceptions -vec-report")
+else()
+   set(CMAKE_CXX_FLAGS "-O3")
+endif()
+
 #
 # Find third class libraries
 #
+if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "k1om") ## Xeon Phi
+   set(CMAKE_PREFIX_PATH /opt/hdf5/1.8.10-patch1-mic)			
+   set(VARIOUS_LIB /opt/intel/lib/mic)
+   set (EXTRA_LIBS ${VARIOUS_LIB}/libimf.so ${VARIOUS_LIB}/libsvml.so ${VARIOUS_LIB}/libirng.so ${VARIOUS_LIB}/libintlc.so ${MPELIB})
+else()
+   find_package(MPI REQUIRED)
+   set(EXTRA_LIBS "")   
+endif()
 
 find_package(HDF5 COMPONENTS HL C REQUIRED)
-find_package(MPI REQUIRED)
+message("HDF5_INCLUDE_DIRS is ${HDF5_INCLUDE_DIRS}")
+
 
 #
 # include and lib directories
 #
 
+# include_directories: files there are accessible to the project
 include_directories(
         include
         ${HDF5_INCLUDE_DIRS}
@@ -33,6 +60,7 @@ include_directories(
 link_directories(
         ${HDF5_LIBRARY_DIRS}
         ${MPI_LIBRARY_DIRS}
+	${VARIOUS_LIB}  # a directory, as link_directories expects (unset, hence empty, unless building for Xeon Phi); the library files in EXTRA_LIBS are passed to target_link_libraries instead
 )
 
 #
@@ -91,10 +119,12 @@ add_executable(
         iPic3D.cpp
 )
 
+#build iPic as a library also
+#libiPic3Dlib.so in folder lib
 add_library(
-        iPic3Dlib
-        SHARED
-        ${inc_files}
+        iPic3Dlib  #name of the library
+        SHARED	   #type of the library
+        ${inc_files}    # stuff to build the library
         ${src_files}
 )
 
@@ -107,9 +137,26 @@ target_link_libraries(
          ${HDF5_LIBRARIES}
          ${HDF5_HL_LIBRARIES}
          ${MPI_LIBRARIES}
+	 ${EXTRA_LIBS}
 )
 
 target_link_libraries(
          iPic3D
          iPic3Dlib
 )
+
+
+## to save the executable in the folder where the CMakeLists.txt file is, i.e. CMAKE_CURRENT_SOURCE_DIR
+set_target_properties(iPic3D PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+
+## debug releases have a _d appended to the executable
+set_target_properties(iPic3D PROPERTIES DEBUG_POSTFIX "_d")
+
+
+message("Which system am I compiling for:")
+message("CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
+
+message("Compiler & compiler flags:")
+message("CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER}")
+message("CMAKE_CXX_FLAGS is ${CMAKE_CXX_FLAGS}")
+
diff --git a/cmake/cmake_template.cmake.XeonPhi b/cmake/cmake_template.cmake.XeonPhi
new file mode 100755
index 00000000..fbd0d0c3
--- /dev/null
+++ b/cmake/cmake_template.cmake.XeonPhi
@@ -0,0 +1,15 @@
+SET(CMAKE_SYSTEM_NAME Linux) # toolchain file for the Xeon Phi cross-compile; use as: cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/cmake_template.cmake.XeonPhi
+SET(CMAKE_SYSTEM_PROCESSOR k1om) # CMakeLists.txt compares CMAKE_SYSTEM_PROCESSOR against "k1om" to select the Xeon Phi flags and library paths
+SET(CMAKE_SYSTEM_VERSION 1)
+
+# specify the cross compiler
+SET(CMAKE_C_COMPILER   icc)
+SET(CMAKE_CXX_COMPILER icpc)
+SET(MPI_C_COMPILER mpiicc)
+SET(CMAKE_CXX_COMPILER mpiicpc) # overrides the icpc assignment two lines up, so mpiicpc is the effective C++ compiler; NOTE(review): confirm whether MPI_CXX_COMPILER was intended here
+#SET(CMAKE_CXX_FLAGS "-openmp -O3 -mmic")
+SET(_CMAKE_TOOLCHAIN_PREFIX  x86_64-k1om-linux-)
+
+# where is the target environment 
+#SET(CMAKE_FIND_ROOT_PATH /usr/linux-k1om-4.7)
+SET(CMAKE_FIND_ROOT_PATH /opt/modules/knc/mic) #path to the Intel(R) Manycore Platform Software Stack, as in http://software.intel.com/en-us/articles/cross-compilation-for-intel-xeon-phi-coprocessor-with-cmake
diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 93e4cf9c..d878b3a8 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -2275,8 +2275,9 @@ void EMfields3D::initGEMnoPert(VirtualTopology3D * vct, Grid * grid, Collective
     init(vct, grid, col);            // use the fields from restart file
   }
 }
-
-void EMfields3D::initRandomField(VirtualTopology3D * vct, Grid * grid, Collective *col) {
+/* old init, Random problem */
+#if 0
+void EMfields3D::initRandomFieldOld(VirtualTopology3D * vct, Grid * grid, Collective *col) {
   double **modes_seed = newArr2(double, 7, 7);
   if (restart1 == 0) {
     // initialize
@@ -2336,7 +2337,7 @@ void EMfields3D::initRandomField(VirtualTopology3D * vct, Grid * grid, Collectiv
               Bzn[i][j][k] += B0x * cos(grid->getXN(i, j, k) * kx + grid->getYN(i, j, k) * ky + 2.0 * M_PI * modes_seed[m + 3][n + 3]);
             }
 
-          /* for (int m=1; m < 4; m++) for (int n=1; n < 4; n++){ kx=2.0*M_PI*m/Lx; ky=2.0*M_PI*n/Ly; Bxn[i][j][k] += B0x/kx*cos(grid->getXN(i,j,k)*kx+grid->getYN(i,j,k)*ky+2.0*M_PI*phixy); Byn[i][j][k] += B0x/ky*cos(grid->getXN(i,j,k)*kx+grid->getYN(i,j,k)*ky+2.0*M_PI*phixy); Bzn[i][j][k] += B0x/(kx+ky)*cos(grid->getXN(i,j,k)*kx+grid->getYN(i,j,k)*ky+2.0*M_PI*phiz); } for(int n=1; n < 4; n++){ ky=2.0*M_PI*n/Ly; Bxn[i][j][k] += B0x/(2.0*M_PI/Lx)*cos(grid->getYN(i,j,k)*ky+2.0*M_PI*phix); } for(int m=1; m < 4; m++){ kx=2.0*M_PI*m/Lx; Byn[i][j][k] += B0x/(2.0*M_PI/Ly)*cos(grid->getXN(i,j,k)*kx+2.0*M_PI*phiy); } */
+
         }
     // communicate ghost
     communicateNodeBC(nxn, nyn, nzn, Bxn, col->bcBx[0],col->bcBx[1],col->bcBx[2],col->bcBx[3],col->bcBx[4],col->bcBx[5], vct);
@@ -2358,6 +2359,140 @@ void EMfields3D::initRandomField(VirtualTopology3D * vct, Grid * grid, Collectiv
   }
   delArr2(modes_seed, 7);
 }
+#endif
+
+// New init for the "Random" problem: uniform species density, E = 0, B = (0, 0, B0z) plus a sum of 49 fixed-phase cosine modes in Bx/By; when restart1 != 0 it defers to init() instead.
+void EMfields3D::initRandomField(VirtualTopology3D *vct, Grid *grid, Collective *col)
+{ // vct: rank check and ghost communication; grid: node coordinates and node-to-center interpolation; col: only forwarded to init() on restart
+  double **modes_seed = newArr2(double, 7, 7); // 7x7 table of phase offsets, indexed [m+3][n+3] for m,n in -3..3; released by delArr2 at the end
+  if (restart1 ==0){
+    // fresh start (no restart): print the setup on rank 0, then build the fields
+    if (vct->getCartesian_rank() ==0){
+      cout << "------------------------------------------" << endl;
+      cout << "Initialize GEM Challenge with Pertubation" << endl; // NOTE(review): banner mentions GEM although this is the random-field init; looks copied from another initializer - confirm
+      cout << "------------------------------------------" << endl;
+      cout << "B0x                              = " << B0x << endl;
+      cout << "B0y                              = " << B0y << endl;
+      cout << "B0z                              = " << B0z << endl;
+      cout << "Delta (current sheet thickness) = " << delta << endl;
+      for (int i=0; i < ns; i++){
+	cout << "rho species " << i <<" = " << rhoINIT[i];
+	if (DriftSpecies[i])
+	  cout << " DRIFTING " << endl;
+	else
+	  cout << " BACKGROUND " << endl;
+      }
+      cout << "-------------------------" << endl;
+    }
+    double kx;
+    double ky;
+    // The disabled block below would draw the 49 phases with rand() and write them to a per-rank file; the hard-coded table after it is used instead.
+    /*       stringstream num_proc;
+	     num_proc << vct->getCartesian_rank() ;
+	     string cqsat = SaveDirName + "/RandomNumbers" + num_proc.str() + ".txt";
+        ofstream my_file(cqsat.c_str(), fstream::binary);
+	for (int m=-3; m < 4; m++)
+            for (int n=-3; n < 4; n++){
+            modes_seed[m+3][n+3] = rand() / (double) RAND_MAX;
+            my_file <<"modes_seed["<< m+3<<"][" << "\t" << n+3 << "] = " << modes_seed[m+3][n+3] << endl;
+            }
+              my_file.close();
+    */
+    modes_seed[0][0] = 0.532767;
+    modes_seed[0][1] = 0.218959;
+    modes_seed[0][2] = 0.0470446;
+    modes_seed[0][3] = 0.678865;
+    modes_seed[0][4] = 0.679296;
+    modes_seed[0][5] = 0.934693;
+    modes_seed[0][6] = 0.383502;
+    modes_seed[1][0] = 0.519416;
+    modes_seed[1][1] = 0.830965;
+    modes_seed[1][2] = 0.0345721;
+    modes_seed[1][3] = 0.0534616;
+    modes_seed[1][4] = 0.5297;
+    modes_seed[1][5] = 0.671149;
+    modes_seed[1][6] = 0.00769819;
+    modes_seed[2][0] = 0.383416;
+    modes_seed[2][1] = 0.0668422;
+    modes_seed[2][2] = 0.417486;
+    modes_seed[2][3] = 0.686773;
+    modes_seed[2][4] = 0.588977;
+    modes_seed[2][5] = 0.930436;
+    modes_seed[2][6] = 0.846167;
+    modes_seed[3][0] = 0.526929;
+    modes_seed[3][1] = 0.0919649;
+    modes_seed[3][2] = 0.653919;
+    modes_seed[3][3] = 0.415999;
+    modes_seed[3][4] = 0.701191;
+    modes_seed[3][5] = 0.910321;
+    modes_seed[3][6] = 0.762198;
+    modes_seed[4][0] = 0.262453;
+    modes_seed[4][1] = 0.0474645;
+    modes_seed[4][2] = 0.736082;
+    modes_seed[4][3] = 0.328234;
+    modes_seed[4][4] = 0.632639;
+    modes_seed[4][5] = 0.75641;
+    modes_seed[4][6] = 0.991037;
+    modes_seed[5][0] = 0.365339;
+    modes_seed[5][1] = 0.247039;
+    modes_seed[5][2] = 0.98255;
+    modes_seed[5][3] = 0.72266;
+    modes_seed[5][4] = 0.753356;
+    modes_seed[5][5] = 0.651519;
+    modes_seed[5][6] = 0.0726859;
+    modes_seed[6][0] = 0.631635;
+    modes_seed[6][1] = 0.884707;
+    modes_seed[6][2] = 0.27271;
+    modes_seed[6][3] = 0.436411;
+    modes_seed[6][4] = 0.766495;
+    modes_seed[6][5] = 0.477732;
+    modes_seed[6][6] = 0.237774;
+    // Set densities and fields on all nxn x nyn x nzn nodes.
+    for (int i=0; i < nxn; i++)
+      for (int j=0; j < nyn; j++)
+	for (int k=0; k < nzn; k++){
+	  // initialize the density for species
+	  for (int is=0; is < ns; is++){
+	    rhons[is][i][j][k] = rhoINIT[is]/FourPI;
+	  }
+	  // electric field
+	  Ex[i][j][k] =  0.0;
+	  Ey[i][j][k] =  0.0;
+	  Ez[i][j][k] =  0.0;
+	  // Magnetic field: uniform Bz = B0z; Bx and By start at zero and accumulate the mode sum below
+	  Bxn[i][j][k] =  0.0;
+	  Byn[i][j][k] =  0.0;
+	  Bzn[i][j][k] =  B0z;
+	  for (int m=-3; m < 4; m++)
+	    for (int n=-3; n < 4; n++){
+	      // wave numbers of mode (m,n); the m = n = 0 mode adds nothing because kx = ky = 0
+	      kx=2.0*M_PI*m/Lx;
+	      ky=2.0*M_PI*n/Ly;
+	      Bxn[i][j][k] += -B0x*ky*cos(grid->getXN(i,j,k)*kx+grid->getYN(i,j,k)*ky+2.0*M_PI*modes_seed[m+3][n+3]); // presumably getXN/getYN are node x/y coordinates, in which case each (Bx, By) mode is divergence-free - confirm
+	      Byn[i][j][k] += B0x*kx*cos(grid->getXN(i,j,k)*kx+grid->getYN(i,j,k)*ky+2.0*M_PI*modes_seed[m+3][n+3]);
+	      // Bzn[i][j][k] += B0x*cos(grid->getXN(i,j,k)*kx+grid->getYN(i,j,k)*ky+2.0*M_PI*modes_seed[m+3][n+3]);
+	    }
+	}
+	  // communicate ghost (the boundary-condition codes are literal constants here, not read from col)
+	  communicateNodeBC(nxn, nyn, nzn, Bxn, 1, 1, 2, 2, 1, 1, vct);
+	  communicateNodeBC(nxn, nyn, nzn, Byn, 1, 1, 1, 1, 1, 1, vct);
+	  communicateNodeBC(nxn, nyn, nzn, Bzn, 1, 1, 2, 2, 1, 1, vct);
+	  // initialize B on centers
+	  grid->interpN2C(Bxc, Bxn);
+	  grid->interpN2C(Byc, Byn);
+	  grid->interpN2C(Bzc, Bzn);
+	  // communicate ghost
+	  communicateCenterBC(nxc, nyc, nzc, Bxc, 2, 2, 2, 2, 2, 2, vct);
+	  communicateCenterBC(nxc, nyc, nzc, Byc, 1, 1, 1, 1, 1, 1, vct);
+	  communicateCenterBC(nxc, nyc, nzc, Bzc, 2, 2, 2, 2, 2, 2, vct);
+	  for (int is=0 ; is<ns; is++) // node-to-center interpolation of each species density
+            grid->interpN2C(rhocs,is,rhons);
+	} else { // closes the "restart1 ==0" branch despite the indentation
+    init(vct,grid, col);  // use the fields from restart file
+    } // end of the restart branch
+  delArr2(modes_seed, 7);
+  } // end of initRandomField
+
 
 /*! Init Force Free (JxB=0) */
 void EMfields3D::initForceFree(VirtualTopology3D * vct, Grid * grid, Collective *col) {
diff --git a/inputfiles/Random.inp b/inputfiles/Random.inp
index 03209eb2..b48ef3a4 100644
--- a/inputfiles/Random.inp
+++ b/inputfiles/Random.inp
@@ -9,15 +9,26 @@ RestartDirName = data
 # set the maximum number of particles allocated  
 NpMaxNpRatio = 3.0
 
+# New flags:
+Case              = RandomCase #goToDefault #RandomCase       # Select the case
+PoissonCorrection = no       # Poisson correction
+WriteMethod       = default   # Output method [ default | Parallel ]
+SimulationName    = RandomCase  # Simulation name for the output
+
 #  %%%%%%%%%%%%%%%%%%% Magnetic Reconnection %%%%%%%%%%%%%%%%%%
-B0x = 0.001
+B0x = 0.03
 B0y = 0.00
-B0z = 0.00
+B0z = 0.03
 delta = 0.5
 
+# External magnetic field parameters:
+B1x = 0.000
+B1y = 0.000
+B1z = 0.000
+
 #  %%%%%%%%%%%%%%%%%%% TIME %%%%%%%%%%%%%%%%%%
-dt = 0.5                    # dt = time step  
-ncycles = 2001		     # cycles
+dt = 0.05                    # dt = time step  
+ncycles = 2001		 #!!!    # cycles
 th = 1.0                     # th =   decentering parameter
 c = 1.0                      # c = light speed 
 
@@ -26,8 +37,8 @@ Smooth = 0.5                  # Smoothing value (5-points stencil)
 
 
 # %%%%%%%%%%%%%%%%%% BOX SIZE %%%%%%%%%%%%%%%
-Lx =   10.0                  # Lx = simulation box length - x direction   
-Ly =   10.0                  # Ly = simulation box length - y direction
+Lx =   20.0                  # Lx = simulation box length - x direction   
+Ly =   20.0                  # Ly = simulation box length - y direction
 Lz =   1.0                  # Lz = simulation box length - z direction   
 
 x_center =   1.                  # Lx = simulation box length - x direction in m  
@@ -44,32 +55,35 @@ nzc =  1                   # nzc = number of cells - z direction
 #    0 = electrons
 #    1 = protons
 #    2,3,4,5,... = ions
-ns = 4
+ns = 2
 # Initial density (make sure you are neutral)
-rhoINIT =  0.01	0.01  0.2  0.2
+rhoINIT =  1.0	1.0  
+# Injection density (make sure you are neutral)
+rhoINJECT =  0.0   0.0 
 # TrackParticleID[species] = 1=true, 0=false --> Assign ID to particles 
-TrackParticleID = 0	0 0 0
+TrackParticleID = 0	0 
 # npcelx = number of particles per cell - Direction X 
-npcelx =    8	 8	 8	 8
+npcelx =    3	 3
 # npcely = number of particles per cell - Direction Y 
-npcely =    8	 8	 8	 8
+npcely =    3	 3	
 # npcelz = number of particles per cell - Direction Z 
-npcelz =   1	1	1	1
+#####npcelz =   3 	  3
+npcelz =   1         1
 # qom = charge to mass ratio for different species 
-qom =  -256.0	1.0	-256.0	1.0
+qom =  -256.0	1.0	
 # uth = thermal velocity for different species - Direction X  
-uth  = 0.045      0.0063	0.045      0.0063
+uth  = 0.045      0.0063
 # vth = thermal velocity for different species - Direction Y 
-vth  = 0.045      0.0063	0.045      0.0063
+vth  = 0.045      0.0063
 # wth = thermal velocity for different species - Direction Z 
-wth  = 0.045      0.0063	0.045      0.0063
+wth  = 0.045      0.0063
 # u0 = drift velocity   - Direction X    
-u0 = 0.0	0.0  0.0	0.0  
+u0 = 0.0	0.0 
 # v0 = drift velocity   - Direction Y    
-v0 = 0.0	0.0   0.0	0.0  
+v0 = 0.0	0.0 
 # w0 = drift velocity   - Direction Z    
-w0 = 0.000001	-0.000256 0.0	0.0  
-
+w0 = 0.0	0.0
+#w0 = 0.00325	-0.01624 0.0	0.0 
 
 # &&&&&&&&&&&& boundary conditions &&&&&&&&&&&&&&&
 # PHI Electrostatic Potential     
@@ -118,7 +132,7 @@ w0 = 0.000001	-0.000256 0.0	0.0
 # mover predictor corrector iteration
     NiterMover = 3
 # Output for field
-   FieldOutputCycle = 10
+   FieldOutputCycle = 100
 # Output for particles if 1 it doesnt save particles data
    ParticlesOutputCycle = 1
 # restart cycle
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index b0cb1426..dd731c9d 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -69,6 +69,13 @@ int c_Solver::Init(int argc, char **argv) {
   else if (col->getCase()=="BATSRUS")   EMf->initBATSRUS(vct,grid,col);
 #endif
   else if (col->getCase()=="Dipole")    EMf->initDipole(vct,grid,col);
+  else if (col->getCase()=="RandomCase") {
+    EMf->initRandomField(vct,grid,col);
+    if (myrank==0) {
+      cout << "Case is " << col->getCase() <<"\n";
+      cout <<"total # of particle per cell is " << col->getNpcel(0) << "\n";
+    }
+  }
   else {
     if (myrank==0) {
       cout << " =========================================================== " << endl;

From 28665ed6889034a8e75b30d9b9bcc2e340af1502 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 26 Sep 2013 14:22:17 +0200
Subject: [PATCH 034/118] issue #10: XLEN divides nxc should be enforced

---
 grids/Grid3DCU.cpp | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index 755f3096..5e7c3e2b 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -4,24 +4,23 @@
 
 /*! constructor */
 Grid3DCU::Grid3DCU(CollectiveIO * col, VirtualTopology3D * vct) {
-  // FOR TESTS - this must be uncommented next
   // int get_rank();
   // if(!get_rank())
-  // {
-  // fflush(stdout);
-  // bool xerror = false;
-  // bool yerror = false;
-  // bool zerror = false;
-  // if((col->getNxc()) % (vct->getXLEN())) xerror=true;
-  // if((col->getNyc()) % (vct->getYLEN())) yerror=true;
-  // if((col->getNzc()) % (vct->getZLEN())) zerror=true;
-  // if(xerror) printf("!!!ERROR: XLEN=%d does not divide nxc=%d\n", vct->getXLEN(),col->getNxc());
-  // if(yerror) printf("!!!ERROR: YLEN=%d does not divide nyc=%d\n", vct->getYLEN(),col->getNyc());
-  // if(zerror) printf("!!!ERROR: ZLEN=%d does not divide nzc=%d\n", vct->getZLEN(),col->getNzc());
-  // fflush(stdout);
-  // bool error = xerror||yerror||zerror;
-  // if(error) exit(1);
-  // }
+  {
+    fflush(stdout);
+    bool xerror = false;
+    bool yerror = false;
+    bool zerror = false;
+    if((col->getNxc()) % (vct->getXLEN())) xerror=true;
+    if((col->getNyc()) % (vct->getYLEN())) yerror=true;
+    if((col->getNzc()) % (vct->getZLEN())) zerror=true;
+    if(xerror) printf("!!!ERROR: XLEN=%d does not divide nxc=%d\n", vct->getXLEN(),col->getNxc());
+    if(yerror) printf("!!!ERROR: YLEN=%d does not divide nyc=%d\n", vct->getYLEN(),col->getNyc());
+    if(zerror) printf("!!!ERROR: ZLEN=%d does not divide nzc=%d\n", vct->getZLEN(),col->getNzc());
+    fflush(stdout);
+    bool error = xerror||yerror||zerror;
+    if(error) exit(1);
+  }
   // add 2 for the guard cells
   nxc = (col->getNxc()) / (vct->getXLEN()) + 2;
   nyc = (col->getNyc()) / (vct->getYLEN()) + 2;

From 8b81a53ea9b33f5de5f4d462dafa8e7621c7b759 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 26 Sep 2013 14:23:47 +0200
Subject: [PATCH 035/118] issue #47: created communication/VCtopology3D.cpp to
 speed recompile for XLEN

---
 communication/VCtopology3D.cpp | 118 +++++++++++++
 include/VCtopology3D.h         | 296 +++------------------------------
 2 files changed, 145 insertions(+), 269 deletions(-)
 create mode 100644 communication/VCtopology3D.cpp

diff --git a/communication/VCtopology3D.cpp b/communication/VCtopology3D.cpp
new file mode 100644
index 00000000..5d0da864
--- /dev/null
+++ b/communication/VCtopology3D.cpp
@@ -0,0 +1,118 @@
+#include "mpi.h"
+#include "Alloc.h"
+#include "VCtopology3D.h"
+#include <iostream>
+
+using std::cout;
+using std::endl;
+
+/** DEFINE THE Topology HERE, setting XLEN,YLEN,ZLEN */
+VCtopology3D::VCtopology3D() {
+  // *******************************************
+  // *******************************************
+  // change these values to change the topology
+  XLEN = 2;
+  YLEN = 2;
+  ZLEN = 1;
+  nprocs = XLEN * YLEN * ZLEN;
+  // here you have to set the topology for the fields
+  PERIODICX = true;
+  PERIODICY = false;
+  PERIODICZ = true;
+  // here you have to set the topology for the Particles
+  PERIODICX_P = true;
+  PERIODICY_P = false;
+  PERIODICZ_P = true;
+  // *******************************************
+  // *******************************************
+  XDIR = 0;
+  YDIR = 1;
+  ZDIR = 2;
+  RIGHT = 1;
+  LEFT = -1;
+  // reorder flag handed to MPI_Cart_create in setup_vctopology
+  reorder = 1;
+  // process-grid extents per direction, passed as the dims argument of MPI_Cart_create
+  divisions[0] = XLEN;
+  divisions[1] = YLEN;
+  divisions[2] = ZLEN;
+  // periodicity flags used when creating the field communicator (CART_COMM)
+  periods[0] = PERIODICX;
+  periods[1] = PERIODICY;
+  periods[2] = PERIODICZ;
+  // periodicity flags used when creating the particle communicator (CART_COMM_P)
+  periods_P[0] = PERIODICX_P;
+  periods_P[1] = PERIODICY_P;
+  periods_P[2] = PERIODICZ_P;
+
+
+  cVERBOSE = false;             // value returned by getcVERBOSE(): verbose communication is off
+
+}
+
+
+
+
+
+/** Create the cartesian communicators for fields (CART_COMM) and particles (CART_COMM_P) on top
+  of old_comm; each process then stores its rank, its coordinates and its six neighbors  */
+void VCtopology3D::setup_vctopology(MPI_Comm old_comm) {
+  // cartesian communicator for the fields: grid = divisions, periodicity = periods
+  MPI_Cart_create(old_comm, 3, divisions, periods, reorder, &CART_COMM);
+  // cartesian communicator for the Particles: same grid, periodicity = periods_P
+  MPI_Cart_create(old_comm, 3, divisions, periods_P, reorder, &CART_COMM_P);
+  // field Communicator
+  if (CART_COMM != MPI_COMM_NULL) {
+    MPI_Comm_rank(CART_COMM, &cartesian_rank);
+    MPI_Cart_coords(CART_COMM, cartesian_rank, 3, coordinates);
+    // a shift by RIGHT (+1) returns (source, destination) = (left, right) neighbor
+    MPI_Cart_shift(CART_COMM, XDIR, RIGHT, &xleft_neighbor, &xright_neighbor);
+    MPI_Cart_shift(CART_COMM, YDIR, RIGHT, &yleft_neighbor, &yright_neighbor);
+    MPI_Cart_shift(CART_COMM, ZDIR, RIGHT, &zleft_neighbor, &zright_neighbor);
+  }
+  else {
+    // MPI_Cart_create hands MPI_COMM_NULL to processes that do not fit in the grid
+    cout << "A process is thrown away from the new topology for fields. VCtopology3D.cpp" << endl;
+  }
+  // Particles Communicator
+  if (CART_COMM_P != MPI_COMM_NULL) {
+    MPI_Comm_rank(CART_COMM_P, &cartesian_rank);
+    MPI_Cart_coords(CART_COMM_P, cartesian_rank, 3, coordinates);
+    // NOTE(review): the two calls above overwrite the rank/coordinates read from CART_COMM
+    MPI_Cart_shift(CART_COMM_P, XDIR, RIGHT, &xleft_neighbor_P, &xright_neighbor_P);
+    MPI_Cart_shift(CART_COMM_P, YDIR, RIGHT, &yleft_neighbor_P, &yright_neighbor_P);
+    MPI_Cart_shift(CART_COMM_P, ZDIR, RIGHT, &zleft_neighbor_P, &zright_neighbor_P);
+  }
+  else {
+    // MPI_Cart_create hands MPI_COMM_NULL to processes that do not fit in the grid
+    cout << "A process is thrown away from the new topology for Particles. VCtopology3D.cpp" << endl;
+  }
+
+}
+/** destructor: empty body, no explicit cleanup is performed */
+VCtopology3D::~VCtopology3D() {
+  // NOTE(review): CART_COMM and CART_COMM_P are not freed anywhere in this file - confirm that is intended
+}
+/** Print the processor grid size and the per-direction periodicity flags (fields and particles) to standard output */
+void VCtopology3D::Print() {
+  cout << endl;
+  cout << "Virtual Cartesian Processors Topology" << endl;
+  cout << "-------------------------------------" << endl;
+  cout << "Processors grid: " << XLEN << "x" << YLEN << "x" << ZLEN << endl;
+  cout << "Periodicity Field X: " << periods[0] << endl;
+  cout << "Periodicity Field Y: " << periods[1] << endl;
+  cout << "Periodicity Field z: " << periods[2] << endl;
+  cout << "Periodicity Particles X: " << periods_P[0] << endl;
+  cout << "Periodicity Particles Y: " << periods_P[1] << endl;
+  cout << "Periodicity Particles z: " << periods_P[2] << endl;
+  cout << endl;
+}
+/** Print this process's cartesian rank, its grid coordinates and the ranks of its six field-communicator neighbors to standard output */
+void VCtopology3D::PrintMapping() {
+  cout << endl;
+  cout << "Mapping of process " << cartesian_rank << endl;
+  cout << "----------------------" << endl;
+  cout << "Coordinates: X = " << coordinates[0] << "; Y = " << coordinates[1] << "; Z = " << coordinates[2] << endl;
+  cout << "Neighbors: xLeft = " << xleft_neighbor << "; xRight = " << xright_neighbor << "; yLeft = " << yleft_neighbor << "; yRight = " << yright_neighbor << "; zLeft = " << zleft_neighbor << "; zRight = " << zright_neighbor << endl;
+  cout << endl;
+}
diff --git a/include/VCtopology3D.h b/include/VCtopology3D.h
index c7e2cc06..2bab31ed 100644
--- a/include/VCtopology3D.h
+++ b/include/VCtopology3D.h
@@ -14,15 +14,7 @@ developers           : Stefano Markidis, Giovanni Lapenta
 #ifndef VCtopology3D_H
 #define VCtopology3D_H
 
-#include "mpi.h"
 #include "VirtualTopology3D.h"
-#include "Alloc.h"
-#include <iostream>
-
-
-
-using std::cout;
-using std::endl;
 
 /**
  *  
@@ -48,56 +40,33 @@ class VCtopology3D:public VirtualTopology3D {
   void Print();
   /** Print the mapping of topology */
   void PrintMapping();
-  /** get XLEN */
-  int getXLEN();
-  /** get YLEN */
-  int getYLEN();
-  /** get ZLEN */
-  int getZLEN();
-  /** get nprocs */
-  int getNprocs();
-  /** get periodicity on boundaries - DIRECTION X*/
-  bool getPERIODICX();
-  /** get periodicity on boundaries - DIRECTION Y*/
-  bool getPERIODICY();
-  /** get periodicity on boundaries - DIRECTION Z*/
-  bool getPERIODICZ();
-  /** get the cartesian rank of the process */
-  int getCartesian_rank();
-  /** get the cartesian rank of XLEFT neighbor */
-  int getXleft_neighbor();
-  /** get the cartesian rank of XRIGHT neighbor */
-  int getXright_neighbor();
-  /** get the cartesian rank of YLEFT neighbor */
-  int getYleft_neighbor();
-  /** get the cartesian rank of YRIGHT neighbor */
-  int getYright_neighbor();
-  /** get the cartesian rank of ZLEFT neighbor */
-  int getZleft_neighbor();
-  /** get the cartesian rank of ZRIGHT neighbor */
-  int getZright_neighbor();
-  /** get the cartesian rank of XLEFT neighbor */
-  int getXleft_neighbor_P();
-  /** get the cartesian rank of XRIGHT neighbor */
-  int getXright_neighbor_P();
-  /** get the cartesian rank of YLEFT neighbor */
-  int getYleft_neighbor_P();
-  /** get the cartesian rank of YRIGHT neighbor */
-  int getYright_neighbor_P();
-  /** get the cartesian rank of ZLEFT neighbor */
-  int getZleft_neighbor_P();
-  /** get the cartesian rank of ZRIGHT neighbor */
-  int getZright_neighbor_P();
-  /** get the coordinates in dir direction of process*/
-  int getCoordinates(int dir);
-  /** get the coordinates of process*/
-  int *getCoordinates();
-  /** get Periodicity condition in dir direction */
-  int getPeriods(int dir);
-  /** if cVERBOSE == true, print to the screen all the comunication */
-  bool getcVERBOSE();
-  /** get the MPI communicator */
-  MPI_Comm getComm();
+
+  int getXLEN() { return (XLEN); }  // processor-grid extent in X
+  int getYLEN() { return (YLEN); }  // processor-grid extent in Y
+  int getZLEN() { return (ZLEN); }  // processor-grid extent in Z
+  int getNprocs() { return (nprocs); }  // XLEN * YLEN * ZLEN
+  bool getPERIODICX() { return (PERIODICX); }  // field periodicity on boundaries - direction X
+  bool getPERIODICY() { return (PERIODICY); }  // field periodicity on boundaries - direction Y
+  bool getPERIODICZ() { return (PERIODICZ); }  // field periodicity on boundaries - direction Z
+  int getCartesian_rank() { return (cartesian_rank); }  // cartesian rank of this process
+  int getXleft_neighbor() { return (xleft_neighbor); }  // cartesian rank of XLEFT neighbor (fields)
+  int getXright_neighbor() { return (xright_neighbor); }  // cartesian rank of XRIGHT neighbor (fields)
+  int getYleft_neighbor() { return (yleft_neighbor); }  // cartesian rank of YLEFT neighbor (fields)
+  int getYright_neighbor() { return (yright_neighbor); }  // cartesian rank of YRIGHT neighbor (fields)
+  int getZleft_neighbor() { return (zleft_neighbor); }  // cartesian rank of ZLEFT neighbor (fields)
+  int getZright_neighbor() { return (zright_neighbor); }  // cartesian rank of ZRIGHT neighbor (fields)
+  int getXleft_neighbor_P() { return (xleft_neighbor_P); }  // cartesian rank of XLEFT neighbor (particles)
+  int getXright_neighbor_P() { return (xright_neighbor_P); }  // cartesian rank of XRIGHT neighbor (particles)
+  int getYleft_neighbor_P() { return (yleft_neighbor_P); }  // cartesian rank of YLEFT neighbor (particles)
+  int getYright_neighbor_P() { return (yright_neighbor_P); }  // cartesian rank of YRIGHT neighbor (particles)
+  int getZleft_neighbor_P() { return (zleft_neighbor_P); }  // cartesian rank of ZLEFT neighbor (particles)
+  int getZright_neighbor_P() { return (zright_neighbor_P); }  // cartesian rank of ZRIGHT neighbor (particles)
+  bool getcVERBOSE() { return (cVERBOSE); }  // if true, print all the communication to the screen
+  int getCoordinates(int dir) { return (coordinates[dir]); }  // coordinate of this process in direction dir (dir is not range-checked)
+  int *getCoordinates() { return (coordinates); }  // pointer to the internal coordinates array (not a copy)
+  int getPeriods(int dir) { return (periods[dir]); }  // field periodicity flag in direction dir (dir is not range-checked)
+  MPI_Comm getComm(){ return (CART_COMM); }  // the field cartesian communicator
+
 
 private:
   /** New communicator with virtual cartesian topology */
@@ -178,215 +147,4 @@ class VCtopology3D:public VirtualTopology3D {
   bool cVERBOSE;
 };
 
-/** DEFINE THE Topology HERE, setting XLEN,YLEN,ZLEN */
-inline VCtopology3D::VCtopology3D() {
-  // *******************************************
-  // *******************************************
-  // change these values to change the topology
-  XLEN = 2;
-  YLEN = 2;
-  ZLEN = 1;
-  nprocs = XLEN * YLEN * ZLEN;
-  // here you have to set the topology for the fields
-  PERIODICX = true;
-  PERIODICY = false;
-  PERIODICZ = true;
-  // here you have to set the topology for the Particles
-  PERIODICX_P = true;
-  PERIODICY_P = false;
-  PERIODICZ_P = true;
-  // *******************************************
-  // *******************************************
-  XDIR = 0;
-  YDIR = 1;
-  ZDIR = 2;
-  RIGHT = 1;
-  LEFT = -1;
-
-  reorder = 1;
-
-  divisions[0] = XLEN;
-  divisions[1] = YLEN;
-  divisions[2] = ZLEN;
-
-  periods[0] = PERIODICX;
-  periods[1] = PERIODICY;
-  periods[2] = PERIODICZ;
-
-  periods_P[0] = PERIODICX_P;
-  periods_P[1] = PERIODICY_P;
-  periods_P[2] = PERIODICZ_P;
-
-
-  cVERBOSE = false;             // communication verbose ?
-
-}
-
-
-
-
-
-/** Within CART_COMM, processes find about their new rank numbers, their cartesian coordinates,
-  and their neighbors  */
-inline void VCtopology3D::setup_vctopology(MPI_Comm old_comm) {
-  // create a matrix with ranks, and neighbours for fields
-  MPI_Cart_create(old_comm, 3, divisions, periods, reorder, &CART_COMM);
-  // create a matrix with ranks, and neighbours for Particles
-  MPI_Cart_create(old_comm, 3, divisions, periods_P, reorder, &CART_COMM_P);
-  // field Communicator
-  if (CART_COMM != MPI_COMM_NULL) {
-    MPI_Comm_rank(CART_COMM, &cartesian_rank);
-    MPI_Cart_coords(CART_COMM, cartesian_rank, 3, coordinates);
-
-    MPI_Cart_shift(CART_COMM, XDIR, RIGHT, &xleft_neighbor, &xright_neighbor);
-    MPI_Cart_shift(CART_COMM, YDIR, RIGHT, &yleft_neighbor, &yright_neighbor);
-    MPI_Cart_shift(CART_COMM, ZDIR, RIGHT, &zleft_neighbor, &zright_neighbor);
-  }
-  else {
-    // EXCEPTION
-    cout << "A process is trown away from the new topology for fields. VCtopology3D.h" << endl;
-  }
-  // Particles Communicator
-  if (CART_COMM_P != MPI_COMM_NULL) {
-    MPI_Comm_rank(CART_COMM_P, &cartesian_rank);
-    MPI_Cart_coords(CART_COMM_P, cartesian_rank, 3, coordinates);
-
-    MPI_Cart_shift(CART_COMM_P, XDIR, RIGHT, &xleft_neighbor_P, &xright_neighbor_P);
-    MPI_Cart_shift(CART_COMM_P, YDIR, RIGHT, &yleft_neighbor_P, &yright_neighbor_P);
-    MPI_Cart_shift(CART_COMM_P, ZDIR, RIGHT, &zleft_neighbor_P, &zright_neighbor_P);
-  }
-  else {
-    // EXCEPTION
-    cout << "A process is trown away from the new topology for Particles. VCtopology3D.h" << endl;
-  }
-
-}
-/** destructor */
-inline VCtopology3D::~VCtopology3D() {
-
-}
-/** print topology info */
-inline void VCtopology3D::Print() {
-  cout << endl;
-  cout << "Virtual Cartesian Processors Topology" << endl;
-  cout << "-------------------------------------" << endl;
-  cout << "Processors grid: " << XLEN << "x" << YLEN << "x" << ZLEN << endl;
-  cout << "Periodicity Field X: " << periods[0] << endl;
-  cout << "Periodicity Field Y: " << periods[1] << endl;
-  cout << "Periodicity Field z: " << periods[2] << endl;
-  cout << "Periodicity Particles X: " << periods_P[0] << endl;
-  cout << "Periodicity Particles Y: " << periods_P[1] << endl;
-  cout << "Periodicity Particles z: " << periods_P[2] << endl;
-  cout << endl;
-}
-/** print cartesian rank of neighbors and coordinate of process */
-inline void VCtopology3D::PrintMapping() {
-  cout << endl;
-  cout << "Mapping of process " << cartesian_rank << endl;
-  cout << "----------------------" << endl;
-  cout << "Coordinates: X = " << coordinates[0] << "; Y = " << coordinates[1] << "; Z = " << coordinates[2] << endl;
-  cout << "Neighbors: xLeft = " << xleft_neighbor << "; xRight = " << xright_neighbor << "; yLeft = " << yleft_neighbor << "; yRight = " << yright_neighbor << "; zLeft = " << zleft_neighbor << "; zRight = " << zright_neighbor << endl;
-  cout << endl;
-}
-/** get XLEN */
-inline int VCtopology3D::getXLEN() {
-  return (XLEN);
-}
-/** get YLEN */
-inline int VCtopology3D::getYLEN() {
-  return (YLEN);
-}
-/** get ZLEN */
-inline int VCtopology3D::getZLEN() {
-  return (ZLEN);
-}
-/** get nprocs */
-inline int VCtopology3D::getNprocs() {
-  return (nprocs);
-}
-/** get periodicity on boundaries - DIRECTION X*/
-inline bool VCtopology3D::getPERIODICX() {
-  return (PERIODICX);
-}
-/** get periodicity on boundaries - DIRECTION Y*/
-inline bool VCtopology3D::getPERIODICY() {
-  return (PERIODICY);
-}
-/** get periodicity on boundaries - DIRECTION Z*/
-inline bool VCtopology3D::getPERIODICZ() {
-  return (PERIODICZ);
-}
-/** get the cartesian rank of the process */
-inline int VCtopology3D::getCartesian_rank() {
-  return (cartesian_rank);
-}
-/** get the cartesian rank of XLEFT neighbor */
-inline int VCtopology3D::getXleft_neighbor() {
-  return (xleft_neighbor);
-}
-/** get the cartesian rank of XRIGHT neighbor */
-inline int VCtopology3D::getXright_neighbor() {
-  return (xright_neighbor);
-}
-/** get the cartesian rank of YLEFT neighbor */
-inline int VCtopology3D::getYleft_neighbor() {
-  return (yleft_neighbor);
-}
-/** get the cartesian rank of YRIGHT neighbor */
-inline int VCtopology3D::getYright_neighbor() {
-  return (yright_neighbor);
-}
-/** get the cartesian rank of ZLEFT neighbor */
-inline int VCtopology3D::getZleft_neighbor() {
-  return (zleft_neighbor);
-}
-/** get the cartesian rank of ZRIGHT neighbor */
-inline int VCtopology3D::getZright_neighbor() {
-  return (zright_neighbor);
-}
-/** get the cartesian rank of XLEFT neighbor */
-inline int VCtopology3D::getXleft_neighbor_P() {
-  return (xleft_neighbor_P);
-}
-/** get the cartesian rank of XRIGHT neighbor */
-inline int VCtopology3D::getXright_neighbor_P() {
-  return (xright_neighbor_P);
-}
-/** get the cartesian rank of YLEFT neighbor */
-inline int VCtopology3D::getYleft_neighbor_P() {
-  return (yleft_neighbor_P);
-}
-/** get the cartesian rank of YRIGHT neighbor */
-inline int VCtopology3D::getYright_neighbor_P() {
-  return (yright_neighbor_P);
-}
-/** get the cartesian rank of ZLEFT neighbor */
-inline int VCtopology3D::getZleft_neighbor_P() {
-  return (zleft_neighbor_P);
-}
-/** get the cartesian rank of ZRIGHT neighbor */
-inline int VCtopology3D::getZright_neighbor_P() {
-  return (zright_neighbor_P);
-}
-/** if cVERBOSE == true, print to the screen all the comunication */
-inline bool VCtopology3D::getcVERBOSE() {
-  return (cVERBOSE);
-}
-/** get the coordinates in dir direction of process*/
-inline int VCtopology3D::getCoordinates(int dir) {
-  return (coordinates[dir]);
-}
-/** get the coordinates in dir direction of process*/
-inline int *VCtopology3D::getCoordinates() {
-  return (coordinates);
-}
-/** get Periodicity condition in dir direction */
-inline int VCtopology3D::getPeriods(int dir) {
-  return (periods[dir]);
-}
-/** Get the MPI communicator */
-inline MPI_Comm VCtopology3D::getComm(){
-  return (CART_COMM);
-}
-
 #endif

From d68535c54b5c87eb575f5649b502873fe6348958 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 26 Sep 2013 17:21:57 +0200
Subject: [PATCH 036/118] issue #48: executable is now created as exec/iPic3D
 in the build directory.

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc94308a..c6b9473e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,7 +14,7 @@ project(iPic3D)
 # Set exec path
 #
 
-set(EXECUTABLE_OUTPUT_PATH work/${CMAKE_BUILD_TYPE})
+set(EXECUTABLE_OUTPUT_PATH exec/${CMAKE_BUILD_TYPE})
 set(LIBRARY_OUTPUT_PATH lib)
 
 #
@@ -147,7 +147,7 @@ target_link_libraries(
 
 
 ## to save the executable in the folder where the CMakeLists.txt file is, i.e. CMAKE_CURRENT_SOURCE_DIR
-set_target_properties(iPic3D PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+#set_target_properties(iPic3D PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 
 ## debug releases have a _d appended to the executable
 set_target_properties(iPic3D PROPERTIES DEBUG_POSTFIX "_d")

From 46fd165ab06c80ee3826e65122a3b83df4604f22 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 1 Oct 2013 16:13:52 +0200
Subject: [PATCH 037/118] issue #49: create ipic scripts system: ctags and help

---
 scripts/ipic            | 16 ++++++++++++++++
 scripts/ipic-ctags      |  8 ++++++++
 scripts/ipic-help       | 37 +++++++++++++++++++++++++++++++++++++
 scripts/ipic-help-ctags | 22 ++++++++++++++++++++++
 scripts/ipic-help-mic   | 39 +++++++++++++++++++++++++++++++++++++++
 scripts/makefiletags    | 14 ++++++++++++++
 scripts/tags            |  7 +++++++
 7 files changed, 143 insertions(+)
 create mode 100755 scripts/ipic
 create mode 100755 scripts/ipic-ctags
 create mode 100755 scripts/ipic-help
 create mode 100755 scripts/ipic-help-ctags
 create mode 100755 scripts/ipic-help-mic
 create mode 100755 scripts/makefiletags
 create mode 100644 scripts/tags

diff --git a/scripts/ipic b/scripts/ipic
new file mode 100755
index 00000000..25158819
--- /dev/null
+++ b/scripts/ipic
@@ -0,0 +1,25 @@
+#!/bin/sh
+# ipic: front end that dispatches "ipic <command> [args...]" to the sibling
+# script "ipic-<command>" located in the same directory as this script.
+if test $# -lt 1
+then
+  echo '
+  usage: ipic <command>
+
+  Available ipic commands:
+    ipic ctags
+    ipic help
+'
+  # a missing command is a usage error, so report failure to the caller
+  exit 1
+fi
+# $0 is quoted so that an installation path containing spaces still works
+DIRNAME=`dirname "$0"`
+APPENDIX="$1"
+shift
+if test ! -x "${DIRNAME}/ipic-${APPENDIX}"
+then
+  echo "ipic: unknown command '${APPENDIX}' (run 'ipic' without arguments for a list)" >&2
+  exit 1
+fi
+exec "${DIRNAME}/ipic-${APPENDIX}" "$@"
diff --git a/scripts/ipic-ctags b/scripts/ipic-ctags
new file mode 100755
index 00000000..df4767be
--- /dev/null
+++ b/scripts/ipic-ctags
@@ -0,0 +1,13 @@
+#!/bin/sh
+# ipic-ctags: build a "tags" file in the current directory covering every
+# *.cpp and *.h file below it, plus one tag per file name (via makefiletags).
+DIRNAME=`dirname "$0"`
+echo creating tags file using ctags
+# "-exec ... {} +" hands the names straight to the command, so names with
+# whitespace survive (a pipe into xargs would split them); "-o" is the
+# portable spelling of "-or".
+find . \( -name '*.cpp' -o -name '*.h' \) -exec ctags --extra=+q {} +
+echo creating tag for each C++ file
+find . \( -name '*.cpp' -o -name '*.h' \) -exec "$DIRNAME/makefiletags" {} + >> tags
+echo sorting tags file
+LC_ALL=C sort -u tags -o tags
diff --git a/scripts/ipic-help b/scripts/ipic-help
new file mode 100755
index 00000000..5dc5cf41
--- /dev/null
+++ b/scripts/ipic-help
@@ -0,0 +1,46 @@
+#!/bin/sh
+# ipic-help: with no argument print the general build/run instructions;
+# otherwise dispatch "ipic help <topic>" to the sibling script ipic-help-<topic>.
+if test $# -lt 1
+then
+  echo '
+  To build, in the iPic3D directory you can use:
+  
+    rm -rf build # if necessary
+    mkdir build
+    cd build
+    cmake ..
+    make # or "make -j" to compile in parallel
+  
+  Before you build, you should first configure the number of MPI
+  processes you will use.  To do so, you currently have to edit
+  "communication/VCtopology3D.cpp" (and then recompile in the build
+  directory).  The lines you need to change are:
+  
+    XLEN = 2;
+    YLEN = 2;
+    ZLEN = 1;
+  
+  To run the code you can use
+  
+    mkdir data
+    mpiexec.hydra -n 4 -env OMP_NUM_THREADS=1 exec/iPic3D ../inputfiles/GEM.inp
+  
+  where 4 = XLEN times YLEN times ZLEN.
+  
+  Available subcommands:
+    ipic help ctags
+    ipic help mic
+'
+  exit
+fi
+# $0 is quoted so that an installation path containing spaces still works
+DIRNAME=`dirname "$0"`
+APPENDIX="$1"
+shift
+if test ! -x "${DIRNAME}/ipic-help-${APPENDIX}"
+then
+  echo "ipic help: unknown topic '${APPENDIX}' (run 'ipic help' for a list)" >&2
+  exit 1
+fi
+exec "${DIRNAME}/ipic-help-${APPENDIX}" "$@"
diff --git a/scripts/ipic-help-ctags b/scripts/ipic-help-ctags
new file mode 100755
index 00000000..e80c00c1
--- /dev/null
+++ b/scripts/ipic-help-ctags
@@ -0,0 +1,23 @@
+#!/bin/sh
+# ipic-help-ctags: print instructions for running "ipic ctags".
+DIRNAME=`dirname "$0"`
+# resolve the scripts directory to an absolute path; "&&" keeps pwd from
+# reporting the caller's directory when the cd fails
+SCRIPTSDIRNAME=`cd "${DIRNAME}" && pwd`
+PARENTOFSCRIPTSDIRNAME=`dirname "${SCRIPTSDIRNAME}"`
+  echo '
+  Make sure that you are in the source code directory via e.g.
+
+    cd '"${PARENTOFSCRIPTSDIRNAME}"'
+
+  and then run the script, e.g. via
+
+    '"${SCRIPTSDIRNAME}"'/ipic ctags
+
+  or
+
+    ipic ctags
+
+  if you have '"${SCRIPTSDIRNAME}"' in your path.
+'
+
diff --git a/scripts/ipic-help-mic b/scripts/ipic-help-mic
new file mode 100755
index 00000000..ecb81bce
--- /dev/null
+++ b/scripts/ipic-help-mic
@@ -0,0 +1,39 @@
+#!/bin/sh
+# ipic-help-mic: print topology, build and run hints for Xeon and
+# Xeon Phi (MIC) machines; takes no arguments.
+echo '
+  For the Xeon, you might want to change the topology in
+  "communication/VCtopology3D.cpp" to:
+    XLEN = 4;
+    YLEN = 2;
+    ZLEN = 1;
+  
+  For the Xeon Phi, you might want:
+  
+    XLEN = 10;
+    YLEN = 5;
+    ZLEN = 1;
+  
+  Then to run the code on the Xeon you would use something like:
+  
+    mkdir data
+    mpiexec.hydra -n 8 -env OMP_NUM_THREADS=4 exec/iPic3D ../inputfiles/GEM.inp
+  
+  where 8 = XLEN times YLEN times ZLEN.
+  
+  If you want to cross-compile for the MIC, then the instructions are
+  different:
+  
+    mkdir build.phi
+    cd build.phi
+    cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/cmake_template.cmake.XeonPhi
+    make -j
+  
+  And to run you use, e.g.:
+  
+    mkdir data
+    mpiexec.hydra -host knc2-mic0 -n 50 -env OMP_NUM_THREADS=4 exec/iPic3D ../inputfiles/GEM.inp
+  
+  where 50 = XLEN times YLEN times ZLEN.
+'
+
diff --git a/scripts/makefiletags b/scripts/makefiletags
new file mode 100755
index 00000000..da0fb7d1
--- /dev/null
+++ b/scripts/makefiletags
@@ -0,0 +1,13 @@
+#!/bin/sh
+# makefiletags: for each file name argument print one tags-file line
+#   <basename> TAB <path as given> TAB 1
+# on standard output.
+# The arguments are walked directly instead of being parsed from "ls"
+# output, so names containing spaces stay intact and "ls" error messages
+# can no longer end up in the output as bogus tags.
+for FILENAME in "$@"
+do
+  printf '%s\t%s\t1\n' "${FILENAME##*/}" "${FILENAME}"
+done
+# the tags file itself gets a tag too (appended directly to ./tags, not stdout)
+printf 'tags\ttags\t1\n' >> tags
diff --git a/scripts/tags b/scripts/tags
new file mode 100644
index 00000000..9f0219d6
--- /dev/null
+++ b/scripts/tags
@@ -0,0 +1,7 @@
+ipic	ipic	1
+ipic-ctags	ipic-ctags	1
+ipic-help	ipic-help	1
+ipic-help-ctags	ipic-help-ctags	1
+ipic-help-mic	ipic-help-mic	1
+makefiletags	makefiletags	1
+tags	tags	1

From e3d1efd86d339fa9a017c87e08ff8e0e7d00cbb3 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 1 Oct 2013 23:33:04 +0200
Subject: [PATCH 038/118] Removing "siamo qua" comment.

---
 ConfigFile/src/ConfigFile.h | 1 -
 include/ConfigFile.h        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/ConfigFile/src/ConfigFile.h b/ConfigFile/src/ConfigFile.h
index d8d8108a..02c89ea4 100644
--- a/ConfigFile/src/ConfigFile.h
+++ b/ConfigFile/src/ConfigFile.h
@@ -166,7 +166,6 @@ template <> inline bool ConfigFile::string_as_T < bool > (const string & s) {
     *p = toupper(*p);           // make string all caps
   if (sup == string("FALSE") || sup == string("F") || sup == string("NO") || sup == string("N") || sup == string("0") || sup == string("NONE")) {
     b = false;
-    cout << "siamo qua " << endl;
   }
   return b;
 }
diff --git a/include/ConfigFile.h b/include/ConfigFile.h
index 79008be2..62f2d2db 100644
--- a/include/ConfigFile.h
+++ b/include/ConfigFile.h
@@ -166,7 +166,6 @@ template <> inline bool ConfigFile::string_as_T < bool > (const string & s) {
     *p = toupper(*p);           // make string all caps
   if (sup == string("FALSE") || sup == string("F") || sup == string("NO") || sup == string("N") || sup == string("0") || sup == string("NONE")) {
     b = false;
-    cout << "siamo qua " << endl;
   }
   return b;
 }

From 2ac8e7644b3b375e20a2711bd616b53782968e31 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 1 Oct 2013 23:52:14 +0200
Subject: [PATCH 039/118] iss #47: put XLEN and PERIODICX in GEM.inp; iss #50:
 CollectiveIO removed

---
 communication/VCtopology3D.cpp |  21 +-
 include/Collective.h           | 283 ++++++++++---------------
 include/CollectiveIO.h         | 367 +++++++++++++++++----------------
 include/VCtopology3D.h         |   3 +-
 inputfiles/GEM.inp             |  14 +-
 inputfiles/Random.inp          |  10 +-
 inputoutput/Collective.cpp     | 319 +---------------------------
 main/iPic3Dlib.cpp             |   2 +-
 8 files changed, 335 insertions(+), 684 deletions(-)

diff --git a/communication/VCtopology3D.cpp b/communication/VCtopology3D.cpp
index 5d0da864..b88d0878 100644
--- a/communication/VCtopology3D.cpp
+++ b/communication/VCtopology3D.cpp
@@ -1,5 +1,6 @@
 #include "mpi.h"
 #include "Alloc.h"
+#include "Collective.h"
 #include "VCtopology3D.h"
 #include <iostream>
 
@@ -7,22 +8,22 @@ using std::cout;
 using std::endl;
 
 /** DEFINE THE Topology HERE, setting XLEN,YLEN,ZLEN */
-VCtopology3D::VCtopology3D() {
+VCtopology3D::VCtopology3D(const Collective& col) {
   // *******************************************
   // *******************************************
   // change these values to change the topology
-  XLEN = 2;
-  YLEN = 2;
-  ZLEN = 1;
+  XLEN = col.getXLEN();
+  YLEN = col.getYLEN();
+  ZLEN = col.getZLEN();
   nprocs = XLEN * YLEN * ZLEN;
   // here you have to set the topology for the fields
-  PERIODICX = true;
-  PERIODICY = false;
-  PERIODICZ = true;
+  PERIODICX = col.getPERIODICX();
+  PERIODICY = col.getPERIODICY();
+  PERIODICZ = col.getPERIODICZ();
   // here you have to set the topology for the Particles
-  PERIODICX_P = true;
-  PERIODICY_P = false;
-  PERIODICZ_P = true;
+  PERIODICX_P = col.getPERIODICX();
+  PERIODICY_P = col.getPERIODICY();
+  PERIODICZ_P = col.getPERIODICZ();
   // *******************************************
   // *******************************************
   XDIR = 0;
diff --git a/include/Collective.h b/include/Collective.h
index 7126522d..e4155bcf 100644
--- a/include/Collective.h
+++ b/include/Collective.h
@@ -8,25 +8,33 @@
 #ifndef Collective_H
 #define Collective_H
 
+#ifdef BATSRUS
+#include "InterfaceFluid.h"
+#endif
 
 
 #include <math.h>
-#include <iostream>
-#include <fstream>
+//#include <iostream>
+//#include <fstream>
 #include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "ConfigFile.h"
 #include "input_array.h"
 #include "hdf5.h"
-#include "CollectiveIO.h"
+//#include "CollectiveIO.h"
+using namespace std;
 
 using std::cout;
 using std::endl;
 using std::ofstream;
 using namespace std;
 
-class Collective:public CollectiveIO {
+class Collective
+#ifdef BATSRUS
+: public InterfaceFluid
+#endif
+{
   public:
     /*! constructor: initialize physical parameters with values */
     Collective(int argc, char **argv);
@@ -40,174 +48,96 @@ class Collective:public CollectiveIO {
     void Print();
     /*! save setting in a file */
     void save();
-    /*! get the physical space dimensions */
-    int getDim();
-    /*! Get length of the system - direction X */
-    double getLx();
-    /*! Get length of the system - direction Y */
-    double getLy();
-    /*! Get length of the system - direction Z */
-    double getLz();
-    /*! Get object center - direction X */
-    double getx_center();
-    /*! Get object center - direction Y */
-    double gety_center();
-    /*! Get object center - direction Z */
-    double getz_center();
-    /*! Get object size - cubic box */
-    double getL_square();
-    /*! Get the number of cells - direction X */
-    int getNxc();
-    /*! Get the number of cells - direction Y */
-    int getNyc();
-    /*! Get the number of cells - direction Z */
-    int getNzc();
-    /*! Get the grid spacing - direction X */
-    double getDx();
-    /*! Get the grid spacing - direction Y */
-    double getDy();
-    /*! Get the grid spacing - direction Z */
-    double getDz();
-    /*! get the light speed */
-    double getC();
-    /*! get the time step */
-    double getDt();
-    /*! get the decentering parameter */
-    double getTh();
-    /*! get the Smoothing value */
-    double getSmooth();
-    /*! get the number of time cycles */
-    int getNcycles();
-    /*! get the number of species */
-    int getNs();
-    /*! get the number of particles for different species */
-    long getNp(int nspecies);
-    /*! get the number of particles per cell */
-    int getNpcel(int nspecies);
-    /*! get the number of particles per cell - direction X */
-    int getNpcelx(int nspecies);
-    /*! get the number of particles per cell - direction Y */
-    int getNpcely(int nspecies);
-    /*! get the number of particles per cell - direction Z */
-    int getNpcelz(int nspecies);
-    /*! get maximum number of particles for different species */
-    long getNpMax(int nspecies);
-    /*! NpMax/Np is the ratio between the maximum number of particles allowed on a processor and the number of particles */
-    double getNpMaxNpRatio();
-    /*! get charge to mass ratio for different species */
-    double getQOM(int nspecies);
-    /*! get background charge for GEM challenge */
-    double getRHOinit(int nspecies);
-    /*! get rho injection */
-    double getRHOinject(int nspecies);
-    /*! get thermal velocity - X direction */
-    double getUth(int nspecies);
-    /*! get thermal velocity - Y direction */
-    double getVth(int nspecies);
-    /*! get thermal velocity - Z direction */
-    double getWth(int nspecies);
-    /*! get Drift velocity - Direction X */
-    double getU0(int nspecies);
-    /*! get Drift velocity - Direction Y */
-    double getV0(int nspecies);
-    /*! get Drift velocity - Direction Z */
-    double getW0(int nspecies);
-    /*! get the boolean value for TrackParticleID */
-    bool getTrackParticleID(int nspecies);
-    /*! get SaveDirName */
-    string getSaveDirName();
-    /*! get last_cycle */
-    int getLast_cycle();
-    /*! get RestartDirName */
-    string getRestartDirName();
-
-    /*! get Case type */
-    string getCase();
-    /*! get output writing method */
-    string getWriteMethod();
-    /*! get simulation name */
-    string getSimName();
-    /*! get Poisson correction flag */
-    string getPoissonCorrection();
-
-    /*! get Boundary Condition Particles: FaceXright */
-    int getBcPfaceXright();
-    /*! get Boundary Condition Particles: FaceXleft */
-    int getBcPfaceXleft();
-    /*! get Boundary Condition Particles: FaceYright */
-    int getBcPfaceYright();
-    /*! get Boundary Condition Particles: FaceYleft */
-    int getBcPfaceYleft();
-    /*! get Boundary Condition Particles: FaceYright */
-    int getBcPfaceZright();
-    /*! get Boundary Condition Particles: FaceYleft */
-    int getBcPfaceZleft();
-
-    /*! get Boundary Condition Electrostatic Potential: FaceXright */
-    int getBcPHIfaceXright();
-    /*! get Boundary Condition Electrostatic Potential:FaceXleft */
-    int getBcPHIfaceXleft();
-    /*! get Boundary Condition Electrostatic Potential:FaceYright */
-    int getBcPHIfaceYright();
-    /*! get Boundary Condition Electrostatic Potential:FaceYleft */
-    int getBcPHIfaceYleft();
-    /*! get Boundary Condition Electrostatic Potential:FaceYright */
-    int getBcPHIfaceZright();
-    /*! get Boundary Condition Electrostatic Potential:FaceYleft */
-    int getBcPHIfaceZleft();
-
-    /*! get Boundary ConditionElectric Field: FaceXright */
-    int getBcEMfaceXright();
-    /*! get Boundary Condition Electric Field: FaceXleft */
-    int getBcEMfaceXleft();
-    /*! get Boundary Condition Electric Field: FaceYright */
-    int getBcEMfaceYright();
-    /*! get Boundary Condition Electric Field: FaceYleft */
-    int getBcEMfaceYleft();
-    /*! get Boundary Condition Electric Field: FaceZright */
-    int getBcEMfaceZright();
-    /*! get Boundary Condition Electric Field: FaceZleft */
-    int getBcEMfaceZleft();
-
-    /*! get RESTART */
-    int getRestart_status();
-
-    /*! get the sheet thickness */
-    double getDelta();
-    /*! get the amplitude of the magnetic field along x */
-    double getB0x();
-    /*! get the amplitude of the magnetic field along y */
-    double getB0y();
-    /*! get the amplitude of the magnetic field along z */
-    double getB0z();
-    /*! get the amplitude of the magnetic field 1 along x */
-    double getB1x();
-    /*! get the amplitude of the magnetic field 1 along y */
-    double getB1y();
-    /*! get the amplitude of the magnetic field 1 along z */
-    double getB1z();
-
-    /*! get the boolean value for verbose results */
-    bool getVerbose();
-
-    /*! get the velocity of injection of the plasma from the wall */
-    double getVinj();
-
-    /*! get the converging tolerance for CG solver */
-    double getCGtol();
-    /*! get the converging tolerance for GMRES solver */
-    double getGMREStol();
-    /*! get the numbers of iteration for the PC mover */
-    int getNiterMover();
 
-    /*! output of fields */
-    int getFieldOutputCycle();
-    /*! output of particles */
-    int getParticlesOutputCycle();
-    /*! output of restart */
-    int getRestartOutputCycle();
-    /*! output of diagnostics */
-    int getDiagnosticsOutputCycle();
+    // accessors: inline const getters, each returns the stored parameter;
+    // getters taking nspecies index the per-species arrays without a bounds check
+    int getDim()const{ return (dim); } /*!< physical space dimensions */
+    double getLx()const{ return (Lx); } /*!< length of the system - direction X */
+    double getLy()const{ return (Ly); } /*!< length of the system - direction Y */
+    double getLz()const{ return (Lz); } /*!< length of the system - direction Z */
+    double getx_center()const{ return (x_center); } /*!< object center - direction X */
+    double gety_center()const{ return (y_center); } /*!< object center - direction Y */
+    double getz_center()const{ return (z_center); } /*!< object center - direction Z */
+    double getL_square()const{ return (L_square); } /*!< object size - cubic box */
+    int getNxc()const{ return (nxc); } /*!< number of cells - direction X */
+    int getNyc()const{ return (nyc); } /*!< number of cells - direction Y */
+    int getNzc()const{ return (nzc); } /*!< number of cells - direction Z */
+    int getXLEN()const{ return (XLEN); } /*!< number of MPI subdomains - direction X */
+    int getYLEN()const{ return (YLEN); } /*!< number of MPI subdomains - direction Y */
+    int getZLEN()const{ return (ZLEN); } /*!< number of MPI subdomains - direction Z */
+    bool getPERIODICX()const{ return (PERIODICX); } /*!< periodicity flag - direction X */
+    bool getPERIODICY()const{ return (PERIODICY); } /*!< periodicity flag - direction Y */
+    bool getPERIODICZ()const{ return (PERIODICZ); } /*!< periodicity flag - direction Z */
+    double getDx()const{ return (dx); } /*!< grid spacing - direction X */
+    double getDy()const{ return (dy); } /*!< grid spacing - direction Y */
+    double getDz()const{ return (dz); } /*!< grid spacing - direction Z */
+    double getC()const{ return (c); } /*!< light speed */
+    double getDt()const{ return (dt); } /*!< time step */
+    double getTh()const{ return (th); } /*!< decentering parameter */
+    double getSmooth()const{ return (Smooth); } /*!< Smoothing value */
+    int getNcycles()const{ return (ncycles); } /*!< number of time cycles */
+    int getNs()const{ return (ns); } /*!< number of species */
+    int getNpcel(int nspecies)const{ return (npcel[nspecies]); } /*!< particles per cell for species nspecies */
+    int getNpcelx(int nspecies)const{ return (npcelx[nspecies]); } /*!< particles per cell for species nspecies - direction X */
+    int getNpcely(int nspecies)const{ return (npcely[nspecies]); } /*!< particles per cell for species nspecies - direction Y */
+    int getNpcelz(int nspecies)const{ return (npcelz[nspecies]); } /*!< particles per cell for species nspecies - direction Z */
+    long getNp(int nspecies)const{ return (np[nspecies]); } /*!< number of particles for species nspecies */
+    long getNpMax(int nspecies)const{ return (npMax[nspecies]); } /*!< maximum number of particles for species nspecies */
+    double getNpMaxNpRatio()const{ return (NpMaxNpRatio); } /*!< NpMax/Np: ratio between the maximum number of particles allowed on a processor and the number of particles */
+    double getQOM(int nspecies)const{ return (qom[nspecies]); } /*!< charge to mass ratio for species nspecies */
+    double getRHOinit(int nspecies)const{ return (rhoINIT[nspecies]); } /*!< background charge for GEM challenge */
+    double getRHOinject(int nspecies)const { return(rhoINJECT[nspecies]); } /*!< rho injection */
+    double getUth(int nspecies)const{ return (uth[nspecies]); } /*!< thermal velocity - X direction */
+    double getVth(int nspecies)const{ return (vth[nspecies]); } /*!< thermal velocity - Y direction */
+    double getWth(int nspecies)const{ return (wth[nspecies]); } /*!< thermal velocity - Z direction */
+    double getU0(int nspecies)const{ return (u0[nspecies]); } /*!< drift velocity - direction X */
+    double getV0(int nspecies)const{ return (v0[nspecies]); } /*!< drift velocity - direction Y */
+    double getW0(int nspecies)const{ return (w0[nspecies]); } /*!< drift velocity - direction Z */
+    int getBcPfaceXright()const{ return (bcPfaceXright); } /*!< Boundary Condition Particles: FaceXright */
+    int getBcPfaceXleft()const{ return (bcPfaceXleft); } /*!< Boundary Condition Particles: FaceXleft */
+    int getBcPfaceYright()const{ return (bcPfaceYright); } /*!< Boundary Condition Particles: FaceYright */
+    int getBcPfaceYleft()const{ return (bcPfaceYleft); } /*!< Boundary Condition Particles: FaceYleft */
+    int getBcPfaceZright()const{ return (bcPfaceZright); } /*!< Boundary Condition Particles: FaceZright */
+    int getBcPfaceZleft()const{ return (bcPfaceZleft); } /*!< Boundary Condition Particles: FaceZleft */
+    int getBcPHIfaceXright()const{ return (bcPHIfaceXright); } /*!< Boundary Condition Electrostatic Potential: FaceXright */
+    int getBcPHIfaceXleft()const{ return (bcPHIfaceXleft); } /*!< Boundary Condition Electrostatic Potential: FaceXleft */
+    int getBcPHIfaceYright()const{ return (bcPHIfaceYright); } /*!< Boundary Condition Electrostatic Potential: FaceYright */
+    int getBcPHIfaceYleft()const{ return (bcPHIfaceYleft); } /*!< Boundary Condition Electrostatic Potential: FaceYleft */
+    int getBcPHIfaceZright()const{ return (bcPHIfaceZright); } /*!< Boundary Condition Electrostatic Potential: FaceZright */
+    int getBcPHIfaceZleft()const{ return (bcPHIfaceZleft); } /*!< Boundary Condition Electrostatic Potential: FaceZleft */
+    int getBcEMfaceXright()const{ return (bcEMfaceXright); } /*!< Boundary Condition Electric Field: FaceXright */
+    int getBcEMfaceXleft()const{ return (bcEMfaceXleft); } /*!< Boundary Condition Electric Field: FaceXleft */
+    int getBcEMfaceYright()const{ return (bcEMfaceYright); } /*!< Boundary Condition Electric Field: FaceYright */
+    int getBcEMfaceYleft()const{ return (bcEMfaceYleft); } /*!< Boundary Condition Electric Field: FaceYleft */
+    int getBcEMfaceZright()const{ return (bcEMfaceZright); } /*!< Boundary Condition Electric Field: FaceZright */
+    int getBcEMfaceZleft()const{ return (bcEMfaceZleft); } /*!< Boundary Condition Electric Field: FaceZleft */
+    double getDelta()const{ return (delta); } /*!< sheet thickness */
+    double getB0x()const{ return (B0x); } /*!< amplitude of the magnetic field along x */
+    double getB0y()const{ return (B0y); } /*!< amplitude of the magnetic field along y */
+    double getB0z()const{ return (B0z); } /*!< amplitude of the magnetic field along z */
+    double getB1x()const{ return (B1x); } /*!< amplitude of the magnetic field 1 along x */
+    double getB1y()const{ return (B1y); } /*!< amplitude of the magnetic field 1 along y */
+    double getB1z()const{ return (B1z); } /*!< amplitude of the magnetic field 1 along z */
+    bool getVerbose()const{ return (verbose); } /*!< boolean value for verbose results */
+    bool getTrackParticleID(int nspecies)const
+      { return (TrackParticleID[nspecies]); } /*!< boolean value for TrackParticleID of species nspecies */
+    int getRestart_status()const{ return (restart_status); } /*!< restart status: 0 = no restart, 1 = restart and create new, 2 = restart and append */
+    string getSaveDirName()const{ return (SaveDirName); } /*!< SaveDirName (returned by value) */
+    string getRestartDirName()const{ return (RestartDirName); } /*!< RestartDirName (returned by value) */
+    string getinputfile()const{ return (inputfile); } /*!< inputfile (returned by value) */
+    string getCase()const{ return (Case); } /*!< Case type */
+    string getSimName()const{ return (SimName); } /*!< simulation name */
+    string getWriteMethod()const{ return (wmethod); } /*!< output writing method */
+    string getPoissonCorrection()const{ return (PoissonCorrection); } /*!< Poisson correction flag */
+    int getLast_cycle()const{ return (last_cycle); } /*!< last_cycle */
+    double getVinj()const{ return (Vinj); } /*!< velocity of injection of the plasma from the wall */
+    double getCGtol()const{ return (CGtol); } /*!< converging tolerance for CG solver */
+    double getGMREStol()const{ return (GMREStol); } /*!< converging tolerance for GMRES solver */
+    int getNiterMover()const{ return (NiterMover); } /*!< number of iterations for the PC mover */
+    int getFieldOutputCycle()const{ return (FieldOutputCycle); } /*!< output of fields */
+    int getParticlesOutputCycle()const{ return (ParticlesOutputCycle); } /*!< output of particles */
+    int getRestartOutputCycle()const{ return (RestartOutputCycle); } /*!< output of restart */
+    int getDiagnosticsOutputCycle()const{ return (DiagnosticsOutputCycle); } /*!< output of diagnostics */
 
     /*! Boundary condition selection for BCFace for the electric field components */
     int bcEx[6], bcEy[6], bcEz[6];
@@ -257,6 +187,14 @@ class Collective:public CollectiveIO {
     double dy;
     /*! grid spacing - Z direction */
     double dz;
+    /*! number of MPI subdomains in each direction */
+    int XLEN; /*!< number of MPI subdomains - direction X */
+    int YLEN; /*!< number of MPI subdomains - direction Y */
+    int ZLEN; /*!< number of MPI subdomains - direction Z */
+    /*! periodicity in each direction */
+    bool PERIODICX; /*!< periodicity flag - direction X */
+    bool PERIODICY; /*!< periodicity flag - direction Y */
+    bool PERIODICZ; /*!< periodicity flag - direction Z */
     /*! number of species */
     int ns;
     /*! number of particles per cell */
@@ -307,8 +245,6 @@ class Collective:public CollectiveIO {
     string SaveDirName;
     /*! RestartDirName */
     string RestartDirName;
-    /*! get inputfile */
-    string getinputfile();
     /*! restart_status 0 --> no restart; 1--> restart, create new; 2--> restart, append; */
     int restart_status;
     /*! last cycle */
@@ -393,5 +329,6 @@ class Collective:public CollectiveIO {
     /*! Output for diagnostics */
     int DiagnosticsOutputCycle;
 };
+typedef Collective CollectiveIO; // alias kept so code naming the former base class CollectiveIO still compiles
 
 #endif
diff --git a/include/CollectiveIO.h b/include/CollectiveIO.h
index 2fd7ffe5..f208b5f6 100644
--- a/include/CollectiveIO.h
+++ b/include/CollectiveIO.h
@@ -8,16 +8,17 @@ developers           : Stefano Markidis, Giovanni Lapenta
 #ifndef CollectiveIO_H
 #define CollectiveIO_H
 
-#include <math.h>
-#include <iostream>
-#include <string.h>
-#include <stdlib.h>
+#include "Collective.h"
+//#include <math.h>
+//#include <iostream>
+//#include <string.h>
+//#include <stdlib.h>
 
-#ifdef BATSRUS
-#include "InterfaceFluid.h"
-#endif
+//#ifdef BATSRUS
+//#include "InterfaceFluid.h"
+//#endif
 
-using namespace std;
+//using namespace std;
 /**
  *  Abstract base class for inputing physical parameters for simulation.
  *
@@ -28,179 +29,179 @@ using namespace std;
  * @version 1.0
  */
 
-#ifdef BATSRUS
-class CollectiveIO : public InterfaceFluid{
-#else
-class CollectiveIO {
-#endif
-public:
-  /** read input file */
-  virtual void ReadInput(string inputfile) = 0;
-  /** read the restart input file from HDF5 */
-  virtual int ReadRestart(string inputfile) = 0;
-  /** print simulation parameters */
-  virtual void Print(void) = 0;
-  /** print simulation parameters */
-  virtual void save(void) = 0;
-  /** get the physical space dimensions            */
-  virtual int getDim() = 0;
-  /** get simulation box length - direction X */
-  virtual double getLx(void) = 0;
-  /** get simulation box length - direction Y */
-  virtual double getLy(void) = 0;
-  /** get simulation box length - direction Z */
-  virtual double getLz(void) = 0;
-  /** get object center - direction X */
-  virtual double getx_center(void) = 0;
-  /** get object center - direction Y */
-  virtual double gety_center(void) = 0;
-  /** get object center - direction Z */
-  virtual double getz_center(void) = 0;
-  /** get object size - cubic box */
-  virtual double getL_square(void) = 0;
-  /** get number of cells - direction X */
-  virtual int getNxc(void) = 0;
-  /** get number of cells - direction Y */
-  virtual int getNyc(void) = 0;
-  /** get number of cells - direction Z */
-  virtual int getNzc(void) = 0;
-  /** get grid spacing - direction X */
-  virtual double getDx(void) = 0;
-  /** get grid spacing - direction Y */
-  virtual double getDy(void) = 0;
-  /** get grid spacing - direction z */
-  virtual double getDz(void) = 0;
-  /** get the light speed */
-  virtual double getC() = 0;
-  /** get the time step */
-  virtual double getDt() = 0;
-  /** get the decentering parameter */
-  virtual double getTh() = 0;
-  /** get the Smoothing value*/
-  virtual double getSmooth() = 0;
-  /** get the number of time cycles */
-  virtual int getNcycles() = 0;
-  /** get the number of species */
-  virtual int getNs() = 0;
-  /** get the number of particles array for different species */
-  virtual long getNp(int nspecies) = 0;
-  /** get the number of particles per cell  */
-  virtual int getNpcel(int nspecies) = 0;
-  /** get the number of particles per cell - direction X  */
-  virtual int getNpcelx(int nspecies) = 0;
-  /** get the number of particles per cell - direction Y  */
-  virtual int getNpcely(int nspecies) = 0;
-  /** get the number of particles per cell - direction Z  */
-  virtual int getNpcelz(int nspecies) = 0;
-  /** get maximum number of particles for different species */
-  virtual long getNpMax(int nspecies) = 0;
-  /** NpMax/Np is the ratio between the maximum number of particles allowed on a processor and the number of particles*/
-  virtual double getNpMaxNpRatio() = 0;
-  /** get charge to mass ratio for different species */
-  virtual double getQOM(int nspecies) = 0;
-  /** get background charge for GEM challenge */
-  virtual double getRHOinit(int nspecies) = 0;
-  /** get rho for injection */
-  virtual double getRHOinject(int nspecies)=0;
-  /** get thermal velocity  - X direction    */
-  virtual double getUth(int nspecies) = 0;
-  /** get thermal velocity  - Y direction    */
-  virtual double getVth(int nspecies) = 0;
-  /** get thermal velocity  - Z direction    */
-  virtual double getWth(int nspecies) = 0;
-  /** get Drift velocity - Direction X         */
-  virtual double getU0(int nspecies) = 0;
-  /** get Drift velocity - Direction Y         */
-  virtual double getV0(int nspecies) = 0;
-  /** get Drift velocity - Direction Z         */
-  virtual double getW0(int nspecies) = 0;
-  /** get the boolean value for TrackParticleID */
-  virtual bool getTrackParticleID(int nspecies) = 0;
-  /** get SaveDirName  */
-  virtual string getSaveDirName() = 0;
-  /** get last_cycle  */
-  virtual int getLast_cycle() = 0;
-  /** get RestartDirName  */
-  virtual string getRestartDirName() = 0;
-
-  /** get Case type */
-  virtual string getCase() = 0;
-  /** get simulation name */
-  virtual string getSimName() = 0;
-  /** get Poisson correction flag */
-  virtual string getPoissonCorrection() = 0;
-
-  /** get Boundary Condition Particles: FaceXright */
-  virtual int getBcPfaceXright() = 0;
-  /** get Boundary Condition Particles: FaceXleft */
-  virtual int getBcPfaceXleft() = 0;
-  /** get Boundary Condition Particles: FaceYright */
-  virtual int getBcPfaceYright() = 0;
-  /** get Boundary Condition Particles: FaceYleft */
-  virtual int getBcPfaceYleft() = 0;
-  /** get Boundary Condition Particles: FaceYright */
-  virtual int getBcPfaceZright() = 0;
-  /** get Boundary Condition Particles: FaceYleft */
-  virtual int getBcPfaceZleft() = 0;
-
-  /** get Boundary Condition Electrostatic Potential: FaceXright */
-  virtual int getBcPHIfaceXright() = 0;
-  /** get Boundary Condition Electrostatic Potential:FaceXleft */
-  virtual int getBcPHIfaceXleft() = 0;
-  /** get Boundary Condition Electrostatic Potential:FaceYright */
-  virtual int getBcPHIfaceYright() = 0;
-  /** get Boundary Condition Electrostatic Potential:FaceYleft */
-  virtual int getBcPHIfaceYleft() = 0;
-  /** get Boundary Condition Electrostatic Potential:FaceYright */
-  virtual int getBcPHIfaceZright() = 0;
-  /** get Boundary Condition Electrostatic Potential:FaceYleft */
-  virtual int getBcPHIfaceZleft() = 0;
-
-  /** get Boundary ConditionElectric Field: FaceXright */
-  virtual int getBcEMfaceXright() = 0;
-  /** get Boundary Condition Electric Field: FaceXleft */
-  virtual int getBcEMfaceXleft() = 0;
-  /** get Boundary Condition Electric Field: FaceYright */
-  virtual int getBcEMfaceYright() = 0;
-  /** get Boundary Condition Electric Field: FaceYleft */
-  virtual int getBcEMfaceYleft() = 0;
-  /** get Boundary Condition Electric Field: FaceYright */
-  virtual int getBcEMfaceZright() = 0;
-  /** get Boundary Condition Electric Field: FaceYleft */
-  virtual int getBcEMfaceZleft() = 0;
-
-  /** get RESTART */
-  virtual int getRestart_status() = 0;
-
-
-  /** Get GEM Challenge parameters */
-
-  virtual double getDelta() = 0;
-  virtual double getB0x() = 0;
-  virtual double getB0y() = 0;
-  virtual double getB0z() = 0;
-
-  /** get the boolean value for verbose results */
-  virtual bool getVerbose() = 0;
-
-  /** get the converging tolerance for CG solver */
-  virtual double getCGtol() = 0;
-  /** get the converging tolerance for GMRES solver */
-  virtual double getGMREStol() = 0;
-  /** get the numbers of iteration for the PC mover */
-  virtual int getNiterMover() = 0;
-
-  /** output of fields */
-  virtual int getFieldOutputCycle() = 0;
-  /** output of fields */
-  virtual int getParticlesOutputCycle() = 0;
-  /** output of fields */
-  virtual int getRestartOutputCycle() = 0;
-  /** output of fields */
-  virtual int getDiagnosticsOutputCycle() = 0;
-
-  /** get the velocity of injection of the plasma from the wall */
-  virtual double getVinj() = 0;
-
-};
+//#ifdef BATSRUS
+//class CollectiveIO : public InterfaceFluid{
+//#else
+//class CollectiveIO {
+//#endif
+//public:
+//  /** read input file */
+//  virtual void ReadInput(string inputfile) = 0;
+//  /** read the restart input file from HDF5 */
+//  virtual int ReadRestart(string inputfile) = 0;
+//  /** print simulation parameters */
+//  virtual void Print(void) = 0;
+//  /** print simulation parameters */
+//  virtual void save(void) = 0;
+//  /** get the physical space dimensions            */
+//  virtual int getDim() = 0;
+//  /** get simulation box length - direction X */
+//  virtual double getLx(void) = 0;
+//  /** get simulation box length - direction Y */
+//  virtual double getLy(void) = 0;
+//  /** get simulation box length - direction Z */
+//  virtual double getLz(void) = 0;
+//  /** get object center - direction X */
+//  virtual double getx_center(void) = 0;
+//  /** get object center - direction Y */
+//  virtual double gety_center(void) = 0;
+//  /** get object center - direction Z */
+//  virtual double getz_center(void) = 0;
+//  /** get object size - cubic box */
+//  virtual double getL_square(void) = 0;
+//  /** get number of cells - direction X */
+//  virtual int getNxc(void) = 0;
+//  /** get number of cells - direction Y */
+//  virtual int getNyc(void) = 0;
+//  /** get number of cells - direction Z */
+//  virtual int getNzc(void) = 0;
+//  /** get grid spacing - direction X */
+//  virtual double getDx(void) = 0;
+//  /** get grid spacing - direction Y */
+//  virtual double getDy(void) = 0;
+//  /** get grid spacing - direction z */
+//  virtual double getDz(void) = 0;
+//  /** get the light speed */
+//  virtual double getC() = 0;
+//  /** get the time step */
+//  virtual double getDt() = 0;
+//  /** get the decentering parameter */
+//  virtual double getTh() = 0;
+//  /** get the Smoothing value*/
+//  virtual double getSmooth() = 0;
+//  /** get the number of time cycles */
+//  virtual int getNcycles() = 0;
+//  /** get the number of species */
+//  virtual int getNs() = 0;
+//  /** get the number of particles array for different species */
+//  virtual long getNp(int nspecies) = 0;
+//  /** get the number of particles per cell  */
+//  virtual int getNpcel(int nspecies) = 0;
+//  /** get the number of particles per cell - direction X  */
+//  virtual int getNpcelx(int nspecies) = 0;
+//  /** get the number of particles per cell - direction Y  */
+//  virtual int getNpcely(int nspecies) = 0;
+//  /** get the number of particles per cell - direction Z  */
+//  virtual int getNpcelz(int nspecies) = 0;
+//  /** get maximum number of particles for different species */
+//  virtual long getNpMax(int nspecies) = 0;
+//  /** NpMax/Np is the ratio between the maximum number of particles allowed on a processor and the number of particles*/
+//  virtual double getNpMaxNpRatio() = 0;
+//  /** get charge to mass ratio for different species */
+//  virtual double getQOM(int nspecies) = 0;
+//  /** get background charge for GEM challenge */
+//  virtual double getRHOinit(int nspecies) = 0;
+//  /** get rho for injection */
+//  virtual double getRHOinject(int nspecies)=0;
+//  /** get thermal velocity  - X direction    */
+//  virtual double getUth(int nspecies) = 0;
+//  /** get thermal velocity  - Y direction    */
+//  virtual double getVth(int nspecies) = 0;
+//  /** get thermal velocity  - Z direction    */
+//  virtual double getWth(int nspecies) = 0;
+//  /** get Drift velocity - Direction X         */
+//  virtual double getU0(int nspecies) = 0;
+//  /** get Drift velocity - Direction Y         */
+//  virtual double getV0(int nspecies) = 0;
+//  /** get Drift velocity - Direction Z         */
+//  virtual double getW0(int nspecies) = 0;
+//  /** get the boolean value for TrackParticleID */
+//  virtual bool getTrackParticleID(int nspecies) = 0;
+//  /** get SaveDirName  */
+//  virtual string getSaveDirName() = 0;
+//  /** get last_cycle  */
+//  virtual int getLast_cycle() = 0;
+//  /** get RestartDirName  */
+//  virtual string getRestartDirName() = 0;
+//
+//  /** get Case type */
+//  virtual string getCase() = 0;
+//  /** get simulation name */
+//  virtual string getSimName() = 0;
+//  /** get Poisson correction flag */
+//  virtual string getPoissonCorrection() = 0;
+//
+//  /** get Boundary Condition Particles: FaceXright */
+//  virtual int getBcPfaceXright() = 0;
+//  /** get Boundary Condition Particles: FaceXleft */
+//  virtual int getBcPfaceXleft() = 0;
+//  /** get Boundary Condition Particles: FaceYright */
+//  virtual int getBcPfaceYright() = 0;
+//  /** get Boundary Condition Particles: FaceYleft */
+//  virtual int getBcPfaceYleft() = 0;
+//  /** get Boundary Condition Particles: FaceYright */
+//  virtual int getBcPfaceZright() = 0;
+//  /** get Boundary Condition Particles: FaceYleft */
+//  virtual int getBcPfaceZleft() = 0;
+//
+//  /** get Boundary Condition Electrostatic Potential: FaceXright */
+//  virtual int getBcPHIfaceXright() = 0;
+//  /** get Boundary Condition Electrostatic Potential:FaceXleft */
+//  virtual int getBcPHIfaceXleft() = 0;
+//  /** get Boundary Condition Electrostatic Potential:FaceYright */
+//  virtual int getBcPHIfaceYright() = 0;
+//  /** get Boundary Condition Electrostatic Potential:FaceYleft */
+//  virtual int getBcPHIfaceYleft() = 0;
+//  /** get Boundary Condition Electrostatic Potential:FaceYright */
+//  virtual int getBcPHIfaceZright() = 0;
+//  /** get Boundary Condition Electrostatic Potential:FaceYleft */
+//  virtual int getBcPHIfaceZleft() = 0;
+//
+//  /** get Boundary ConditionElectric Field: FaceXright */
+//  virtual int getBcEMfaceXright() = 0;
+//  /** get Boundary Condition Electric Field: FaceXleft */
+//  virtual int getBcEMfaceXleft() = 0;
+//  /** get Boundary Condition Electric Field: FaceYright */
+//  virtual int getBcEMfaceYright() = 0;
+//  /** get Boundary Condition Electric Field: FaceYleft */
+//  virtual int getBcEMfaceYleft() = 0;
+//  /** get Boundary Condition Electric Field: FaceYright */
+//  virtual int getBcEMfaceZright() = 0;
+//  /** get Boundary Condition Electric Field: FaceYleft */
+//  virtual int getBcEMfaceZleft() = 0;
+//
+//  /** get RESTART */
+//  virtual int getRestart_status() = 0;
+//
+//
+//  /** Get GEM Challenge parameters */
+//
+//  virtual double getDelta() = 0;
+//  virtual double getB0x() = 0;
+//  virtual double getB0y() = 0;
+//  virtual double getB0z() = 0;
+//
+//  /** get the boolean value for verbose results */
+//  virtual bool getVerbose() = 0;
+//
+//  /** get the converging tolerance for CG solver */
+//  virtual double getCGtol() = 0;
+//  /** get the converging tolerance for GMRES solver */
+//  virtual double getGMREStol() = 0;
+//  /** get the numbers of iteration for the PC mover */
+//  virtual int getNiterMover() = 0;
+//
+//  /** output of fields */
+//  virtual int getFieldOutputCycle() = 0;
+//  /** output of fields */
+//  virtual int getParticlesOutputCycle() = 0;
+//  /** output of fields */
+//  virtual int getRestartOutputCycle() = 0;
+//  /** output of fields */
+//  virtual int getDiagnosticsOutputCycle() = 0;
+//
+//  /** get the velocity of injection of the plasma from the wall */
+//  virtual double getVinj() = 0;
+//
+//};
 #endif
diff --git a/include/VCtopology3D.h b/include/VCtopology3D.h
index 2bab31ed..97f28788 100644
--- a/include/VCtopology3D.h
+++ b/include/VCtopology3D.h
@@ -27,11 +27,12 @@ developers           : Stefano Markidis, Giovanni Lapenta
  * @version 2.0
  */
 
+class Collective;
 
 class VCtopology3D:public VirtualTopology3D {
 public:
   /** constructor: Define topology parameters: dimension, domain decomposition,... */
-  VCtopology3D();
+  explicit VCtopology3D(const Collective& col); // explicit: forbids implicit Collective -> VCtopology3D conversion
   /** destructor */
   ~VCtopology3D();
   /** Find the neighbors in the new communicator  */
diff --git a/inputfiles/GEM.inp b/inputfiles/GEM.inp
index ba6173c2..e6f9bc23 100644
--- a/inputfiles/GEM.inp
+++ b/inputfiles/GEM.inp
@@ -47,10 +47,20 @@ y_center =   1.                  # Ly = simulation box length - y direction in m
 z_center =   1.                  # Lz = simulation box length - z direction in m  
 L_square =   .1
 
-nxc = 128                   # nxc = number of cells - x direction        
-nyc = 128                   # nyc = number of cells - y direction
+nxc = 120                   # nxc = number of cells - x direction        
+nyc = 120                   # nyc = number of cells - y direction
 nzc =  1                   # nzc = number of cells - z direction        
 
+# %%%%%%%%%%%%%% MPI TOPOLOGY %%%%%%%%%%%%%%
+# number of MPI subdomains in each direction
+XLEN = 4
+YLEN = 4
+ZLEN = 1
+# periodicity of the domain in each direction (1 = periodic, 0 = not periodic)
+PERIODICX = 1
+PERIODICY = 0
+PERIODICZ = 1
+
 # %%%%%%%%%%%%%% PARTICLES %%%%%%%%%%%%%%%%%
 #    ns = number of species
 #    0 = electrons
diff --git a/inputfiles/Random.inp b/inputfiles/Random.inp
index b48ef3a4..b2ee56fe 100644
--- a/inputfiles/Random.inp
+++ b/inputfiles/Random.inp
@@ -49,7 +49,15 @@ L_square =   .1
 nxc =  120                  # nxc = number of cells - x direction        
 nyc =  120                  # nyc = number of cells - y direction
 nzc =  1                   # nzc = number of cells - z direction        
-
+# %%%%%%%%%%%%%% MPI TOPOLOGY %%%%%%%%%%%%%%
+# number of MPI subdomains in each direction
+XLEN = 2
+YLEN = 2
+ZLEN = 1
+# periodicity of the domain in each direction (1 = periodic, 0 = not periodic)
+PERIODICX = 1
+PERIODICY = 1
+PERIODICZ = 1
 # %%%%%%%%%%%%%% PARTICLES %%%%%%%%%%%%%%%%%
 #    ns = number of species
 #    0 = electrons
diff --git a/inputoutput/Collective.cpp b/inputoutput/Collective.cpp
index 140e1647..da721b76 100644
--- a/inputoutput/Collective.cpp
+++ b/inputoutput/Collective.cpp
@@ -115,6 +115,12 @@ void Collective::ReadInput(string inputfile) {
     nyc = config.read < int >("nyc");
     nzc = config.read < int >("nzc");
 #endif
+    XLEN = config.read < int >("XLEN",1);
+    YLEN = config.read < int >("YLEN",1);
+    ZLEN = config.read < int >("ZLEN",1);
+    PERIODICX = config.read < bool >("PERIODICX");
+    PERIODICY = config.read < bool >("PERIODICY");
+    PERIODICZ = config.read < bool >("PERIODICZ");
 
     x_center = config.read < double >("x_center");
     y_center = config.read < double >("y_center");
@@ -707,316 +713,3 @@ void Collective::save() {
 
 }
 
-/*! get the physical space dimensions */
-int Collective::getDim() {
-  return (dim);
-}
-/*! get Lx */
-double Collective::getLx() {
-  return (Lx);
-}
-/*! get Ly */
-double Collective::getLy() {
-  return (Ly);
-}
-/*! get Lz */
-double Collective::getLz() {
-  return (Lz);
-}
-/*! get x_center */
-double Collective::getx_center() {
-  return (x_center);
-}
-/*! get y_center */
-double Collective::gety_center() {
-  return (y_center);
-}
-/*! get z_center */
-double Collective::getz_center() {
-  return (z_center);
-}
-/*! get L_square */
-double Collective::getL_square() {
-  return (L_square);
-}
-/*! get nxc */
-int Collective::getNxc() {
-  return (nxc);
-}
-/*! get nyx */
-int Collective::getNyc() {
-  return (nyc);
-}
-/*! get nzc */
-int Collective::getNzc() {
-  return (nzc);
-}
-/*! get dx */
-double Collective::getDx() {
-  return (dx);
-}
-/*! get dy */
-double Collective::getDy() {
-  return (dy);
-}
-/*! get dz */
-double Collective::getDz() {
-  return (dz);
-}
-/*! get the light speed */
-double Collective::getC() {
-  return (c);
-}
-/*! get the time step */
-double Collective::getDt() {
-  return (dt);
-}
-/*! get the decentering parameter */
-double Collective::getTh() {
-  return (th);
-}
-/*! get the smooth parameter */
-double Collective::getSmooth() {
-  return (Smooth);
-}
-
-/*! get the number of time cycles */
-int Collective::getNcycles() {
-  return (ncycles);
-}
-/*! get the number of species */
-int Collective::getNs() {
-  return (ns);
-}
-/*! get the number of particles per cell for species nspecies */
-int Collective::getNpcel(int nspecies) {
-  return (npcel[nspecies]);
-}
-/*! get the number of particles per cell for species nspecies - direction X */
-int Collective::getNpcelx(int nspecies) {
-  return (npcelx[nspecies]);
-}
-/*! get the number of particles per cell for species nspecies - direction Y */
-int Collective::getNpcely(int nspecies) {
-  return (npcely[nspecies]);
-}
-/*! get the number of particles per cell for species nspecies - direction Z */
-int Collective::getNpcelz(int nspecies) {
-  return (npcelz[nspecies]);
-}
-/*! get the number of particles for different species */
-long Collective::getNp(int nspecies) {
-  return (np[nspecies]);
-}
-/*! get maximum number of particles for different species */
-long Collective::getNpMax(int nspecies) {
-  return (npMax[nspecies]);
-}
-double Collective::getNpMaxNpRatio() {
-  return (NpMaxNpRatio);
-}
-/*! get charge to mass ratio for different species */
-double Collective::getQOM(int nspecies) {
-  return (qom[nspecies]);
-}
-/*! get the background density for GEM challenge */
-double Collective::getRHOinit(int nspecies) {
-  return (rhoINIT[nspecies]);
-}
-/*! get the background density for GEM challenge */
-inline double Collective::getRHOinject(int nspecies){
-  return(rhoINJECT[nspecies]);
-}
-/*! get thermal velocity - Direction X */
-double Collective::getUth(int nspecies) {
-  return (uth[nspecies]);
-}
-/*! get thermal velocity - Direction Y */
-double Collective::getVth(int nspecies) {
-  return (vth[nspecies]);
-}
-/*! get thermal velocity - Direction Z */
-double Collective::getWth(int nspecies) {
-  return (wth[nspecies]);
-}
-/*! get beam velocity - Direction X */
-double Collective::getU0(int nspecies) {
-  return (u0[nspecies]);
-}
-/*! get beam velocity - Direction Y */
-double Collective::getV0(int nspecies) {
-  return (v0[nspecies]);
-}
-/*! get beam velocity - Direction Z */
-double Collective::getW0(int nspecies) {
-  return (w0[nspecies]);
-}
-/*! get Boundary Condition Particles: FaceXright */
-int Collective::getBcPfaceXright() {
-  return (bcPfaceXright);
-}
-/*! get Boundary Condition Particles: FaceXleft */
-int Collective::getBcPfaceXleft() {
-  return (bcPfaceXleft);
-}
-/*! get Boundary Condition Particles: FaceYright */
-int Collective::getBcPfaceYright() {
-  return (bcPfaceYright);
-}
-/*! get Boundary Condition Particles: FaceYleft */
-int Collective::getBcPfaceYleft() {
-  return (bcPfaceYleft);
-}
-/*! get Boundary Condition Particles: FaceZright */
-int Collective::getBcPfaceZright() {
-  return (bcPfaceZright);
-}
-/*! get Boundary Condition Particles: FaceZleft */
-int Collective::getBcPfaceZleft() {
-  return (bcPfaceZleft);
-}
-/*! get Boundary Condition Electrostatic Potential: FaceXright */
-int Collective::getBcPHIfaceXright() {
-  return (bcPHIfaceXright);
-}
-/*! get Boundary Condition Electrostatic Potential:FaceXleft */
-int Collective::getBcPHIfaceXleft() {
-  return (bcPHIfaceXleft);
-}
-/*! get Boundary Condition Electrostatic Potential:FaceYright */
-int Collective::getBcPHIfaceYright() {
-  return (bcPHIfaceYright);
-}
-/*! get Boundary Condition Electrostatic Potential:FaceYleft */
-int Collective::getBcPHIfaceYleft() {
-  return (bcPHIfaceYleft);
-}
-/*! get Boundary Condition Electrostatic Potential:FaceZright */
-int Collective::getBcPHIfaceZright() {
-  return (bcPHIfaceZright);
-}
-/*! get Boundary Condition Electrostatic Potential:FaceZleft */
-int Collective::getBcPHIfaceZleft() {
-  return (bcPHIfaceZleft);
-}
-/*! get Boundary Condition EM Field: FaceXright */
-int Collective::getBcEMfaceXright() {
-  return (bcEMfaceXright);
-}
-/*! get Boundary Condition EM Field: FaceXleft */
-int Collective::getBcEMfaceXleft() {
-  return (bcEMfaceXleft);
-}
-/*! get Boundary Condition EM Field: FaceYright */
-int Collective::getBcEMfaceYright() {
-  return (bcEMfaceYright);
-}
-/*! get Boundary Condition EM Field: FaceYleft */
-int Collective::getBcEMfaceYleft() {
-  return (bcEMfaceYleft);
-}
-/*! get Boundary Condition EM Field: FaceZright */
-int Collective::getBcEMfaceZright() {
-  return (bcEMfaceZright);
-}
-/*! get Boundary Condition EM Field: FaceZleft */
-int Collective::getBcEMfaceZleft() {
-  return (bcEMfaceZleft);
-}
-/*! Get GEM Challenge parameters */
-double Collective::getDelta() {
-  return (delta);
-}
-double Collective::getB0x() {
-  return (B0x);
-}
-double Collective::getB0y() {
-  return (B0y);
-}
-double Collective::getB0z() {
-  return (B0z);
-}
-double Collective::getB1x(){
-  return (B1x);
-}
-double Collective::getB1y(){
-  return (B1y);
-}
-double Collective::getB1z(){
-  return (B1z);
-}
-/*! get the boolean value for verbose results */
-bool Collective::getVerbose() {
-  return (verbose);
-}
-/*! get the boolean value for TrackParticleID */
-bool Collective::getTrackParticleID(int nspecies) {
-  return (TrackParticleID[nspecies]);
-}
-int Collective::getRestart_status() {
-  return (restart_status);
-}
-/*! get SaveDirName */
-string Collective::getSaveDirName() {
-  return (SaveDirName);
-}
-/*! get RestartDirName */
-string Collective::getRestartDirName() {
-  return (RestartDirName);
-}
-/*! get inputfile */
-string Collective::getinputfile() {
-  return (inputfile);
-}
-/*! get Case type */
-string Collective::getCase() {
-  return (Case);
-}
-/*! get simulation name */
-string Collective::getSimName() {
-  return (SimName);
-}
-/*! get output writing method */
-string Collective::getWriteMethod() {
-  return (wmethod);
-}
-/*! get Poisson correction flag */
-string Collective::getPoissonCorrection() {
-  return (PoissonCorrection);
-}
-/*! get last_cycle */
-int Collective::getLast_cycle() {
-  return (last_cycle);
-}
-/*! get the velocity of injection of the plasma from the wall */
-double Collective::getVinj() {
-  return (Vinj);
-}
-/*! get the converging tolerance for CG solver */
-double Collective::getCGtol() {
-  return (CGtol);
-}
-/*! get the converging tolerance for GMRES solver */
-double Collective::getGMREStol() {
-  return (GMREStol);
-}
-/*! get the numbers of iteration for the PC mover */
-int Collective::getNiterMover() {
-  return (NiterMover);
-}
-/*! output of fields */
-int Collective::getFieldOutputCycle() {
-  return (FieldOutputCycle);
-}
-/*! output of particles */
-int Collective::getParticlesOutputCycle() {
-  return (ParticlesOutputCycle);
-}
-/*! restart cycle */
-int Collective::getRestartOutputCycle() {
-  return (RestartOutputCycle);
-}
-/*! output of fields */
-int Collective::getDiagnosticsOutputCycle() {
-  return (DiagnosticsOutputCycle);
-}
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index dd731c9d..d467dc8d 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -29,7 +29,7 @@ int c_Solver::Init(int argc, char **argv) {
   ns = col->getNs();            // get the number of particle species involved in simulation
   first_cycle = col->getLast_cycle() + 1; // get the last cycle from the restart
   // initialize the virtual cartesian topology 
-  vct = new VCtopology3D();
+  vct = new VCtopology3D(*col);
   // Check if we can map the processes into a matrix ordering defined in Collective.cpp
   if (nprocs != vct->getNprocs()) {
     if (myrank == 0) {

From 51fa54df43b27816eabe6922e9a3cfd59ece6679 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 2 Oct 2013 00:04:16 +0200
Subject: [PATCH 040/118] issue #47: updating ipic help: configurable XLEN

---
 scripts/ipic-help     | 11 +----------
 scripts/ipic-help-mic | 17 +++--------------
 scripts/tags          |  7 -------
 3 files changed, 4 insertions(+), 31 deletions(-)
 delete mode 100644 scripts/tags

diff --git a/scripts/ipic-help b/scripts/ipic-help
index 5dc5cf41..6856364d 100755
--- a/scripts/ipic-help
+++ b/scripts/ipic-help
@@ -10,21 +10,12 @@ then
     cmake ..
     make # or "make -j" to compile in parallel
   
-  Before you build, you should first configure the number of MPI
-  processes you will use.  To do so, you currently have to edit
-  "communication/VCtopology3D.cpp" (and then recompile in the build
-  directory).  The lines you need to change are:
-  
-    XLEN = 2;
-    YLEN = 2;
-    ZLEN = 1;
-  
   To run the code you can use
   
     mkdir data
     mpiexec.hydra -n 4 -env OMP_NUM_THREADS=1 exec/iPic3D ../inputfiles/GEM.inp
   
-  where 4 = XLEN times YLEN times ZLEN.
+  where 4 = XLEN times YLEN times ZLEN (defined in GEM.inp).
   
   Available subcommands:
     ipic help mic
diff --git a/scripts/ipic-help-mic b/scripts/ipic-help-mic
index ecb81bce..d983dd72 100755
--- a/scripts/ipic-help-mic
+++ b/scripts/ipic-help-mic
@@ -2,21 +2,10 @@
 #if test $# -lt 1
 #then
   echo '
-  For the Xeon, you might want to change this to:
-  
-    XLEN = 4;
-    YLEN = 2;
-    ZLEN = 1;
-  
-  For the Xeon Phi, you might want:
-  
-    XLEN = 10;
-    YLEN = 5;
-    ZLEN = 1;
-  
-  Then to run the code you would use something like:
+  See "ipic help".  Modifications are as follows.
+
+  To run on the Xeon host processor, use something like:
   
-    mkdir data
     mpiexec.hydra -n 8 -env OMP_NUM_THREADS=4 exec/iPic3D ../inputfiles/GEM.inp
   
   where 8 = XLEN times YLEN times ZLEN.
diff --git a/scripts/tags b/scripts/tags
deleted file mode 100644
index 9f0219d6..00000000
--- a/scripts/tags
+++ /dev/null
@@ -1,7 +0,0 @@
-ipic	ipic	1
-ipic-ctags	ipic-ctags	1
-ipic-help	ipic-help	1
-ipic-help-ctags	ipic-help-ctags	1
-ipic-help-mic	ipic-help-mic	1
-makefiletags	makefiletags	1
-tags	tags	1

From c0855af0c05d5cf21d49599a3cd27e132d3d8f87 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 2 Oct 2013 00:22:16 +0200
Subject: [PATCH 041/118] minor correction to ipic-help

---
 scripts/ipic-help | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/ipic-help b/scripts/ipic-help
index 6856364d..98a88e71 100755
--- a/scripts/ipic-help
+++ b/scripts/ipic-help
@@ -18,6 +18,8 @@ then
   where 4 = XLEN times YLEN times ZLEN (defined in GEM.inp).
   
   Available subcommands:
+
+    ipic help ctags
     ipic help mic
 '
   exit

From 990cba03675fbf60bb472e502e960ecf80a361b7 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 2 Oct 2013 00:27:06 +0200
Subject: [PATCH 042/118] created makefile in main directory to give useful
 info for "make".

---
 makefile | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 makefile

diff --git a/makefile b/makefile
new file mode 100644
index 00000000..519cb605
--- /dev/null
+++ b/makefile
@@ -0,0 +1,12 @@
+# Convenience makefile to call scripts
+
+help:
+	scripts/ipic help
+
+tags: retags
+
+retags:
+	scripts/ipic-ctags
+
+#monitor:
+#	less +F data/ConservedQuantities.txt

From 964e8b07070efe9b909eb002923d1cc9cdcd785f Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 2 Oct 2013 17:27:46 +0200
Subject: [PATCH 043/118] issue #41: doubled rate of summing moments via
 array(nx,ny,nz,10)

---
 fields/EMfields3D.cpp | 247 ++++++++++++++++++++++--------------------
 fields/Moments.cpp    |  14 ++-
 include/Alloc.h       |  42 +++----
 include/EMfields3D.h  |  15 ++-
 include/Moments.h     |  23 ++++
 include/arraysfwd.h   |   9 ++
 inputfiles/GEM.inp    |   4 +-
 7 files changed, 213 insertions(+), 141 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index d878b3a8..026698df 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -193,12 +193,14 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) :
 
   sizeMomentsArray = omp_thread_count();
   momentsArray = new Moments*[sizeMomentsArray];
-  moments10 = (arr4_double**) malloc(sizeof(void*)*sizeMomentsArray);
+  moments10Array = new Moments10*[sizeMomentsArray];
+  //moments10 = (arr4_double**) malloc(sizeof(void*)*sizeMomentsArray);
   for(int i=0;i<sizeMomentsArray;i++)
   {
     momentsArray[i] = new Moments(nxn,nyn,nzn);
+    moments10Array[i] = new Moments10(nxn,nyn,nzn);
     //momentsArray[i]->init(nxn,nyn,nzn);
-    moments10[i] = new arr4_double(nxn,nyn,nzn,10);
+    //moments10[i] = new arr4_double(nxn,nyn,nzn,10);
   }
 }
 
@@ -223,8 +225,8 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   double const*const q = pcls.getQall();
   //
   const int is = pcls.get_ns();
-  bool bmoments10 = false;
-  bool b10moments = true; // turn on doing it the old way
+  bool bmoments10 = true;
+  bool b10moments = false; // turn on doing it the old way
 
   // if b10moments
   double* rhons1d = &rhons[is][0][0][0];
@@ -251,8 +253,11 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
     int thread_num = omp_get_thread_num();
     Moments& speciesMoments = fetch_momentsArray(thread_num);
     speciesMoments.set_to_zero();
-    arr4_double moments = fetch_moments10(thread_num);
-    moments.setall(0.);
+    Moments10& speciesMoments10 = fetch_moments10Array(thread_num);
+    speciesMoments10.set_to_zero();
+    arr4_double moments = speciesMoments10.fetch_arr();
+    //arr4_double moments = fetch_moments10(thread_num);
+    //moments.setall(0.);
     //
     arr3_double rho = speciesMoments.fetch_rho();
     arr3_double Jx  = speciesMoments.fetch_Jx();
@@ -323,93 +328,102 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
 
       if(bmoments10)
       {
-        moments[ix  ][iy  ][iz  ][0] += velmoments[0]*weight000;
-        moments[ix  ][iy  ][iz  ][1] += velmoments[1]*weight000;
-        moments[ix  ][iy  ][iz  ][2] += velmoments[2]*weight000;
-        moments[ix  ][iy  ][iz  ][3] += velmoments[3]*weight000;
-        moments[ix  ][iy  ][iz  ][4] += velmoments[4]*weight000;
-        moments[ix  ][iy  ][iz  ][5] += velmoments[5]*weight000;
-        moments[ix  ][iy  ][iz  ][6] += velmoments[6]*weight000;
-        moments[ix  ][iy  ][iz  ][7] += velmoments[7]*weight000;
-        moments[ix  ][iy  ][iz  ][8] += velmoments[8]*weight000;
-        moments[ix  ][iy  ][iz  ][9] += velmoments[9]*weight000;
-
-        moments[ix  ][iy  ][iz-1][0] += velmoments[0]*weight001;
-        moments[ix  ][iy  ][iz-1][1] += velmoments[1]*weight001;
-        moments[ix  ][iy  ][iz-1][2] += velmoments[2]*weight001;
-        moments[ix  ][iy  ][iz-1][3] += velmoments[3]*weight001;
-        moments[ix  ][iy  ][iz-1][4] += velmoments[4]*weight001;
-        moments[ix  ][iy  ][iz-1][5] += velmoments[5]*weight001;
-        moments[ix  ][iy  ][iz-1][6] += velmoments[6]*weight001;
-        moments[ix  ][iy  ][iz-1][7] += velmoments[7]*weight001;
-        moments[ix  ][iy  ][iz-1][8] += velmoments[8]*weight001;
-        moments[ix  ][iy  ][iz-1][9] += velmoments[9]*weight001;
-
-        moments[ix  ][iy-1][iz  ][0] += velmoments[0]*weight010;
-        moments[ix  ][iy-1][iz  ][1] += velmoments[1]*weight010;
-        moments[ix  ][iy-1][iz  ][2] += velmoments[2]*weight010;
-        moments[ix  ][iy-1][iz  ][3] += velmoments[3]*weight010;
-        moments[ix  ][iy-1][iz  ][4] += velmoments[4]*weight010;
-        moments[ix  ][iy-1][iz  ][5] += velmoments[5]*weight010;
-        moments[ix  ][iy-1][iz  ][6] += velmoments[6]*weight010;
-        moments[ix  ][iy-1][iz  ][7] += velmoments[7]*weight010;
-        moments[ix  ][iy-1][iz  ][8] += velmoments[8]*weight010;
-        moments[ix  ][iy-1][iz  ][9] += velmoments[9]*weight010;
-
-        moments[ix  ][iy-1][iz-1][0] += velmoments[0]*weight011;
-        moments[ix  ][iy-1][iz-1][1] += velmoments[1]*weight011;
-        moments[ix  ][iy-1][iz-1][2] += velmoments[2]*weight011;
-        moments[ix  ][iy-1][iz-1][3] += velmoments[3]*weight011;
-        moments[ix  ][iy-1][iz-1][4] += velmoments[4]*weight011;
-        moments[ix  ][iy-1][iz-1][5] += velmoments[5]*weight011;
-        moments[ix  ][iy-1][iz-1][6] += velmoments[6]*weight011;
-        moments[ix  ][iy-1][iz-1][7] += velmoments[7]*weight011;
-        moments[ix  ][iy-1][iz-1][8] += velmoments[8]*weight011;
-        moments[ix  ][iy-1][iz-1][9] += velmoments[9]*weight011;
-
-        moments[ix-1][iy  ][iz  ][0] += velmoments[0]*weight100;
-        moments[ix-1][iy  ][iz  ][1] += velmoments[1]*weight100;
-        moments[ix-1][iy  ][iz  ][2] += velmoments[2]*weight100;
-        moments[ix-1][iy  ][iz  ][3] += velmoments[3]*weight100;
-        moments[ix-1][iy  ][iz  ][4] += velmoments[4]*weight100;
-        moments[ix-1][iy  ][iz  ][5] += velmoments[5]*weight100;
-        moments[ix-1][iy  ][iz  ][6] += velmoments[6]*weight100;
-        moments[ix-1][iy  ][iz  ][7] += velmoments[7]*weight100;
-        moments[ix-1][iy  ][iz  ][8] += velmoments[8]*weight100;
-        moments[ix-1][iy  ][iz  ][9] += velmoments[9]*weight100;
-
-        moments[ix-1][iy  ][iz-1][0] += velmoments[0]*weight101;
-        moments[ix-1][iy  ][iz-1][1] += velmoments[1]*weight101;
-        moments[ix-1][iy  ][iz-1][2] += velmoments[2]*weight101;
-        moments[ix-1][iy  ][iz-1][3] += velmoments[3]*weight101;
-        moments[ix-1][iy  ][iz-1][4] += velmoments[4]*weight101;
-        moments[ix-1][iy  ][iz-1][5] += velmoments[5]*weight101;
-        moments[ix-1][iy  ][iz-1][6] += velmoments[6]*weight101;
-        moments[ix-1][iy  ][iz-1][7] += velmoments[7]*weight101;
-        moments[ix-1][iy  ][iz-1][8] += velmoments[8]*weight101;
-        moments[ix-1][iy  ][iz-1][9] += velmoments[9]*weight101;
-
-        moments[ix-1][iy-1][iz  ][0] += velmoments[0]*weight110;
-        moments[ix-1][iy-1][iz  ][1] += velmoments[1]*weight110;
-        moments[ix-1][iy-1][iz  ][2] += velmoments[2]*weight110;
-        moments[ix-1][iy-1][iz  ][3] += velmoments[3]*weight110;
-        moments[ix-1][iy-1][iz  ][4] += velmoments[4]*weight110;
-        moments[ix-1][iy-1][iz  ][5] += velmoments[5]*weight110;
-        moments[ix-1][iy-1][iz  ][6] += velmoments[6]*weight110;
-        moments[ix-1][iy-1][iz  ][7] += velmoments[7]*weight110;
-        moments[ix-1][iy-1][iz  ][8] += velmoments[8]*weight110;
-        moments[ix-1][iy-1][iz  ][9] += velmoments[9]*weight110;
-
-        moments[ix-1][iy-1][iz-1][0] += velmoments[0]*weight111;
-        moments[ix-1][iy-1][iz-1][1] += velmoments[1]*weight111;
-        moments[ix-1][iy-1][iz-1][2] += velmoments[2]*weight111;
-        moments[ix-1][iy-1][iz-1][3] += velmoments[3]*weight111;
-        moments[ix-1][iy-1][iz-1][4] += velmoments[4]*weight111;
-        moments[ix-1][iy-1][iz-1][5] += velmoments[5]*weight111;
-        moments[ix-1][iy-1][iz-1][6] += velmoments[6]*weight111;
-        moments[ix-1][iy-1][iz-1][7] += velmoments[7]*weight111;
-        moments[ix-1][iy-1][iz-1][8] += velmoments[8]*weight111;
-        moments[ix-1][iy-1][iz-1][9] += velmoments[9]*weight111;
+        arr1_double_fetch moments000 = moments[ix  ][iy  ][iz  ];
+        arr1_double_fetch moments001 = moments[ix  ][iy  ][iz-1];
+        arr1_double_fetch moments010 = moments[ix  ][iy-1][iz  ];
+        arr1_double_fetch moments011 = moments[ix  ][iy-1][iz-1];
+        arr1_double_fetch moments100 = moments[ix-1][iy  ][iz  ];
+        arr1_double_fetch moments101 = moments[ix-1][iy  ][iz-1];
+        arr1_double_fetch moments110 = moments[ix-1][iy-1][iz  ];
+        arr1_double_fetch moments111 = moments[ix-1][iy-1][iz-1];
+
+        moments000[0] += velmoments[0]*weight000;
+        moments000[1] += velmoments[1]*weight000;
+        moments000[2] += velmoments[2]*weight000;
+        moments000[3] += velmoments[3]*weight000;
+        moments000[4] += velmoments[4]*weight000;
+        moments000[5] += velmoments[5]*weight000;
+        moments000[6] += velmoments[6]*weight000;
+        moments000[7] += velmoments[7]*weight000;
+        moments000[8] += velmoments[8]*weight000;
+        moments000[9] += velmoments[9]*weight000;
+
+        moments001[0] += velmoments[0]*weight001;
+        moments001[1] += velmoments[1]*weight001;
+        moments001[2] += velmoments[2]*weight001;
+        moments001[3] += velmoments[3]*weight001;
+        moments001[4] += velmoments[4]*weight001;
+        moments001[5] += velmoments[5]*weight001;
+        moments001[6] += velmoments[6]*weight001;
+        moments001[7] += velmoments[7]*weight001;
+        moments001[8] += velmoments[8]*weight001;
+        moments001[9] += velmoments[9]*weight001;
+
+        moments010[0] += velmoments[0]*weight010;
+        moments010[1] += velmoments[1]*weight010;
+        moments010[2] += velmoments[2]*weight010;
+        moments010[3] += velmoments[3]*weight010;
+        moments010[4] += velmoments[4]*weight010;
+        moments010[5] += velmoments[5]*weight010;
+        moments010[6] += velmoments[6]*weight010;
+        moments010[7] += velmoments[7]*weight010;
+        moments010[8] += velmoments[8]*weight010;
+        moments010[9] += velmoments[9]*weight010;
+
+        moments011[0] += velmoments[0]*weight011;
+        moments011[1] += velmoments[1]*weight011;
+        moments011[2] += velmoments[2]*weight011;
+        moments011[3] += velmoments[3]*weight011;
+        moments011[4] += velmoments[4]*weight011;
+        moments011[5] += velmoments[5]*weight011;
+        moments011[6] += velmoments[6]*weight011;
+        moments011[7] += velmoments[7]*weight011;
+        moments011[8] += velmoments[8]*weight011;
+        moments011[9] += velmoments[9]*weight011;
+
+        moments100[0] += velmoments[0]*weight100;
+        moments100[1] += velmoments[1]*weight100;
+        moments100[2] += velmoments[2]*weight100;
+        moments100[3] += velmoments[3]*weight100;
+        moments100[4] += velmoments[4]*weight100;
+        moments100[5] += velmoments[5]*weight100;
+        moments100[6] += velmoments[6]*weight100;
+        moments100[7] += velmoments[7]*weight100;
+        moments100[8] += velmoments[8]*weight100;
+        moments100[9] += velmoments[9]*weight100;
+
+        moments101[0] += velmoments[0]*weight101;
+        moments101[1] += velmoments[1]*weight101;
+        moments101[2] += velmoments[2]*weight101;
+        moments101[3] += velmoments[3]*weight101;
+        moments101[4] += velmoments[4]*weight101;
+        moments101[5] += velmoments[5]*weight101;
+        moments101[6] += velmoments[6]*weight101;
+        moments101[7] += velmoments[7]*weight101;
+        moments101[8] += velmoments[8]*weight101;
+        moments101[9] += velmoments[9]*weight101;
+
+        moments110[0] += velmoments[0]*weight110;
+        moments110[1] += velmoments[1]*weight110;
+        moments110[2] += velmoments[2]*weight110;
+        moments110[3] += velmoments[3]*weight110;
+        moments110[4] += velmoments[4]*weight110;
+        moments110[5] += velmoments[5]*weight110;
+        moments110[6] += velmoments[6]*weight110;
+        moments110[7] += velmoments[7]*weight110;
+        moments110[8] += velmoments[8]*weight110;
+        moments110[9] += velmoments[9]*weight110;
+
+        moments111[0] += velmoments[0]*weight111;
+        moments111[1] += velmoments[1]*weight111;
+        moments111[2] += velmoments[2]*weight111;
+        moments111[3] += velmoments[3]*weight111;
+        moments111[4] += velmoments[4]*weight111;
+        moments111[5] += velmoments[5]*weight111;
+        moments111[6] += velmoments[6]*weight111;
+        moments111[7] += velmoments[7]*weight111;
+        moments111[8] += velmoments[8]*weight111;
+        moments111[9] += velmoments[9]*weight111;
 
         //double weight[2][2][2];
         //weight[0][0][0]=weight000;
@@ -617,35 +631,35 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
     {
       //
       #pragma omp critical
-      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { rhons[is][i][j][k] += invVOL*moments[i][j][k][0]; }
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { rhons[is][i][j][k] += invVOL*moments[i][j][k][0]; }}
       #pragma omp critical
-      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1]; }
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1]; }}
       #pragma omp critical
-      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { Jys  [is][i][j][k] += invVOL*moments[i][j][k][2]; }
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { Jys  [is][i][j][k] += invVOL*moments[i][j][k][2]; }}
       #pragma omp critical
-      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3]; }
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3]; }}
       #pragma omp critical
-      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4]; }
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4]; }}
       #pragma omp critical
-      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5]; }
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5]; }}
       #pragma omp critical
-      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6]; }
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6]; }}
       #pragma omp critical
-      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7]; }
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7]; }}
       #pragma omp critical
-      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8]; }
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8]; }}
       #pragma omp critical
-      for(int i=0;i<nxn;i++) for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }}
     }
     else
     {
@@ -3545,8 +3559,11 @@ EMfields3D::~EMfields3D() {
   for(int i=0;i<sizeMomentsArray;i++)
   {
     delete momentsArray[i];
-    moments10[i]->free();
+    delete moments10Array[i];
+    //moments10[i]->free();
   }
   delete [] momentsArray;
-  free(moments10);
+  delete [] moments10Array;
+  //delete [] moments10;
+  //free(moments10);
 }
diff --git a/fields/Moments.cpp b/fields/Moments.cpp
index 6c31a2f1..32376155 100644
--- a/fields/Moments.cpp
+++ b/fields/Moments.cpp
@@ -1,8 +1,20 @@
 #include "Moments.h"
 #include "Alloc.h"
 
+void Moments10::set_to_zero()
+{
+  #pragma omp parallel for collapse(4)
+  for (register int i = 0; i < nx; i++)
+  for (register int j = 0; j < ny; j++)
+  for (register int k = 0; k < nz; k++)
+  for (register int m = 0; m < 10; m++)
+  {
+    arr[i][j][k][m] = 0.0;
+  }
+}
+
 void Moments::set_to_zero() {
-  // #pragma omp parallel for collapse(1)
+  #pragma omp parallel for collapse(3)
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
       for (register int k = 0; k < nz; k++) {
diff --git a/include/Alloc.h b/include/Alloc.h
index 8e589321..e8456785 100644
--- a/include/Alloc.h
+++ b/include/Alloc.h
@@ -58,6 +58,10 @@
     code to compile on the latest intel compiler (2013) and on
     g++ 4.0 (2005); g++ 4.2 (2007) compiled (but unfortunately,
     for my g++ 4.2, iPic3D suffered from stack frame corruption.)
+    //
+    Note that the directive
+      #if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
+    appears not only here but also in arraysfwd.h
 */
 #define ALIGNMENT (64)
 #ifdef __INTEL_COMPILER
@@ -217,26 +221,26 @@ namespace iPic3D
   
   // classes to dereference arrays.
   //
-  // ArrayGetN is essentially a dumbed-down version of ArrN with
+  // array_fetchN is essentially a dumbed-down version of ArrN with
   // an index shift applied to the underlying array.  The purpose
-  // of ArrayGetN is to allow elements of multidimensional arrays
+  // of array_fetchN is to allow elements of multidimensional arrays
   // to be accessed with a calculated one-dimensional index while
   // using chained operator[] syntax (e.g. myarr[i][j]), i.e. the
   // same syntax as is used for native or nested arrays.  This
   // implementation is likely to be slow unless optimization is
   // turned on, allowing the compiler to figure out that the whole
-  // chain of calls to the operator[] methods and to the ArrayGetN
+  // chain of calls to the operator[] methods and to the array_fetchN
   // constructors reduces to computing a one-dimensional subscript
   // used to access a one-dimensional array.
   //
   template <class type>
-  class ArrayGet1
+  class array_fetch1
   {
     type* const __restrict__ arr;
     const size_t S1;
     const size_t shift;
    public:
-    inline ArrayGet1(type*const arr_, size_t k, size_t s1) :
+    inline array_fetch1(type*const arr_, size_t k, size_t s1) :
       arr(arr_), shift(k), S1(s1)
     {}
     inline type& operator[](size_t n1){
@@ -247,34 +251,34 @@ namespace iPic3D
   };
   
   template <class type>
-  class ArrayGet2
+  class array_fetch2
   {
     type* const __restrict__ arr;
     const size_t shift;
     const size_t S2, S1;
    public:
-    inline ArrayGet2(type*const arr_, size_t k, size_t s2, size_t s1) :
+    inline array_fetch2(type*const arr_, size_t k, size_t s2, size_t s1) :
       arr(arr_), shift(k), S2(s2), S1(s1)
     {}
-    inline ArrayGet1<type> operator[](size_t n2){
+    inline array_fetch1<type> operator[](size_t n2){
       check_bounds(n2,S2);
-      return ArrayGet1<type>(arr, (shift+n2)*S1, S1);
+      return array_fetch1<type>(arr, (shift+n2)*S1, S1);
     }
   };
   
   template <class type>
-  class ArrayGet3
+  class array_fetch3
   {
     type* const __restrict__ arr;
     const size_t shift;
     const size_t S3, S2, S1;
    public:
-    inline ArrayGet3(type*const arr_, size_t k, size_t s3, size_t s2, size_t s1) :
+    inline array_fetch3(type*const arr_, size_t k, size_t s3, size_t s2, size_t s1) :
       arr(arr_), shift(k), S3(s3), S2(s2), S1(s1)
     {}
-    inline ArrayGet2<type> operator[](size_t n3){
+    inline array_fetch2<type> operator[](size_t n3){
       check_bounds(n3, S3);
-      return ArrayGet2<type>(arr, (shift+n3)*S2, S2, S1);
+      return array_fetch2<type>(arr, (shift+n3)*S2, S2, S1);
     }
   };
   
@@ -426,9 +430,9 @@ namespace iPic3D
         arr(*in)
       { }
       // dereference via calculated index
-      inline ArrayGet1<type> operator[](size_t n2){
+      inline array_fetch1<type> operator[](size_t n2){
         check_bounds(n2, S2);
-        return ArrayGet1<type>(arr, n2*S1, S1);
+        return array_fetch1<type>(arr, n2*S1, S1);
       }
       inline size_t getidx(size_t n2, size_t n1) const
       {
@@ -530,9 +534,9 @@ namespace iPic3D
       { }
       void free(){ delArray3<type>((type***)arr3); }
     #if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
-      inline ArrayGet2<type> operator[](size_t n3){
+      inline array_fetch2<type> operator[](size_t n3){
         check_bounds(n3, S3);
-        return ArrayGet2<type>(arr, n3*S2, S2, S1);
+        return array_fetch2<type>(arr, n3*S2, S2, S1);
       }
     #else
       // this causes operator[] to dereference via chained pointer
@@ -638,9 +642,9 @@ namespace iPic3D
         const_array_ref4<type>(in,s4,s3,s2,s1)
       { }
     #if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
-      inline ArrayGet3<type> operator[](size_t n4){
+      inline array_fetch3<type> operator[](size_t n4){
         check_bounds(n4, S4);
-        return ArrayGet3<type>(arr, n4*S3, S3, S2, S1);
+        return array_fetch3<type>(arr, n4*S3, S3, S2, S1);
       }
     #else
       operator type****(){ return (type****) arr4; }
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index b2105b0b..07999eeb 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -32,6 +32,7 @@ using std::endl;
 
 class Particles3Dcomm;
 class Moments;
+class Moments10;
 class EMfields3D                // :public Field
 {
   public:
@@ -259,12 +260,17 @@ class EMfields3D                // :public Field
     Moments& fetch_momentsArray(int i){
       assert_le(0,i);
       assert_le(i,sizeMomentsArray);
-      return *momentsArray[i];
+      return *(momentsArray[i]);
     }
-    arr4_double fetch_moments10(int i){
+    //arr4_double fetch_moments10(int i){
+    //  assert_le(0,i);
+    //  assert_le(i,sizeMomentsArray);
+    //  return *(moments10[i]);
+    //}
+    Moments10& fetch_moments10Array(int i){
       assert_le(0,i);
       assert_le(i,sizeMomentsArray);
-      return *moments10[i];
+      return *(moments10Array[i]);
     }
 
     /*! print electromagnetic fields info */
@@ -390,7 +396,8 @@ class EMfields3D                // :public Field
     /* temporary arrays for summing moments */
     int sizeMomentsArray;
     Moments **momentsArray;
-    arr4_double** moments10;
+    Moments10 **moments10Array;
+    //arr4_double** moments10;
 
 
     // *******************************************************************************
diff --git a/include/Moments.h b/include/Moments.h
index fd28e169..33c706fa 100644
--- a/include/Moments.h
+++ b/include/Moments.h
@@ -2,6 +2,29 @@
 #define Moments_H
 #include "Alloc.h"
 
+// stores ten moment components per node in one (nxn x nyn x nzn x 10) array
+class Moments10
+{
+  private:
+    arr4_double arr;
+    int nx;
+    int ny;
+    int nz;
+  public:
+    void set_to_zero();
+
+    // fetch accessors (write access)
+    arr4_double fetch_arr() { return arr; }
+    // initializers are listed in member declaration order (arr comes first)
+    Moments10(int nxn, int nyn, int nzn) :
+      arr (nxn, nyn, nzn,10),
+      nx(nxn),
+      ny(nyn),
+      nz(nzn)
+    {}
+    ~Moments10(){}
+};
+
 // class to accumulate node-centered species moments
 // 
 class Moments {
diff --git a/include/arraysfwd.h b/include/arraysfwd.h
index 41fdbc19..9341a381 100644
--- a/include/arraysfwd.h
+++ b/include/arraysfwd.h
@@ -49,4 +49,13 @@ typedef iPic3D::array1<double> array1_double;
 typedef iPic3D::array2<double> array2_double;
 typedef iPic3D::array3<double> array3_double;
 typedef iPic3D::array4<double> array4_double;
+// This directive should be consistent with the directives in Alloc.h
+#if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
+typedef iPic3D::array_fetch1<double> arr1_double_fetch;
+typedef iPic3D::array_get1<double> arr1_double_get;
+#else
+typedef double* arr1_double_fetch;
+typedef double* arr1_double_get;
+#endif
+
 #endif
diff --git a/inputfiles/GEM.inp b/inputfiles/GEM.inp
index e6f9bc23..f3eb5aab 100644
--- a/inputfiles/GEM.inp
+++ b/inputfiles/GEM.inp
@@ -53,8 +53,8 @@ nzc =  1                   # nzc = number of cells - z direction
 
 # %%%%%%%%%%%%%% MPI TOPOLOGY %%%%%%%%%%%%%%
 # number of MPI subdomains in each direction
-XLEN = 4
-YLEN = 4
+XLEN = 2
+YLEN = 2
 ZLEN = 1
 # topology of subdomains in each dimension (1=true, 0=false)
 PERIODICX = 1

From 7c6db4e769f6b23a00facc75dc1d29662e32baa5 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 2 Oct 2013 18:13:20 +0200
Subject: [PATCH 044/118] commenting out deprecated TenMoments class with
 preprocessor directives

---
 fields/EMfields3D.cpp         | 150 +++++++++++++++-------------------
 fields/Moments.cpp            |   4 +-
 include/EMfields3D.h          |  23 +++---
 include/Moments.h             |   8 +-
 particles/Particles3Dcomm.cpp |   1 -
 5 files changed, 83 insertions(+), 103 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 026698df..d37a90fb 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -192,15 +192,16 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) :
   injFieldsRear   = new injInfoFields(nxn, nyn, nzn);
 
   sizeMomentsArray = omp_thread_count();
-  momentsArray = new Moments*[sizeMomentsArray];
+  #ifdef TENMOMENTS
+  tenMomentsArray = new TenMoments*[sizeMomentsArray];
+  #endif // TENMOMENTS
   moments10Array = new Moments10*[sizeMomentsArray];
-  //moments10 = (arr4_double**) malloc(sizeof(void*)*sizeMomentsArray);
   for(int i=0;i<sizeMomentsArray;i++)
   {
-    momentsArray[i] = new Moments(nxn,nyn,nzn);
+    #ifdef TENMOMENTS
+    tenMomentsArray[i] = new TenMoments(nxn,nyn,nzn);
+    #endif // TENMOMENTS
     moments10Array[i] = new Moments10(nxn,nyn,nzn);
-    //momentsArray[i]->init(nxn,nyn,nzn);
-    //moments10[i] = new arr4_double(nxn,nyn,nzn,10);
   }
 }
 
@@ -226,7 +227,6 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   //
   const int is = pcls.get_ns();
   bool bmoments10 = true;
-  bool b10moments = false; // turn on doing it the old way
 
   // if b10moments
   double* rhons1d = &rhons[is][0][0][0];
@@ -251,14 +251,9 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   #pragma omp parallel
   {
     int thread_num = omp_get_thread_num();
-    Moments& speciesMoments = fetch_momentsArray(thread_num);
+    #ifdef TENMOMENTS
+    TenMoments& speciesMoments = fetch_momentsArray(thread_num);
     speciesMoments.set_to_zero();
-    Moments10& speciesMoments10 = fetch_moments10Array(thread_num);
-    speciesMoments10.set_to_zero();
-    arr4_double moments = speciesMoments10.fetch_arr();
-    //arr4_double moments = fetch_moments10(thread_num);
-    //moments.setall(0.);
-    //
     arr3_double rho = speciesMoments.fetch_rho();
     arr3_double Jx  = speciesMoments.fetch_Jx();
     arr3_double Jy  = speciesMoments.fetch_Jy();
@@ -269,6 +264,10 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
     arr3_double Pyy = speciesMoments.fetch_Pyy();
     arr3_double Pyz = speciesMoments.fetch_Pyz();
     arr3_double Pzz = speciesMoments.fetch_Pzz();
+    #endif // TENMOMENTS
+    Moments10& speciesMoments10 = fetch_moments10Array(thread_num);
+    speciesMoments10.set_to_zero();
+    arr4_double moments = speciesMoments10.fetch_arr();
     // The following loop is expensive, so it is wise to assume that the
     // compiler is stupid.  Therefore we should on the one hand
     // expand things out and on the other hand avoid repeating computations.
@@ -444,7 +443,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
         //}
       }
 
-      if(b10moments)
+      #ifdef TENMOMENTS
       {
         // use the weight to distribute the moments
         //
@@ -549,29 +548,16 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
         Pzz[ix-1][iy-1][iz  ] += wwi*weight110;
         Pzz[ix-1][iy-1][iz-1] += wwi*weight111;
       }
+      #endif // TENMOMENTS
 
-      // why on earth do I observe the following:
-      // * without openmp, b10moments and bmoments10 gives same results,
-      // * b10moments gives same results with and without openmp, and
-      // * bmoments10 gives wrong results when I use openmp.
-      // I'm using Moments class and moments array exactly the same way
-      // as far as openmp is concerned...  To isolate the problem,
-      // gradually morph Moments class until implemented via arr4_double...
-      // Problem in constructor?
-      // 
-      // 
-      if(b10moments && bmoments10)
+      #ifdef TENMOMENTS
       {
         // check work
         for(int jx=0;jx<2;jx++)
         for(int jy=0;jy<2;jy++)
         for(int jz=0;jz<2;jz++)
         {
-          //dprintf("gothere");
-          //dprintf("%24.16f == rho[ix-jx][iy-jy][iz-jz]", rho[ix-jx][iy-jy][iz-jz]);
-          //dprintf("%24.16f == moments[ix-jx][iy-jy][iz-jz][0]", moments[ix-jx][iy-jy][iz-jz][0]);
           assert_eq(rho[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][0]);
-          //dprintf("gothere");
           assert_eq(Jx [ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][1]);
           assert_eq(Jy [ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][2]);
           assert_eq(Jz [ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][3]);
@@ -583,53 +569,53 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
           assert_eq(Pzz[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][9]);
         }
       }
+      #endif // TENMOMENTS
     }
     // split up the reduction tasks.
     //
-    if(b10moments)
+    //{
+    //  //
+    //  // One-dimensional array access is presumably
+    //  // more efficient on poor compilers.
+    //  double* rho1d = &rho[0][0][0];
+    //  double* Jx1d  = &Jx [0][0][0];
+    //  double* Jy1d  = &Jy [0][0][0];
+    //  double* Jz1d  = &Jz [0][0][0];
+    //  double* Pxx1d = &Pxx[0][0][0];
+    //  double* Pxy1d = &Pxy[0][0][0];
+    //  double* Pxz1d = &Pxz[0][0][0];
+    //  double* Pyy1d = &Pyy[0][0][0];
+    //  double* Pyz1d = &Pyz[0][0][0];
+    //  double* Pzz1d = &Pzz[0][0][0];
+    //  ////
+    //  assert_eq(speciesMoments.get_nx(), nxn);
+    //  assert_eq(speciesMoments.get_ny(), nyn);
+    //  assert_eq(speciesMoments.get_nz(), nzn);
+    //  const int numel = nxn*nyn*nzn;
+    //  #pragma omp critical
+    //  for(int i=0;i<numel;i++) rhons1d[i] += invVOL*rho1d[i];
+    //  #pragma omp critical
+    //  for(int i=0;i<numel;i++) Jxs1d  [i] += invVOL*Jx1d [i];
+    //  #pragma omp critical
+    //  for(int i=0;i<numel;i++) Jys1d  [i] += invVOL*Jy1d [i];
+    //  #pragma omp critical
+    //  for(int i=0;i<numel;i++) Jzs1d  [i] += invVOL*Jz1d [i];
+    //  #pragma omp critical
+    //  for(int i=0;i<numel;i++) pXXsn1d[i] += invVOL*Pxx1d[i];
+    //  #pragma omp critical
+    //  for(int i=0;i<numel;i++) pXYsn1d[i] += invVOL*Pxy1d[i];
+    //  #pragma omp critical
+    //  for(int i=0;i<numel;i++) pXZsn1d[i] += invVOL*Pxz1d[i];
+    //  #pragma omp critical
+    //  for(int i=0;i<numel;i++) pYYsn1d[i] += invVOL*Pyy1d[i];
+    //  #pragma omp critical
+    //  for(int i=0;i<numel;i++) pYZsn1d[i] += invVOL*Pyz1d[i];
+    //  #pragma omp critical
+    //  for(int i=0;i<numel;i++) pZZsn1d[i] += invVOL*Pzz1d[i];
+    //}
+
+    // reduce arrays
     {
-      //
-      // One-dimensional array access is presumably
-      // more efficient on poor compilers.
-      double* rho1d = &rho[0][0][0];
-      double* Jx1d  = &Jx [0][0][0];
-      double* Jy1d  = &Jy [0][0][0];
-      double* Jz1d  = &Jz [0][0][0];
-      double* Pxx1d = &Pxx[0][0][0];
-      double* Pxy1d = &Pxy[0][0][0];
-      double* Pxz1d = &Pxz[0][0][0];
-      double* Pyy1d = &Pyy[0][0][0];
-      double* Pyz1d = &Pyz[0][0][0];
-      double* Pzz1d = &Pzz[0][0][0];
-      ////
-      assert_eq(speciesMoments.get_nx(), nxn);
-      assert_eq(speciesMoments.get_ny(), nyn);
-      assert_eq(speciesMoments.get_nz(), nzn);
-      const int numel = nxn*nyn*nzn;
-      #pragma omp critical
-      for(int i=0;i<numel;i++) rhons1d[i] += invVOL*rho1d[i];
-      #pragma omp critical
-      for(int i=0;i<numel;i++) Jxs1d  [i] += invVOL*Jx1d [i];
-      #pragma omp critical
-      for(int i=0;i<numel;i++) Jys1d  [i] += invVOL*Jy1d [i];
-      #pragma omp critical
-      for(int i=0;i<numel;i++) Jzs1d  [i] += invVOL*Jz1d [i];
-      #pragma omp critical
-      for(int i=0;i<numel;i++) pXXsn1d[i] += invVOL*Pxx1d[i];
-      #pragma omp critical
-      for(int i=0;i<numel;i++) pXYsn1d[i] += invVOL*Pxy1d[i];
-      #pragma omp critical
-      for(int i=0;i<numel;i++) pXZsn1d[i] += invVOL*Pxz1d[i];
-      #pragma omp critical
-      for(int i=0;i<numel;i++) pYYsn1d[i] += invVOL*Pyy1d[i];
-      #pragma omp critical
-      for(int i=0;i<numel;i++) pYZsn1d[i] += invVOL*Pyz1d[i];
-      #pragma omp critical
-      for(int i=0;i<numel;i++) pZZsn1d[i] += invVOL*Pzz1d[i];
-    }
-    else if(bmoments10)
-    {
-      //
       #pragma omp critical
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { rhons[is][i][j][k] += invVOL*moments[i][j][k][0]; }}
@@ -661,10 +647,6 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }}
     }
-    else
-    {
-      eprintf("reduction impossible without data!");
-    }
   }
   communicateGhostP2G(is, 0, 0, 0, 0, vct);
 }
@@ -1598,7 +1580,7 @@ void EMfields3D::communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft,
 }
 
 /* add moments (e.g. from an OpenMP thread) to the accumulated moments */
-//void EMfields3D::addToSpeciesMoments(const Moments & in, int is) {
+//void EMfields3D::addToSpeciesMoments(const TenMoments & in, int is) {
 //  assert_eq(in.get_nx(), nxn);
 //  assert_eq(in.get_ny(), nyn);
 //  assert_eq(in.get_nz(), nzn);
@@ -3556,14 +3538,10 @@ EMfields3D::~EMfields3D() {
   delete injFieldsBottom;
   delete injFieldsFront;
   delete injFieldsRear;
-  for(int i=0;i<sizeMomentsArray;i++)
-  {
-    delete momentsArray[i];
-    delete moments10Array[i];
-    //moments10[i]->free();
-  }
-  delete [] momentsArray;
+  #ifdef TENMOMENTS
+  for(int i=0;i<sizeMomentsArray;i++) { delete tenMomentsArray[i]; }
+  delete [] tenMomentsArray;
+  #endif // TENMOMENTS
+  for(int i=0;i<sizeMomentsArray;i++) { delete moments10Array[i]; }
   delete [] moments10Array;
-  //delete [] moments10;
-  //free(moments10);
 }
diff --git a/fields/Moments.cpp b/fields/Moments.cpp
index 32376155..5a4c5d0e 100644
--- a/fields/Moments.cpp
+++ b/fields/Moments.cpp
@@ -13,7 +13,8 @@ void Moments10::set_to_zero()
   }
 }
 
-void Moments::set_to_zero() {
+#ifdef TENMOMENTS
+void TenMoments::set_to_zero() {
   #pragma omp parallel for collapse(3)
   for (register int i = 0; i < nx; i++)
     for (register int j = 0; j < ny; j++)
@@ -30,4 +31,5 @@ void Moments::set_to_zero() {
         pZZ[i][j][k] = 0.0;
       }
 }
+#endif // TENMOMENTS
 
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 07999eeb..9ce5f215 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -31,7 +31,9 @@ using std::endl;
 /*! Electromagnetic fields and sources defined for each local grid, and for an implicit maxwell's solver @date May 2008 @par Copyright: (C) 2008 KUL @author Stefano Markidis, Giovanni Lapenta. @version 3.0 */
 
 class Particles3Dcomm;
-class Moments;
+#ifdef TENMOMENTS
+class TenMoments;
+#endif // TENMOMENTS
 class Moments10;
 class EMfields3D                // :public Field
 {
@@ -120,7 +122,7 @@ class EMfields3D                // :public Field
     void communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, VirtualTopology3D * vct);
     void sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
     /*! add accumulated moments to the moments for a given species */
-    //void addToSpeciesMoments(const Moments & in, int is);
+    //void addToSpeciesMoments(const TenMoments & in, int is);
     /*! add an amount of charge density to charge density field at node X,Y,Z */
     void addRho(double weight[][2][2], int X, int Y, int Z, int is);
     /*! add an amount of current density - direction X to current density field at node X,Y,Z */
@@ -257,16 +259,13 @@ class EMfields3D                // :public Field
     double getBenergy();
 
     /*! fetch array for summing moments of thread i */
-    Moments& fetch_momentsArray(int i){
+    #ifdef TENMOMENTS
+    TenMoments& fetch_momentsArray(int i){
       assert_le(0,i);
       assert_le(i,sizeMomentsArray);
-      return *(momentsArray[i]);
+      return *(tenMomentsArray[i]);
     }
-    //arr4_double fetch_moments10(int i){
-    //  assert_le(0,i);
-    //  assert_le(i,sizeMomentsArray);
-    //  return *(moments10[i]);
-    //}
+    #endif // TENMOMENTS
     Moments10& fetch_moments10Array(int i){
       assert_le(0,i);
       assert_le(i,sizeMomentsArray);
@@ -395,10 +394,10 @@ class EMfields3D                // :public Field
     array3_double divC;
     /* temporary arrays for summing moments */
     int sizeMomentsArray;
-    Moments **momentsArray;
+    #ifdef TENMOMENTS
+    TenMoments **tenMomentsArray;
+    #endif // TENMOMENTS
     Moments10 **moments10Array;
-    //arr4_double** moments10;
-
 
     // *******************************************************************************
     // *********** SOURCES **
diff --git a/include/Moments.h b/include/Moments.h
index 33c706fa..8a4a10cf 100644
--- a/include/Moments.h
+++ b/include/Moments.h
@@ -27,7 +27,8 @@ class Moments10
 
 // class to accumulate node-centered species moments
 // 
-class Moments {
+#ifdef TENMOMENTS
+class TenMoments {
   private:
     arr3_double rho;
 
@@ -73,7 +74,7 @@ class Moments {
     arr3_double fetch_Pyz() { return pYZ; }
     arr3_double fetch_Pzz() { return pZZ; }
   public:
-    Moments(int nxn, int nyn, int nzn) :
+    TenMoments(int nxn, int nyn, int nzn) :
       nx(nxn),
       ny(nyn),
       nz(nzn),
@@ -89,8 +90,9 @@ class Moments {
       pZZ (nxn, nyn, nzn)
     {
     };
-    ~Moments(){};
+    ~TenMoments(){};
     void set_to_zero();
 };
+#endif // TENMOMENTS
 
 #endif
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index e9c73aaa..1120de1c 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -20,7 +20,6 @@ developers: Stefano Markidis, Giovanni Lapenta.
 #include "Grid.h"
 #include "Grid3DCU.h"
 #include "Field.h"
-#include "Moments.h"
 #include "MPIdata.h"
 #include "ompdefs.h"
 

From 3304a90fed21a995394b2d6c91654f03cc1f8644 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 4 Oct 2013 16:27:56 +0200
Subject: [PATCH 045/118] issue #49: created ipic.py to replace ipic scripts

---
 scripts/ipic.py | 232 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100755 scripts/ipic.py

diff --git a/scripts/ipic.py b/scripts/ipic.py
new file mode 100755
index 00000000..7ae763e7
--- /dev/null
+++ b/scripts/ipic.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python
+
+import sys
+import getopt
+# http://docs.python.org/2/library/collections.html#collections.deque
+from collections import deque # double-ended queue
+import os
+#from optparse import OptionParser
+
+# useful documentation:
+#
+# http://effbot.org/zone/python-list.htm
+# http://pymotw.com/2/subprocess/
+# http://stackoverflow.com/questions/3777301/how-to-call-a-shell-script-from-python-code
+
+def ipic_ctags(args):
+    # create tags file using ctags
+    create_tags_command = \
+        '''find . -name '*.cpp' -or -name '*.h' | xargs ctags --extra=+qf'''
+    print create_tags_command
+    os.system(create_tags_command)
+    # sort tags file
+    sort_tags_command = '''LC_ALL=C sort -u tags -o tags'''
+    print sort_tags_command
+    os.system(sort_tags_command)
+
+def ipic_help():
+    print '''
+  To build, in the iPic3D directory you can use:
+  
+    rm -rf build # if necessary
+    mkdir build
+    cd build
+    cmake ..
+    make # or "make -j" to compile in parallel
+  
+  To run the code you can use
+  
+    mkdir data
+    mpiexec -n 4 exec/iPic3D ../inputfiles/GEM.inp
+  
+  where 4 = XLEN times YLEN times ZLEN (defined in GEM.inp).
+  
+  Available subcommands:
+
+    ''', progname, '''help ctags
+    ''', progname, '''help mic
+  '''
+
+def ipic_help_mic(args):
+    print '''
+  See "ipic help".  Modifications are as follows.
+
+  To run on the Xeon host processor, use something like:
+  
+    mpiexec.hydra -n 8 -env OMP_NUM_THREADS=4 exec/iPic3D ../inputfiles/GEM.inp
+  
+  where 8 = XLEN times YLEN times ZLEN.
+  
+  If you want to cross-compile for the MIC, then the instructions are
+  different:
+  
+      mkdir build.phi
+      cd build.phi
+      cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/cmake_template.cmake.XeonPhi
+      make -j
+  
+  And to run you use, e.g.:
+  
+    mkdir data
+    mpiexec.hydra -host knc2-mic0 -n 50 -env OMP_NUM_THREADS=4 exec/iPic3D ../inputfiles/GEM.inp
+  
+  where 50 = XLEN times YLEN times ZLEN.
+    '''
+
+def ipic_help_ctags(args):
+    print '''
+  Make sure that you are in the source code directory
+  and then run
+
+    ''', progname, '''ctags
+    '''
+
+def ipic_help_git(args):
+    print '''
+    ### This stub gives examples of git commands ###
+
+    # show branch information
+    git branch -avv
+    # examining the .git directory reveals a wealth of information, e.g.:
+    cat .git/config
+    # with --stat all files checked in are displayed.
+    git log --stat
+    # for the following I just do "git tree" (see .gitconfig below):
+    git log --oneline --decorate --graph --branches --source
+    git status # shows file statuses
+    git remote -v # show remote repositories
+    # show commits in chronological order.
+    git reflog
+    # git reflog is useful to get the sha-1 hash of a commit
+    # that you recently made and whose branch you accidentally
+    # deleted, making it no longer reachable.  Note that
+    # each snapshot that you commit should stay in its local
+    # repository for 90 days before being garbage collected
+    # unless you do something like "git gc".  See also
+    # http://gitready.com/advanced/2009/01/17/restoring-lost-commits.html
+    #
+    # show file
+    git show mybranch:myfile
+    eg cat myfile # slightly nicer than git show
+    # show who checked in what line when under what commit.
+    git blame myfile # on current branch
+    git blame amaya-library iPic3D.cpp
+
+  for modification:
+
+    # initialize a repository
+    mkdir localrepository; cd localrepository; git init
+    # creating/removing remote:
+    git remote add myremote  https://github.com/alecjohnson/iPic3D.git
+    git remote rm myremote  
+    # get all branches and their filesystem snapshots
+    # from myremote that are not already in localrepository
+    git fetch myremote 
+    # check in mods
+    git stage myfile
+    git rm oldfile
+    git commit
+    # modify a commit message
+    git commit --amend
+    # create a branch and check it out
+    git checkout -b newbranch
+    # push branch to server
+    eg push --branch newbranch myremote
+    # pull changes from server into current branch
+    git pull
+    # delete a branch on server (!):
+    git push myremote --delete mybranch
+
+  # example of global configuration file:
+
+    $ cat ~/.gitconfig
+    [user]
+    name = eajohnson
+    email = e.alec.johnson@gmail.com
+    [alias]
+    tree = log --oneline --decorate --graph --branches --source
+    undo-commit = reset --soft HEAD~1
+    '''
+
+def help(args):
+    if len(args) == 0:
+      ipic_help()
+      sys.exit()
+    
+    command = deque.popleft(args)
+    if command == "mic":
+      ipic_help_mic(args)
+    elif command == "ctags":
+      ipic_help_ctags(args)
+    elif command == "git":
+      ipic_help_git(args)
+    else:
+        print "ipic help", command, "is not supported"
+        sys.exit(-1)
+
+def usage():
+    print '''
+  usage: ''', progname, ''' [options] <command>
+
+  Available commands:
+    ''', progname, '''ctags
+    ''', progname, '''help
+      '''
+
+def main():
+    """Parse command-line options, then dispatch to the requested subcommand."""
+    global progname
+    progname = os.path.basename(sys.argv[0])
+    global dirname
+    dirname = os.path.dirname(sys.argv[0])
+
+    # it might be better to use the argparse module rather than getopt,
+    # but unfortunately argparse is only available beginning with python 2.7
+    # and most HPC platforms seem to have python 2.6 installed.
+    # optparse has been deprecated (although it still ships with python 3);
+    # note, however, that argparse was initially an extension of optparse
+    # before giving up on backward compatibility.
+    # A parse failure leaves opts/args unbound, so the handler always exits.
+    try:
+      opts, args = getopt.getopt(sys.argv[1:], 'ho:', ['help', 'output='])
+    except getopt.GetoptError, e:
+      if e.opt == 'o' and 'requires argument' in e.msg:
+        print 'ERROR: -o requires filename'
+      else:
+        usage()
+      sys.exit(-1)
+
+    for o, a in opts:
+        if o in ("-h", "--help"):
+          usage()
+          sys.exit()
+        elif o in ("-o", "--output"):
+          output = a
+        #else:
+        #  assert False, "unhandled option"
+
+    numargs = len(args)
+    if numargs==0:
+      usage()
+      sys.exit()
+
+    #print args
+    args = deque(args)
+    command = deque.popleft(args)
+    #print list(args)
+
+    if command == "help":
+        help(args)
+    elif command == "ctags":
+        ipic_ctags(args)
+        #print "ctags not yet implemented"
+    else:
+        print progname, command, "not supported"
+        sys.exit(-1)
+
+    #print os.path.basename(__file__)
+    #print os.path.dirname(__file__)
+
+if __name__ == '__main__':
+    main()
+

From 8ab8b3a6b5b667b41860ddc92bc3f4b764ca9ea7 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 4 Oct 2013 16:31:32 +0200
Subject: [PATCH 046/118] issue #49: removed ipic shell scripts

---
 scripts/ipic            | 16 ----------------
 scripts/ipic-ctags      |  8 --------
 scripts/ipic-help       | 30 ------------------------------
 scripts/ipic-help-ctags | 22 ----------------------
 scripts/ipic-help-mic   | 28 ----------------------------
 scripts/makefiletags    | 14 --------------
 6 files changed, 118 deletions(-)
 delete mode 100755 scripts/ipic
 delete mode 100755 scripts/ipic-ctags
 delete mode 100755 scripts/ipic-help
 delete mode 100755 scripts/ipic-help-ctags
 delete mode 100755 scripts/ipic-help-mic
 delete mode 100755 scripts/makefiletags

diff --git a/scripts/ipic b/scripts/ipic
deleted file mode 100755
index 25158819..00000000
--- a/scripts/ipic
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/sh
-if test $# -lt 1
-then 
-  echo '
-  usage: ipic <command>
-
-  Available ipic commands:
-    ipic ctags
-    ipic help
-'
-  exit
-fi
-DIRNAME=`dirname $0`
-APPENDIX="$1"
-shift
-exec "${DIRNAME}/ipic-${APPENDIX}" "$@"
diff --git a/scripts/ipic-ctags b/scripts/ipic-ctags
deleted file mode 100755
index df4767be..00000000
--- a/scripts/ipic-ctags
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-DIRNAME=`dirname $0`
-echo creating tags file using ctags
-find . -name '*.cpp' -or -name '*.h' | xargs ctags --extra=+q
-echo creating tag for each C++ file
-find . -name '*.cpp' -or -name '*.h' | xargs $DIRNAME/makefiletags >> tags
-echo sorting tags file
-LC_ALL=C sort -u tags -o tags
diff --git a/scripts/ipic-help b/scripts/ipic-help
deleted file mode 100755
index 98a88e71..00000000
--- a/scripts/ipic-help
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-if test $# -lt 1
-then
-  echo '
-  To build, in the iPic3D directory you can use:
-  
-    rm -rf build # if necessary
-    mkdir build
-    cd build
-    cmake ..
-    make # or "make -j" to compile in parallel
-  
-  To run the code you can use
-  
-    mkdir data
-    mpiexec.hydra -n 4 -env OMP_NUM_THREADS=1 exec/iPic3D ../inputfiles/GEM.inp
-  
-  where 4 = XLEN times YLEN times ZLEN (defined in GEM.inp).
-  
-  Available subcommands:
-
-    ipic help ctags
-    ipic help mic
-'
-  exit
-fi
-DIRNAME=`dirname $0`
-APPENDIX="$1"
-shift
-exec "${DIRNAME}/ipic-help-${APPENDIX}" "$@"
diff --git a/scripts/ipic-help-ctags b/scripts/ipic-help-ctags
deleted file mode 100755
index e80c00c1..00000000
--- a/scripts/ipic-help-ctags
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/sh
-DIRNAME=`dirname $0`
-SCRIPTSDIRNAME=`cd "${DIRNAME}"; pwd`
-PARENTOFSCRIPTSDIRNAME=`dirname "${SCRIPTSDIRNAME}"`
-#if test $# -lt 1
-#then
-  echo '
-  Make sure that you are in the source code directory via e.g.
-
-    cd '"${PARENTOFSCRIPTSDIRNAME}"'
-
-  and then run the script, e.g. via
-
-    '"${SCRIPTSDIRNAME}"'/ipic ctags
-
-  or
-
-    ipic ctags
-
-  if you have '"${DIRNAME}"' in your path.
-'
-
diff --git a/scripts/ipic-help-mic b/scripts/ipic-help-mic
deleted file mode 100755
index d983dd72..00000000
--- a/scripts/ipic-help-mic
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/sh
-#if test $# -lt 1
-#then
-  echo '
-  See "ipic help".  Modifications are as follows.
-
-  To run on the Xeon host processor, use something like:
-  
-    mpiexec.hydra -n 8 -env OMP_NUM_THREADS=4 exec/iPic3D ../inputfiles/GEM.inp
-  
-  where 8 = XLEN times YLEN times ZLEN.
-  
-  If you want to cross-compile for the MIC, then the instructions are
-  different:
-  
-      mkdir build.phi
-      cd build.phi
-      cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/cmake_template.cmake.XeonPhi
-      make -j
-  
-  And to run you use, e.g.:
-  
-    mkdir data
-    mpiexec.hydra -host knc2-mic0 -n 50 -env OMP_NUM_THREADS=4 exec/iPic3D ../inputfiles/GEM.inp
-  
-  where 50 = XLEN times YLEN times ZLEN.
-'
-
diff --git a/scripts/makefiletags b/scripts/makefiletags
deleted file mode 100755
index da0fb7d1..00000000
--- a/scripts/makefiletags
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/sh
-# generate a tag for each file name argument
-ls $* 2>&1| sed 's/ /\
-/g' \
-| perl -ne '
-    if(m@/@) {
-      m@(.*?)/([^/\n]+)$@;
-      print "$2\t$1/$2\t1\n";
-    } else {
-      m@(.*)@;
-      print "$1\t$1\t1\n";
-    }
-  '
-echo "tags	tags	1" >> tags

From 4f74c32a626403a42086a89901d511701c179571 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 4 Oct 2013 16:37:15 +0200
Subject: [PATCH 047/118] iss #49: scripts/ipic linked to scripts/ipic.py

---
 makefile     | 2 +-
 scripts/ipic | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 create mode 120000 scripts/ipic

diff --git a/makefile b/makefile
index 519cb605..3ca3c2a6 100644
--- a/makefile
+++ b/makefile
@@ -6,7 +6,7 @@ help:
 tags: retags
 
 retags:
-	scripts/ipic-ctags
+	scripts/ipic ctags
 
 #monitor:
 #	less +F data/ConservedQuantities.txt
diff --git a/scripts/ipic b/scripts/ipic
new file mode 120000
index 00000000..e264ff86
--- /dev/null
+++ b/scripts/ipic
@@ -0,0 +1 @@
+ipic.py
\ No newline at end of file

From 4b92444ed6a1c71a6effb4ae7ba46f029e26569c Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 7 Oct 2013 10:17:13 +0200
Subject: [PATCH 048/118] give environment info in response to "ipic help deep"

---
 scripts/ipic.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/scripts/ipic.py b/scripts/ipic.py
index 7ae763e7..90948c4b 100755
--- a/scripts/ipic.py
+++ b/scripts/ipic.py
@@ -45,6 +45,7 @@ def ipic_help():
 
     ''', progname, '''help ctags
     ''', progname, '''help mic
+    ''', progname, '''help deep
   '''
 
 def ipic_help_mic(args):
@@ -71,6 +72,21 @@ def ipic_help_mic(args):
     mpiexec.hydra -host knc2-mic0 -n 50 -env OMP_NUM_THREADS=4 exec/iPic3D ../inputfiles/GEM.inp
   
   where 50 = XLEN times YLEN times ZLEN.
+
+  See also:
+    ''', progname, '''help deep
+    '''
+
+def ipic_help_deep(args):
+    print '''
+  DEEP needs the following modules:
+
+    module load hdf5/1.8.10-patch1
+    module load knc/intel_mpi/4.1.0.030
+    module load knc/mic
+
+  For instructions on how to build and run, see
+    ''', progname, '''help mic
     '''
 
 def ipic_help_ctags(args):
@@ -156,6 +172,8 @@ def help(args):
     command = deque.popleft(args)
     if command == "mic":
       ipic_help_mic(args)
+    elif command == "deep":
+      ipic_help_deep(args)
     elif command == "ctags":
       ipic_help_ctags(args)
     elif command == "git":

From 179b9a56d21d1c2b45b6cd3d8239281eb1894796 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 7 Oct 2013 12:03:25 +0200
Subject: [PATCH 049/118] added support for single-precision particles

---
 include/Particles.h       |  3 ++
 particles/Particles3D.cpp | 90 +++++++++++++++++++--------------------
 2 files changed, 48 insertions(+), 45 deletions(-)

diff --git a/include/Particles.h b/include/Particles.h
index e8a903da..f5420a75 100644
--- a/include/Particles.h
+++ b/include/Particles.h
@@ -21,6 +21,9 @@ developers: Stefano Markidis, Giovanni Lapenta
  *
  */
 
+// precision to use for particles
+//typedef float pfloat;
+typedef double pfloat;
 
 class Particles {
 public:
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 469fff44..76e92f08 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -333,24 +333,24 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
 #pragma simd                    // this just slows things down (why?)
   for (int rest = 0; rest < nop; rest++) {
     // copy the particle
-    double xp = x[rest];
-    double yp = y[rest];
-    double zp = z[rest];
-    double up = u[rest];
-    double vp = v[rest];
-    double wp = w[rest];
-    const double xptilde = x[rest];
-    const double yptilde = y[rest];
-    const double zptilde = z[rest];
-    double uptilde;
-    double vptilde;
-    double wptilde;
+    pfloat xp = x[rest];
+    pfloat yp = y[rest];
+    pfloat zp = z[rest];
+    pfloat up = u[rest];
+    pfloat vp = v[rest];
+    pfloat wp = w[rest];
+    const pfloat xptilde = x[rest];
+    const pfloat yptilde = y[rest];
+    const pfloat zptilde = z[rest];
+    pfloat uptilde;
+    pfloat vptilde;
+    pfloat wptilde;
     // calculate the average velocity iteratively
     for (int innter = 0; innter < 1; innter++) {
       // interpolation G-->P
-      const double ixd = floor((xp - xstart) * inv_dx);
-      const double iyd = floor((yp - ystart) * inv_dy);
-      const double izd = floor((zp - zstart) * inv_dz);
+      const pfloat ixd = floor((xp - xstart) * inv_dx);
+      const pfloat iyd = floor((yp - ystart) * inv_dy);
+      const pfloat izd = floor((zp - zstart) * inv_dz);
       int ix = 2 + int (ixd);
       int iy = 2 + int (iyd);
       int iz = 2 + int (izd);
@@ -367,9 +367,9 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       if (iz > nzn - 1)
         iz = nzn - 1;
 
-      double xi[2];
-      double eta[2];
-      double zeta[2];
+      pfloat xi[2];
+      pfloat eta[2];
+      pfloat zeta[2];
       xi[0]   = xp - grid->getXN(ix-1);
       eta[0]  = yp - grid->getYN(iy-1);
       zeta[0] = zp - grid->getZN(iz-1);
@@ -377,16 +377,16 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       eta[1]  = grid->getYN(iy) - yp;
       zeta[1] = grid->getZN(iz) - zp;
 
-      double Exl = 0.0;
-      double Eyl = 0.0;
-      double Ezl = 0.0;
-      double Bxl = 0.0;
-      double Byl = 0.0;
-      double Bzl = 0.0;
+      pfloat Exl = 0.0;
+      pfloat Eyl = 0.0;
+      pfloat Ezl = 0.0;
+      pfloat Bxl = 0.0;
+      pfloat Byl = 0.0;
+      pfloat Bzl = 0.0;
 
       // MIC refuses to vectorize this ...
       // 
-      // double weight[2][2][2];
+      // pfloat weight[2][2][2];
       // for (int ii = 0; ii < 2; ii++)
       // for (int jj = 0; jj < 2; jj++)
       // for (int kk = 0; kk < 2; kk++)
@@ -394,12 +394,12 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       // for (int ii = 0; ii < 2; ii++)
       // for (int jj = 0; jj < 2; jj++)
       // for (int kk = 0; kk < 2; kk++) {
-      // const double Exlp = weight[ii][jj][kk] * Ex.get(ix - ii, iy - jj, iz - kk);
-      // const double Eylp = weight[ii][jj][kk] * Ey.get(ix - ii, iy - jj, iz - kk);
-      // const double Ezlp = weight[ii][jj][kk] * Ez.get(ix - ii, iy - jj, iz - kk);
-      // const double Bxlp = weight[ii][jj][kk] * Bx.get(ix - ii, iy - jj, iz - kk);
-      // const double Bylp = weight[ii][jj][kk] * By.get(ix - ii, iy - jj, iz - kk);
-      // const double Bzlp = weight[ii][jj][kk] * Bz.get(ix - ii, iy - jj, iz - kk);
+      // const pfloat Exlp = weight[ii][jj][kk] * Ex.get(ix - ii, iy - jj, iz - kk);
+      // const pfloat Eylp = weight[ii][jj][kk] * Ey.get(ix - ii, iy - jj, iz - kk);
+      // const pfloat Ezlp = weight[ii][jj][kk] * Ez.get(ix - ii, iy - jj, iz - kk);
+      // const pfloat Bxlp = weight[ii][jj][kk] * Bx.get(ix - ii, iy - jj, iz - kk);
+      // const pfloat Bylp = weight[ii][jj][kk] * By.get(ix - ii, iy - jj, iz - kk);
+      // const pfloat Bzlp = weight[ii][jj][kk] * Bz.get(ix - ii, iy - jj, iz - kk);
       // Exl += Exlp;
       // Eyl += Eylp;
       // Ezl += Ezlp;
@@ -410,14 +410,14 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
 
       // ... so we expand things out instead
       // 
-      const double weight000 = xi[0] * eta[0] * zeta[0] * invVOL;
-      const double weight001 = xi[0] * eta[0] * zeta[1] * invVOL;
-      const double weight010 = xi[0] * eta[1] * zeta[0] * invVOL;
-      const double weight011 = xi[0] * eta[1] * zeta[1] * invVOL;
-      const double weight100 = xi[1] * eta[0] * zeta[0] * invVOL;
-      const double weight101 = xi[1] * eta[0] * zeta[1] * invVOL;
-      const double weight110 = xi[1] * eta[1] * zeta[0] * invVOL;
-      const double weight111 = xi[1] * eta[1] * zeta[1] * invVOL;
+      const pfloat weight000 = xi[0] * eta[0] * zeta[0] * invVOL;
+      const pfloat weight001 = xi[0] * eta[0] * zeta[1] * invVOL;
+      const pfloat weight010 = xi[0] * eta[1] * zeta[0] * invVOL;
+      const pfloat weight011 = xi[0] * eta[1] * zeta[1] * invVOL;
+      const pfloat weight100 = xi[1] * eta[0] * zeta[0] * invVOL;
+      const pfloat weight101 = xi[1] * eta[0] * zeta[1] * invVOL;
+      const pfloat weight110 = xi[1] * eta[1] * zeta[0] * invVOL;
+      const pfloat weight111 = xi[1] * eta[1] * zeta[1] * invVOL;
       // 
       Bxl += weight000 * Bx[ix][iy][iz];
       Bxl += weight001 * Bx[ix][iy][iz - 1];
@@ -474,13 +474,13 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       Ezl += weight111 * Ez[ix - 1][iy - 1][iz - 1];
 
       // end interpolation
-      const double omdtsq = qomdt2 * qomdt2 * (Bxl * Bxl + Byl * Byl + Bzl * Bzl);
-      const double denom = 1.0 / (1.0 + omdtsq);
+      const pfloat omdtsq = qomdt2 * qomdt2 * (Bxl * Bxl + Byl * Byl + Bzl * Bzl);
+      const pfloat denom = 1.0 / (1.0 + omdtsq);
       // solve the position equation
-      const double ut = up + qomdt2 * Exl;
-      const double vt = vp + qomdt2 * Eyl;
-      const double wt = wp + qomdt2 * Ezl;
-      const double udotb = ut * Bxl + vt * Byl + wt * Bzl;
+      const pfloat ut = up + qomdt2 * Exl;
+      const pfloat vt = vp + qomdt2 * Eyl;
+      const pfloat wt = wp + qomdt2 * Ezl;
+      const pfloat udotb = ut * Bxl + vt * Byl + wt * Bzl;
       // solve the velocity equation 
       uptilde = (ut + qomdt2 * (vt * Bzl - wt * Byl + qomdt2 * udotb * Bxl)) * denom;
       vptilde = (vt + qomdt2 * (wt * Bxl - ut * Bzl + qomdt2 * udotb * Byl)) * denom;

From 5da183b05da30f7215653188b9a73f820dc9e050 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 8 Oct 2013 10:45:24 +0200
Subject: [PATCH 050/118] issue #52: use separate fieldForPcls array to push
 particles

---
 communication/ComNodes3D.cpp |   1 +
 fields/EMfields3D.cpp        |  28 +++++-
 grids/Grid3DCU.cpp           |   6 ++
 include/EMfields3D.h         |   8 ++
 include/Grid3DCU.h           |   8 +-
 include/Particles.h          |   4 -
 include/arraysfwd.h          |   5 +
 include/ipicdefs.h           |  19 ++++
 main/iPic3Dlib.cpp           |   4 +
 particles/Particles3D.cpp    | 172 +++++++++++++++++++++++++++++++----
 10 files changed, 230 insertions(+), 25 deletions(-)

diff --git a/communication/ComNodes3D.cpp b/communication/ComNodes3D.cpp
index 73906404..0d97ce89 100644
--- a/communication/ComNodes3D.cpp
+++ b/communication/ComNodes3D.cpp
@@ -1,4 +1,5 @@
 
+#include "mpi.h"
 #include "ComNodes3D.h"
 #include "TimeTasks.h"
 #include "ipicdefs.h"
diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index d37a90fb..9856e3a8 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -52,6 +52,7 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) :
   //
   // array allocation: nodes
   //
+  fieldForPcls  (nxn, nyn, nzn, 6),
   Ex   (nxn, nyn, nzn),
   Ey   (nxn, nyn, nzn),
   Ez   (nxn, nyn, nzn),
@@ -226,9 +227,8 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   double const*const q = pcls.getQall();
   //
   const int is = pcls.get_ns();
-  bool bmoments10 = true;
 
-  // if b10moments
+  #ifdef TENMOMENTS
   double* rhons1d = &rhons[is][0][0][0];
   double* Jxs1d   = &Jxs  [is][0][0][0];
   double* Jys1d   = &Jys  [is][0][0][0];
@@ -239,6 +239,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   double* pYYsn1d = &pYYsn[is][0][0][0];
   double* pYZsn1d = &pYZsn[is][0][0][0];
   double* pZZsn1d = &pZZsn[is][0][0][0];
+  #endif
   //
   const long long nop_ll = pcls.getNOP();
   const int nop = pcls.getNOP();
@@ -325,7 +326,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
       const double weight110 = qi * xi1 * eta1 * zeta0 * invVOL;
       const double weight111 = qi * xi1 * eta1 * zeta1 * invVOL;
 
-      if(bmoments10)
+      // add particle to moments
       {
         arr1_double_fetch moments000 = moments[ix  ][iy  ][iz  ];
         arr1_double_fetch moments001 = moments[ix  ][iy  ][iz-1];
@@ -1375,6 +1376,27 @@ void EMfields3D::ConstantChargePlanet(Grid * grid, VirtualTopology3D * vct, doub
 
 }
 
+/*! Populate the field data used to push particles */
+// 
+// One could add a background magnetic field B_ext at this point,
+// which was incompletely implemented in commit 05082fc8ad688
+//
+void EMfields3D::set_fieldForPcls()
+{
+  #pragma omp parallel for collapse(3)
+  for(int i=0;i<nxn;i++)
+  for(int j=0;j<nyn;j++)
+  for(int k=0;k<nzn;k++)
+  {
+    fieldForPcls[i][j][k][0] = (pfloat) Bxn[i][j][k];
+    fieldForPcls[i][j][k][1] = (pfloat) Byn[i][j][k];
+    fieldForPcls[i][j][k][2] = (pfloat) Bzn[i][j][k];
+    fieldForPcls[i][j][k][3] = (pfloat) Ex[i][j][k];
+    fieldForPcls[i][j][k][4] = (pfloat) Ey[i][j][k];
+    fieldForPcls[i][j][k][5] = (pfloat) Ez[i][j][k];
+  }
+}
+
 /*! Calculate Magnetic field with the implicit solver: calculate B defined on nodes With E(n+ theta) computed, the magnetic field is evaluated from Faraday's law */
 void EMfields3D::calculateB(Grid * grid, VirtualTopology3D * vct, Collective *col) {
   if (vct->getCartesian_rank() == 0)
diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index 5e7c3e2b..fa6bdd49 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -50,12 +50,18 @@ Grid3DCU::Grid3DCU(CollectiveIO * col, VirtualTopology3D * vct) {
   zEnd = zStart + (col->getLz() / (double) vct->getZLEN());
 
   // arrays allocation: nodes ---> the first node has index 1, the last has index nxn-2!
+  pfloat_node_xcoord = new pfloat[nxn];
+  pfloat_node_ycoord = new pfloat[nyn];
+  pfloat_node_zcoord = new pfloat[nzn];
   node_xcoord = new double[nxn];
   node_ycoord = new double[nyn];
   node_zcoord = new double[nzn];
   for (int i=0; i<nxn; i++) node_xcoord[i] = xStart + (i - 1) * dx;
   for (int j=0; j<nyn; j++) node_ycoord[j] = yStart + (j - 1) * dy;
   for (int k=0; k<nzn; k++) node_zcoord[k] = zStart + (k - 1) * dz;
+  for (int i=0; i<nxn; i++) pfloat_node_xcoord[i] = node_xcoord[i];
+  for (int j=0; j<nyn; j++) pfloat_node_ycoord[j] = node_ycoord[j];
+  for (int k=0; k<nzn; k++) pfloat_node_zcoord[k] = node_zcoord[k];
   // arrays allocation: cells ---> the first cell has index 1, the last has index ncn-2!
   center_xcoord = new double[nxc];
   center_ycoord = new double[nyc];
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 9ce5f215..38c62018 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -118,6 +118,8 @@ class EMfields3D                // :public Field
     /*! smooth the electric field */
     void smoothE(double value, VirtualTopology3D * vct, Collective *col);
 
+    /*! copy the field data to the array used to move the particles */
+    void set_fieldForPcls();
     /*! communicate ghost for grid -> Particles interpolation */
     void communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, VirtualTopology3D * vct);
     void sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
@@ -187,6 +189,7 @@ class EMfields3D                // :public Field
     double getBy(int X, int Y, int Z) const { return Byn.get(X,Y,Z);}
     double getBz(int X, int Y, int Z) const { return Bzn.get(X,Y,Z);}
     //
+    const_arr4_pfloat get_fieldForPcls() { return fieldForPcls; }
     arr3_double getEx() { return Ex; }
     arr3_double getEy() { return Ey; }
     arr3_double getEz() { return Ez; }
@@ -339,6 +342,11 @@ class EMfields3D                // :public Field
     /*! PHI: electric potential (indexX, indexY, indexZ), defined on central points between nodes */
     array3_double PHI;
 
+    // Electric field component used to move particles
+    // organized for rapid access in mover_PC()
+    // [This is the information transferred from cluster to booster].
+    array4_pfloat fieldForPcls;
+
     // Electric field components defined on nodes
     //
     array3_double Ex;
diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index 2754ec95..b7eb7a6b 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -7,8 +7,6 @@
 #ifndef GRID3DCU_H
 #define GRID3DCU_H
 
-#include <iostream>
-
 #include "Grid.h"
 #include "CollectiveIO.h"
 #include "ComInterpNodes3D.h"
@@ -142,6 +140,9 @@ class Grid3DCU                  // :public Grid
   /** invol = inverse of volume*/
   double invVOL;
   /** node coordinate */
+  pfloat *pfloat_node_xcoord;
+  pfloat *pfloat_node_ycoord;
+  pfloat *pfloat_node_zcoord;
   double *node_xcoord;
   double *node_ycoord;
   double *node_zcoord;
@@ -169,6 +170,9 @@ class Grid3DCU                  // :public Grid
   //const double &calcXN(int X) { return xStart+(X-1)*dx;}
   //const double &calcYN(int Y) { return yStart+(Y-1)*dy;}
   //const double &calcZN(int Z) { return zStart+(Z-1)*dz;}
+  const pfloat &get_pfloat_XN(int X) { return pfloat_node_xcoord[X];}
+  const pfloat &get_pfloat_YN(int Y) { return pfloat_node_ycoord[Y];}
+  const pfloat &get_pfloat_ZN(int Z) { return pfloat_node_zcoord[Z];}
   const double &getXN(int X) { return node_xcoord[X];}
   const double &getYN(int Y) { return node_ycoord[Y];}
   const double &getZN(int Z) { return node_zcoord[Z];}
diff --git a/include/Particles.h b/include/Particles.h
index f5420a75..be97ac28 100644
--- a/include/Particles.h
+++ b/include/Particles.h
@@ -21,10 +21,6 @@ developers: Stefano Markidis, Giovanni Lapenta
  *
  */
 
-// precision to use for particles
-//typedef float pfloat;
-typedef double pfloat;
-
 class Particles {
 public:
   /** allocate particles */
diff --git a/include/arraysfwd.h b/include/arraysfwd.h
index 9341a381..30bda425 100644
--- a/include/arraysfwd.h
+++ b/include/arraysfwd.h
@@ -1,6 +1,7 @@
 /* forward declaration for array classes */
 #ifndef arraysfwd_h
 #define arraysfwd_h
+#include "ipicdefs.h" // for pfloat
 
 namespace iPic3D
 {
@@ -41,6 +42,7 @@ namespace iPic3D
 //
 typedef iPic3D::const_array_ref3<double> const_arr3_double;
 typedef iPic3D::const_array_ref4<double> const_arr4_double;
+typedef iPic3D::const_array_ref4<pfloat> const_arr4_pfloat;
 typedef iPic3D::array_ref1<double> arr1_double;
 typedef iPic3D::array_ref2<double> arr2_double;
 typedef iPic3D::array_ref3<double> arr3_double;
@@ -49,13 +51,16 @@ typedef iPic3D::array1<double> array1_double;
 typedef iPic3D::array2<double> array2_double;
 typedef iPic3D::array3<double> array3_double;
 typedef iPic3D::array4<double> array4_double;
+typedef iPic3D::array4<pfloat> array4_pfloat;
 // This directive should be consistent with the directives in Alloc.h
 #if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
 typedef iPic3D::array_fetch1<double> arr1_double_fetch;
 typedef iPic3D::array_get1<double> arr1_double_get;
+typedef iPic3D::array_get1<pfloat> arr1_pfloat_get;
 #else
 typedef double* arr1_double_fetch;
 typedef double* arr1_double_get;
+typedef pfloat* arr1_pfloat_get;
 #endif
 
 #endif
diff --git a/include/ipicdefs.h b/include/ipicdefs.h
index ef86aaf1..1031f60c 100644
--- a/include/ipicdefs.h
+++ b/include/ipicdefs.h
@@ -10,4 +10,23 @@
 // use precprocessor to remove MPI_Barrier() calls.
 #define MPI_Barrier(args...)
 
+//#define SINGLE_PRECISION_PCLS
+//
+// single precision does not seem to help on the MIC
+#ifdef SINGLE_PRECISION_PCLS
+  typedef float pfloat;
+  #ifdef __MIC__
+    #define VECTOR_WIDTH 16
+  #else
+    #define VECTOR_WIDTH 8
+  #endif
+#else
+  #ifdef __MIC__
+    #define VECTOR_WIDTH 8
+  #else
+    #define VECTOR_WIDTH 4
+  #endif
+  typedef double pfloat;
+#endif
+
 #endif
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index d467dc8d..1cc76f78 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -218,9 +218,13 @@ bool c_Solver::ParticlesMover() {
   /*  -------------- */
 
   timeTasks.start(TimeTasks::PARTICLES);
+  // Should change this to add background guide field
+  EMf->set_fieldForPcls();
   for (int i = 0; i < ns; i++)  // move each species
   {
     // #pragma omp task inout(part[i]) in(grid) target_device(booster)
+    //
+    // should merely pass EMf->get_fieldForPcls() rather than EMf.
     mem_avail = part[i].mover_PC(grid, vct, EMf); // use the Predictor Corrector scheme 
   }
   timeTasks.end(TimeTasks::PARTICLES);
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 76e92f08..7bdf7162 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -316,32 +316,50 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
     cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
   }
   double start_mover_PC = MPI_Wtime();
+  #if 0
   const_arr3_double Ex = EMf->getEx();
   const_arr3_double Ey = EMf->getEy();
   const_arr3_double Ez = EMf->getEz();
   const_arr3_double Bx = EMf->getBx();
   const_arr3_double By = EMf->getBy();
   const_arr3_double Bz = EMf->getBz();
+  #endif
+  const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
 
-  const double dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
-  const double inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
+  #if 0
+  for(int i=0;i<nxn;i++)
+  for(int j=0;j<nyn;j++)
+  for(int k=0;k<nzn;k++)
+  {
+    assert_eq(fieldForPcls[i][j][k][0], (pfloat) Bx[i][j][k]);
+    assert_eq(fieldForPcls[i][j][k][1], (pfloat) By[i][j][k]);
+    assert_eq(fieldForPcls[i][j][k][2], (pfloat) Bz[i][j][k]);
+    assert_eq(fieldForPcls[i][j][k][3], (pfloat) Ex[i][j][k]);
+    assert_eq(fieldForPcls[i][j][k][4], (pfloat) Ey[i][j][k]);
+    assert_eq(fieldForPcls[i][j][k][5], (pfloat) Ez[i][j][k]);
+  }
+  #endif
+
+  const pfloat dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
+  const pfloat inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
   assert_le(nop,(long long) INT_MAX); // else would need to use long long
   // don't bother trying to push any particles simultaneously;
   // MIC already does vectorization automatically, and trying
   // to do it by hand only hurts performance.
-#pragma omp parallel for
-#pragma simd                    // this just slows things down (why?)
+  #pragma omp parallel for
+  // why does single precision make no difference in execution speed?
+  #pragma simd vectorlength(VECTOR_WIDTH)
   for (int rest = 0; rest < nop; rest++) {
     // copy the particle
-    pfloat xp = x[rest];
-    pfloat yp = y[rest];
-    pfloat zp = z[rest];
-    pfloat up = u[rest];
-    pfloat vp = v[rest];
-    pfloat wp = w[rest];
     const pfloat xptilde = x[rest];
     const pfloat yptilde = y[rest];
     const pfloat zptilde = z[rest];
+    pfloat xp = xptilde;
+    pfloat yp = yptilde;
+    pfloat zp = zptilde;
+    pfloat up = u[rest];
+    pfloat vp = v[rest];
+    pfloat wp = w[rest];
     pfloat uptilde;
     pfloat vptilde;
     pfloat wptilde;
@@ -370,12 +388,12 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       pfloat xi[2];
       pfloat eta[2];
       pfloat zeta[2];
-      xi[0]   = xp - grid->getXN(ix-1);
-      eta[0]  = yp - grid->getYN(iy-1);
-      zeta[0] = zp - grid->getZN(iz-1);
-      xi[1]   = grid->getXN(ix) - xp;
-      eta[1]  = grid->getYN(iy) - yp;
-      zeta[1] = grid->getZN(iz) - zp;
+      xi[0]   = xp - grid->get_pfloat_XN(ix-1);
+      eta[0]  = yp - grid->get_pfloat_YN(iy-1);
+      zeta[0] = zp - grid->get_pfloat_ZN(iz-1);
+      xi[1]   = grid->get_pfloat_XN(ix) - xp;
+      eta[1]  = grid->get_pfloat_YN(iy) - yp;
+      zeta[1] = grid->get_pfloat_ZN(iz) - zp;
 
       pfloat Exl = 0.0;
       pfloat Eyl = 0.0;
@@ -418,7 +436,128 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       const pfloat weight101 = xi[1] * eta[0] * zeta[1] * invVOL;
       const pfloat weight110 = xi[1] * eta[1] * zeta[0] * invVOL;
       const pfloat weight111 = xi[1] * eta[1] * zeta[1] * invVOL;
+
+      // creating these aliases seems to accelerate this method by about 30%
+      // on the Xeon host, processor, suggesting deficiency in the optimizer.
+      //
+      arr1_pfloat_get field000 = fieldForPcls[ix  ][iy  ][iz  ];
+      arr1_pfloat_get field001 = fieldForPcls[ix  ][iy  ][iz-1];
+      arr1_pfloat_get field010 = fieldForPcls[ix  ][iy-1][iz  ];
+      arr1_pfloat_get field011 = fieldForPcls[ix  ][iy-1][iz-1];
+      arr1_pfloat_get field100 = fieldForPcls[ix-1][iy  ][iz  ];
+      arr1_pfloat_get field101 = fieldForPcls[ix-1][iy  ][iz-1];
+      arr1_pfloat_get field110 = fieldForPcls[ix-1][iy-1][iz  ];
+      arr1_pfloat_get field111 = fieldForPcls[ix-1][iy-1][iz-1];
       // 
+      #if 0 // (takes same time as other order)
+      Bxl += weight000 * field000[0];
+      Bxl += weight001 * field001[0];
+      Bxl += weight010 * field010[0];
+      Bxl += weight011 * field011[0];
+      Bxl += weight100 * field100[0];
+      Bxl += weight101 * field101[0];
+      Bxl += weight110 * field110[0];
+      Bxl += weight111 * field111[0];
+      Byl += weight000 * field000[1];
+      Byl += weight001 * field001[1];
+      Byl += weight010 * field010[1];
+      Byl += weight011 * field011[1];
+      Byl += weight100 * field100[1];
+      Byl += weight101 * field101[1];
+      Byl += weight110 * field110[1];
+      Byl += weight111 * field111[1];
+      Bzl += weight000 * field000[2];
+      Bzl += weight001 * field001[2];
+      Bzl += weight010 * field010[2];
+      Bzl += weight011 * field011[2];
+      Bzl += weight100 * field100[2];
+      Bzl += weight101 * field101[2];
+      Bzl += weight110 * field110[2];
+      Bzl += weight111 * field111[2];
+      Exl += weight000 * field000[3];
+      Exl += weight001 * field001[3];
+      Exl += weight010 * field010[3];
+      Exl += weight011 * field011[3];
+      Exl += weight100 * field100[3];
+      Exl += weight101 * field101[3];
+      Exl += weight110 * field110[3];
+      Exl += weight111 * field111[3];
+      Eyl += weight000 * field000[4];
+      Eyl += weight001 * field001[4];
+      Eyl += weight010 * field010[4];
+      Eyl += weight011 * field011[4];
+      Eyl += weight100 * field100[4];
+      Eyl += weight101 * field101[4];
+      Eyl += weight110 * field110[4];
+      Eyl += weight111 * field111[4];
+      Ezl += weight000 * field000[5];
+      Ezl += weight001 * field001[5];
+      Ezl += weight010 * field010[5];
+      Ezl += weight011 * field011[5];
+      Ezl += weight100 * field100[5];
+      Ezl += weight101 * field101[5];
+      Ezl += weight110 * field110[5];
+      Ezl += weight111 * field111[5];
+      #endif
+
+      Bxl += weight000 * field000[0];
+      Byl += weight000 * field000[1];
+      Bzl += weight000 * field000[2];
+      Exl += weight000 * field000[3];
+      Eyl += weight000 * field000[4];
+      Ezl += weight000 * field000[5];
+
+      Bxl += weight001 * field001[0];
+      Byl += weight001 * field001[1];
+      Bzl += weight001 * field001[2];
+      Exl += weight001 * field001[3];
+      Eyl += weight001 * field001[4];
+      Ezl += weight001 * field001[5];
+
+      Bxl += weight010 * field010[0];
+      Byl += weight010 * field010[1];
+      Bzl += weight010 * field010[2];
+      Exl += weight010 * field010[3];
+      Eyl += weight010 * field010[4];
+      Ezl += weight010 * field010[5];
+
+      Bxl += weight011 * field011[0];
+      Byl += weight011 * field011[1];
+      Bzl += weight011 * field011[2];
+      Exl += weight011 * field011[3];
+      Eyl += weight011 * field011[4];
+      Ezl += weight011 * field011[5];
+
+      Bxl += weight100 * field100[0];
+      Byl += weight100 * field100[1];
+      Bzl += weight100 * field100[2];
+      Exl += weight100 * field100[3];
+      Eyl += weight100 * field100[4];
+      Ezl += weight100 * field100[5];
+
+      Bxl += weight101 * field101[0];
+      Byl += weight101 * field101[1];
+      Bzl += weight101 * field101[2];
+      Exl += weight101 * field101[3];
+      Eyl += weight101 * field101[4];
+      Ezl += weight101 * field101[5];
+
+      Bxl += weight110 * field110[0];
+      Byl += weight110 * field110[1];
+      Bzl += weight110 * field110[2];
+      Exl += weight110 * field110[3];
+      Eyl += weight110 * field110[4];
+      Ezl += weight110 * field110[5];
+
+      Bxl += weight111 * field111[0];
+      Byl += weight111 * field111[1];
+      Bzl += weight111 * field111[2];
+      Exl += weight111 * field111[3];
+      Eyl += weight111 * field111[4];
+      Ezl += weight111 * field111[5];
+
+      #if 0
+      Bxl += weight000 * Bx[ix][iy][iz];
       Bxl += weight000 * Bx[ix][iy][iz];
       Bxl += weight001 * Bx[ix][iy][iz - 1];
       Bxl += weight010 * Bx[ix][iy - 1][iz];
@@ -472,6 +611,7 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       Ezl += weight101 * Ez[ix - 1][iy][iz - 1];
       Ezl += weight110 * Ez[ix - 1][iy - 1][iz];
       Ezl += weight111 * Ez[ix - 1][iy - 1][iz - 1];
+      #endif
 
       // end interpolation
       const pfloat omdtsq = qomdt2 * qomdt2 * (Bxl * Bxl + Byl * Byl + Bzl * Bzl);

From 379a03070fb0bebf3dd447abb50238e296ed8e8c Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 8 Oct 2013 13:07:13 +0200
Subject: [PATCH 051/118] commented out #pragma simd directive: was only
 hurting performance

---
 particles/Particles3D.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 7bdf7162..356bf4a3 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -348,7 +348,7 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
   // to do it by hand only hurts performance.
   #pragma omp parallel for
   // why does single precision make no difference in execution speed?
-  #pragma simd vectorlength(VECTOR_WIDTH)
+  //#pragma simd vectorlength(VECTOR_WIDTH)
   for (int rest = 0; rest < nop; rest++) {
     // copy the particle
     const pfloat xptilde = x[rest];

From 93b043f50937a974b12d139efb882b066dddc5cf Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 8 Oct 2013 15:15:12 +0200
Subject: [PATCH 052/118] Plugged memory leak in c_Solver::WriteConserved

---
 main/iPic3Dlib.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 1cc76f78..58e3fd44 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -316,6 +316,7 @@ void c_Solver::WriteConserved(int cycle) {
         my_file << endl;
         my_file.close();
       }
+      delete [] VelocityDist;
     }
   }
 }

From 09ed53e97ae1de6f0233a25bd59b347a9c1b3fa5 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 8 Oct 2013 15:17:00 +0200
Subject: [PATCH 053/118] issue #53: use long long for pclId, not arr index

---
 fields/EMfields3D.cpp         |  2 -
 include/Collective.h          |  8 ++--
 include/Particles.h           | 22 +++++------
 include/Particles3Dcomm.h     | 42 ++++++++++----------
 include/iPic3D.h              |  2 +-
 inputoutput/Collective.cpp    | 15 ++++---
 main/iPic3Dlib.cpp            |  2 +-
 particles/Particles3D.cpp     | 73 +++++++++++++++++------------------
 particles/Particles3Dcomm.cpp | 69 +++++++++++++++++----------------
 9 files changed, 119 insertions(+), 116 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 9856e3a8..71872157 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -241,9 +241,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   double* pZZsn1d = &pZZsn[is][0][0][0];
   #endif
   //
-  const long long nop_ll = pcls.getNOP();
   const int nop = pcls.getNOP();
-  assert_le(nop_ll, (long long) INT_MAX); // else would need to use long long
   // To make memory use scale to a large number of threads, we
   // could first apply an efficient parallel sorting algorithm
   // to the particles and then accumulate moments in smaller
diff --git a/include/Collective.h b/include/Collective.h
index e4155bcf..415baa98 100644
--- a/include/Collective.h
+++ b/include/Collective.h
@@ -81,8 +81,8 @@ class Collective
     int getNpcelx(int nspecies)const{ return (npcelx[nspecies]); }
     int getNpcely(int nspecies)const{ return (npcely[nspecies]); }
     int getNpcelz(int nspecies)const{ return (npcelz[nspecies]); }
-    long getNp(int nspecies)const{ return (np[nspecies]); }
-    long getNpMax(int nspecies)const{ return (npMax[nspecies]); }
+    int getNp(int nspecies)const{ return (np[nspecies]); }
+    int getNpMax(int nspecies)const{ return (npMax[nspecies]); }
     double getNpMaxNpRatio()const{ return (NpMaxNpRatio); }
     double getQOM(int nspecies)const{ return (qom[nspecies]); }
     double getRHOinit(int nspecies)const{ return (rhoINIT[nspecies]); }
@@ -206,9 +206,9 @@ class Collective
     /*! number of particles per cell - Z direction */
     int *npcelz;
     /*! number of particles array for different species */
-    long *np;
+    int *np;
     /*! maximum number of particles array for different species */
-    long *npMax;
+    int *npMax;
     /*! max number of particles */
     double NpMaxNpRatio;
     /*! charge to mass ratio array for different species */
diff --git a/include/Particles.h b/include/Particles.h
index be97ac28..cca553c0 100644
--- a/include/Particles.h
+++ b/include/Particles.h
@@ -42,33 +42,33 @@ class Particles {
   /** get w (Z-velocity) array for all the particles */
   virtual double *getWall() const = 0;
   /** get ID array for all the particles */
-  virtual unsigned long *getParticleIDall() const = 0;
+  virtual long long *getParticleIDall() const = 0;
   /**get charge of particle array */
   virtual double *getQall() const = 0;
   /** get X-position of particle with label indexPart */
-  virtual double getX(long long indexPart) const = 0;
+  virtual double getX(int indexPart) const = 0;
   /** get Y-position of particle with label indexPart */
-  virtual double getY(long long indexPart) const = 0;
+  virtual double getY(int indexPart) const = 0;
   /** get Z-position of particle with label indexPart */
-  virtual double getZ(long long indexPart) const = 0;
+  virtual double getZ(int indexPart) const = 0;
   /** get u (X-velocity) of particle with label indexPart */
-  virtual double getU(long long indexPart) const = 0;
+  virtual double getU(int indexPart) const = 0;
   /** get v (Y-velocity) of particle with label indexPart */
-  virtual double getV(long long indexPart) const = 0;
+  virtual double getV(int indexPart) const = 0;
   /** get w (Z-velocity) of particle with label indexPart */
-  virtual double getW(long long indexPart) const = 0;
+  virtual double getW(int indexPart) const = 0;
   /** get ID of particle with label indexPart */
-  virtual unsigned long getParticleID(long long indexPart) const = 0;
+  virtual long long getParticleID(int indexPart) const = 0;
   /**get charge of particle with label indexPart */
-  virtual double getQ(long long indexPart) const = 0;
+  virtual double getQ(int indexPart) const = 0;
   /** get the number of particles of this subdomain */
-  virtual long long getNOP() const = 0;
+  virtual int getNOP() const = 0;
   /** return the Kinetic energy */
   virtual double getKe() = 0;
   /** return the maximum kinetic energy */
   virtual double getMaxVelocity() = 0;
   /** return energy distribution*/
-  virtual unsigned long *getVelocityDistribution(int nBins, double maxVel) = 0;
+  virtual long long *getVelocityDistribution(int nBins, double maxVel) = 0;
   /** retturn the momentum */
   virtual double getP() = 0;
   /** Print particles info: positions, velocities */
diff --git a/include/Particles3Dcomm.h b/include/Particles3Dcomm.h
index 29600957..1e646681 100644
--- a/include/Particles3Dcomm.h
+++ b/include/Particles3Dcomm.h
@@ -32,19 +32,19 @@ class Particles3Dcomm:public Particles {
   /** method for communicating exiting particles to X-RIGHT, X-LEFT, Y-RIGHT, Y-LEFT, Z-RIGHT, Z-LEFT processes */
   int communicate(VirtualTopology3D * ptVCT);
   /** put a particle exiting to X-LEFT in the bufferXLEFT for communication and check if you're sending the particle to the right subdomain*/
-  void bufferXleft(double *b_, long long np, VirtualTopology3D * vct);
+  void bufferXleft(double *b_, int np, VirtualTopology3D * vct);
   /** put a particle exiting to X-RIGHT in the bufferXRIGHT for communication and check if you're sending the particle to the right subdomain*/
-  void bufferXright(double *b_, long long np, VirtualTopology3D * vct);
+  void bufferXright(double *b_, int np, VirtualTopology3D * vct);
   /** put a particle exiting to Y-LEFT in the bufferYLEFT for communication and check if you're sending the particle to the right subdomain*/
-  void bufferYleft(double *b_, long long np, VirtualTopology3D * vct);
+  void bufferYleft(double *b_, int np, VirtualTopology3D * vct);
   /** put a particle exiting to Y-RIGHT in the bufferYRIGHT for communication and check if you're sending the particle to the right subdomain*/
-  void bufferYright(double *b_, long long np, VirtualTopology3D * vct);
+  void bufferYright(double *b_, int np, VirtualTopology3D * vct);
   /** put a particle exiting to Z-LEFT in the bufferZLEFT for communication and check if you're sending the particle to the right subdomain*/
-  void bufferZleft(double *b_, long long np, VirtualTopology3D * vct);
+  void bufferZleft(double *b_, int np, VirtualTopology3D * vct);
   /** put a particle exiting to Z-RIGHT in the bufferZRIGHT for communication and check if you're sending the particle to the right subdomain*/
-  void bufferZright(double *b_, long long np, VirtualTopology3D * vct);
+  void bufferZright(double *b_, int np, VirtualTopology3D * vct);
   /** Delete the a particle from a list(array) and pack the list(array) */
-  void del_pack(long long np, long long *nplast);
+  void del_pack(int np, int *nplast);
 
   /** method to debuild the buffer received */
   int unbuffer(double *b_);
@@ -70,33 +70,33 @@ class Particles3Dcomm:public Particles {
   /** get w (Z-velocity) array for all the particles */
   double *getWall() const;
   /** get the ID array   */
-  unsigned long *getParticleIDall() const;
+  long long *getParticleIDall() const;
   /** get X-position of particle with label indexPart */
-  double getX(long long indexPart) const;
+  double getX(int indexPart) const;
   /** get Y-position of particle with label indexPart */
-  double getY(long long indexPart) const;
+  double getY(int indexPart) const;
   /** get Z-position of particle with label indexPart */
-  double getZ(long long indexPart) const;
+  double getZ(int indexPart) const;
   /** get u (X-velocity) of particle with label indexPart */
-  double getU(long long indexPart) const;
+  double getU(int indexPart) const;
   /** get v (Y-velocity) of particle with label indexPart */
-  double getV(long long indexPart) const;
+  double getV(int indexPart) const;
   /** get w (Z-velocity) of particle with label indexPart */
-  double getW(long long indexPart) const;
+  double getW(int indexPart) const;
   /** get ID of particle with label indexPart */
-  unsigned long getParticleID(long long indexPart) const;
+  long long getParticleID(int indexPart) const;
   /**get charge of particle with label indexPart */
-  double getQ(long long indexPart) const;
+  double getQ(int indexPart) const;
   /** get charge of array for ID particles */
   double *getQall() const;
   /** get the number of particles of this subdomain */
-  long long getNOP() const;
+  int getNOP() const;
   /** return the Kinetic energy */
   double getKe();
   /** return the maximum kinetic energy */
   double getMaxVelocity();
   /** return energy distribution */
-  unsigned long *getVelocityDistribution(int nBins, double maxVel);
+  long long *getVelocityDistribution(int nBins, double maxVel);
   /** return the momentum */
   double getP();
   /** Print particles info: positions, velocities */
@@ -112,9 +112,9 @@ class Particles3Dcomm:public Particles {
   /** number of this species */
   int ns;
   /** maximum number of particles of this species on this domain. used for memory allocation */
-  long long npmax;
+  int npmax;
   /** number of particles of this species on this domain */
-  long long nop;
+  int nop;
   /** total number of particles */
   long long np_tot;
   /** number of particles per cell */
@@ -156,7 +156,7 @@ class Particles3Dcomm:public Particles {
   /** TrackParticleID */
   bool TrackParticleID;
   /** ParticleID */
-  unsigned long *ParticleID;
+  long long *ParticleID;
   /** rank of processor in which particle is created (for ID) */
   int BirthRank[2];
   /** number of variables to be stored in buffer for communication for each particle  */
diff --git a/include/iPic3D.h b/include/iPic3D.h
index 440fdf02..67a72e1a 100644
--- a/include/iPic3D.h
+++ b/include/iPic3D.h
@@ -55,7 +55,7 @@ namespace iPic3D {
     double        *Ke;
     double        *momentum;
     double        *Qremoved;
-    unsigned long *VelocityDist;
+    long long     *VelocityDist;
     Timing        *my_clock;
 
     PSK::OutputManager < PSK::OutputAdaptor > output_mgr; // Create an Output Manager
diff --git a/inputoutput/Collective.cpp b/inputoutput/Collective.cpp
index da721b76..708be195 100644
--- a/inputoutput/Collective.cpp
+++ b/inputoutput/Collective.cpp
@@ -2,6 +2,8 @@
 #include <mpi.h>
 #include "Collective.h"
 #include "debug.h"
+#include "limits.h" // for INT_MAX
+#include "asserts.h" // for assert_ge
 
 /*! Read the input file from text file and put the data in a collective wrapper: if it's a restart read from input file basic sim data and load particles and EM field from restart file */
 void Collective::ReadInput(string inputfile) {
@@ -28,6 +30,7 @@ void Collective::ReadInput(string inputfile) {
     RestartDirName = config.read < string > ("RestartDirName");
     ns = config.read < int >("ns");
     NpMaxNpRatio = config.read < double >("NpMaxNpRatio");
+    assert_ge(NpMaxNpRatio, 1.);
     // GEM Challenge 
     B0x = config.read <double>("B0x");
     B0y = config.read <double>("B0y");
@@ -576,18 +579,20 @@ Collective::Collective(int argc, char **argv) {
   /*! npcel = number of particles per cell */
   npcel = new int[ns];
   /*! np = number of particles of different species */
-  np = new long[ns];
+  np = new int[ns];
   /*! npMax = maximum number of particles of different species */
-  npMax = new long[ns];
+  npMax = new int[ns];
 
   for (int i = 0; i < ns; i++) {
     npcel[i] = npcelx[i] * npcely[i] * npcelz[i];
     np[i] = npcel[i] * nxc * nyc * nzc;
-    npMax[i] = (long) (NpMaxNpRatio * np[i]);
+    double npMaxi = (NpMaxNpRatio * np[i]);
+    // INT_MAX is about 2 billions, surely enough
+    // to index the particles in a single MPI process
+    assert_le(npMaxi, double(INT_MAX));
+    npMax[i] = (int) npMaxi;
   }
 
-
-
 }
 
 /*! destructor */
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 58e3fd44..7a0dd291 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -145,7 +145,7 @@ int c_Solver::Init(int argc, char **argv) {
   }
   // Distribution functions
   nDistributionBins = 1000;
-  VelocityDist = new unsigned long[nDistributionBins];
+  VelocityDist = new long long[nDistributionBins];
   ds = SaveDirName + "/DistributionFunctions.txt";
   if (myrank == 0) {
     ofstream my_file(ds.c_str());
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 356bf4a3..ecb1dab7 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -67,7 +67,7 @@ Particles3D::~Particles3D() {
 
 /** particles are uniformly distributed with zero velocity   */
 void Particles3D::uniform_background(Grid * grid, Field * EMf) {
-  long long counter = 0;
+  int counter = 0;
   for (int i = 1; i < grid->getNXC() - 1; i++)
     for (int j = 1; j < grid->getNYC() - 1; j++)
       for (int k = 1; k < grid->getNZC() - 1; k++)
@@ -82,7 +82,7 @@ void Particles3D::uniform_background(Grid * grid, Field * EMf) {
               w[counter] = 0.0;
               q[counter] = (qom / fabs(qom)) * (EMf->getRHOcs(i, j, k, ns) / npcel) * (1.0 / grid->getInvVOL());
               if (TrackParticleID)
-                ParticleID[counter] = counter * (unsigned long) pow(10.0, BirthRank[1]) + BirthRank[0];
+                ParticleID[counter] = counter * (long long) pow(10.0, BirthRank[1]) + BirthRank[0];
               counter++;
             }
 
@@ -100,15 +100,15 @@ void Particles3D::uniform_background(Grid * grid, Field * EMf) {
 void Particles3D::constantVelocity(double vel, int dim, Grid * grid, Field * EMf) {
   switch (dim) {
     case 0:
-      for (long long i = 0; i < nop; i++)
+      for (int i = 0; i < nop; i++)
         u[i] = vel, v[i] = 0.0, w[i] = 0.0;
       break;
     case 1:
-      for (register long long i = 0; i < nop; i++)
+      for (int i = 0; i < nop; i++)
         u[i] = 0.0, v[i] = vel, w[i] = 0.0;
       break;
     case 2:
-      for (register long long i = 0; i < nop; i++)
+      for (int i = 0; i < nop; i++)
         u[i] = 0.0, v[i] = 0.0, w[i] = vel;
       break;
 
@@ -137,7 +137,7 @@ void Particles3D::MaxwellianFromFluid(Grid* grid,Field* EMf,VirtualTopology3D* v
         MaxwellianFromFluidCell(grid,col,is, i,j,k,counter,x,y,z,q,u,v,w,ParticleID);
 }
 
-void Particles3D::MaxwellianFromFluidCell(Grid* grid, Collective *col, int is, int i, int j, int k, int &ip, double *x, double *y, double *z, double *q, double *vx, double *vy, double *vz, unsigned long* ParticleID)
+void Particles3D::MaxwellianFromFluidCell(Grid* grid, Collective *col, int is, int i, int j, int k, int &ip, double *x, double *y, double *z, double *q, double *vx, double *vy, double *vz, long long* ParticleID)
 {
   /*
    * grid           : local grid object (in)
@@ -179,7 +179,7 @@ void Particles3D::MaxwellianFromFluidCell(Grid* grid, Collective *col, int is, i
         theta = 2.0*M_PI*harvest;
         w[ip] = col->getFluidUz(i,j,k,is) + col->getFluidUthz(i,j,k,is)*prob*cos(theta);
         if (TrackParticleID)
-          ParticleID[ip]= ip*(unsigned long)pow(10.0,BirthRank[1])+BirthRank[0];
+          ParticleID[ip]= ip*(long long)pow(10.0,BirthRank[1])+BirthRank[0];
         ip++ ;
       }
 }
@@ -193,7 +193,7 @@ void Particles3D::maxwellian(Grid * grid, Field * EMf, VirtualTopology3D * vct)
 
   double harvest;
   double prob, theta, sign;
-  long long counter = 0;
+  int counter = 0;
   for (int i = 1; i < grid->getNXC() - 1; i++)
     for (int j = 1; j < grid->getNYC() - 1; j++)
       for (int k = 1; k < grid->getNZC() - 1; k++)
@@ -220,7 +220,7 @@ void Particles3D::maxwellian(Grid * grid, Field * EMf, VirtualTopology3D * vct)
               theta = 2.0 * M_PI * harvest;
               w[counter] = w0 + wth * prob * cos(theta);
               if (TrackParticleID)
-                ParticleID[counter] = counter * (unsigned long) pow(10.0, BirthRank[1]) + BirthRank[0];
+                ParticleID[counter] = counter * (long long) pow(10.0, BirthRank[1]) + BirthRank[0];
 
 
               counter++;
@@ -234,7 +234,7 @@ void Particles3D::force_free(Grid * grid, Field * EMf, VirtualTopology3D * vct)
 
 
   double harvest, prob, theta;
-  long long counter = 0;
+  int counter = 0;
   double shaperx, shapery, shaperz;
   double flvx = 1.0, flvy = 1.0, flvz = 1.0;
 
@@ -281,7 +281,7 @@ void Particles3D::force_free(Grid * grid, Field * EMf, VirtualTopology3D * vct)
                 w[counter] = flvz + wth * prob * cos(theta);
               }
               if (TrackParticleID)
-                ParticleID[counter] = counter * (unsigned long) pow(10.0, BirthRank[1]) + BirthRank[0];
+                ParticleID[counter] = counter * (long long) pow(10.0, BirthRank[1]) + BirthRank[0];
 
               counter++;
             }
@@ -297,7 +297,7 @@ void Particles3D::AddPerturbationJ(double deltaBoB, double kx, double ky, double
   jx_mod *= alpha;
   jy_mod *= alpha;
   jz_mod *= alpha;
-  for (register long long i = 0; i < nop; i++) {
+  for (int i = 0; i < nop; i++) {
     u[i] += jx_mod / q[i] / npcel / invVOL * cos(kx * x[i] + ky * y[i] + jx_phase);
     v[i] += jy_mod / q[i] / npcel / invVOL * cos(kx * x[i] + ky * y[i] + jy_phase);
     w[i] += jz_mod / q[i] / npcel / invVOL * cos(kx * x[i] + ky * y[i] + jz_phase);
@@ -342,7 +342,6 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
 
   const pfloat dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
   const pfloat inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
-  assert_le(nop,(long long) INT_MAX); // else would need to use long long
   // don't bother trying to push any particles simultaneously;
   // MIC already does vectorization automatically, and trying
   // to do it by hand only hurts performance.
@@ -677,15 +676,15 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
   }
   double  FourPI =16*atan(1.0);
   int avail;
-  long long store_nop=nop;
+  int store_nop=nop;
 
   ////////////////////////
   // INJECTION FROM XLEFT
   ////////////////////////
   srand (vct->getCartesian_rank()+1+ns+(int(MPI_Wtime()))%10000);
   if (vct->getXleft_neighbor() == MPI_PROC_NULL && bcPfaceXleft == 2){ // use Field topology in this case
-    long long particles_index=0;
-    long long nplast = nop-1;
+    int particles_index=0;
+    int nplast = nop-1;
 
     while (particles_index < nplast+1) {
       if (x[particles_index] < 3.0*dx ) {
@@ -729,7 +728,7 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
                 theta = 2.0*M_PI*harvest;
                 w[particles_index] = w0 + wth*prob*cos(theta);
                 if (TrackParticleID)
-                  ParticleID[particles_index]= particles_index*(unsigned long)pow(10.0,BirthRank[1])+BirthRank[0];
+                  ParticleID[particles_index]= particles_index*(long long)pow(10.0,BirthRank[1])+BirthRank[0];
 
 
                 particles_index++ ;
@@ -745,8 +744,8 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
   srand (vct->getCartesian_rank()+1+ns+(int(MPI_Wtime()))%10000);
   if (vct->getYleft_neighbor() == MPI_PROC_NULL  && bcPfaceYleft == 2)
   {
-    long long particles_index=0;
-    long long nplast = nop-1;
+    int particles_index=0;
+    int nplast = nop-1;
     while (particles_index < nplast+1) {
       if (y[particles_index] < 3.0*dy ) {
         del_pack(particles_index,&nplast);
@@ -788,7 +787,7 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
                 theta = 2.0*M_PI*harvest;
                 w[particles_index] = w0 + wth*prob*cos(theta);
                 if (TrackParticleID)
-                  ParticleID[particles_index]= particles_index*(unsigned long)pow(10.0,BirthRank[1])+BirthRank[0];
+                  ParticleID[particles_index]= particles_index*(long long)pow(10.0,BirthRank[1])+BirthRank[0];
 
                 particles_index++ ;
               }
@@ -801,8 +800,8 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
   srand (vct->getCartesian_rank()+1+ns+(int(MPI_Wtime()))%10000);
   if (vct->getZleft_neighbor() == MPI_PROC_NULL  && bcPfaceZleft == 2)
   {
-    long long particles_index=0;
-    long long nplast = nop-1;
+    int particles_index=0;
+    int nplast = nop-1;
     while (particles_index < nplast+1) {
       if (z[particles_index] < 3.0*dz ) {
         del_pack(particles_index,&nplast);
@@ -844,7 +843,7 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
                 theta = 2.0*M_PI*harvest;
                 w[particles_index] = w0 + wth*prob*cos(theta);
                 if (TrackParticleID)
-                  ParticleID[particles_index]= particles_index*(unsigned long)pow(10.0,BirthRank[1])+BirthRank[0];
+                  ParticleID[particles_index]= particles_index*(long long)pow(10.0,BirthRank[1])+BirthRank[0];
 
                 particles_index++ ;
               }
@@ -856,8 +855,8 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
   ////////////////////////
   srand (vct->getCartesian_rank()+1+ns+(int(MPI_Wtime()))%10000);
   if (vct->getXright_neighbor() == MPI_PROC_NULL  && bcPfaceXright == 2){
-    long long particles_index=0;
-    long long nplast = nop-1;
+    int particles_index=0;
+    int nplast = nop-1;
     while (particles_index < nplast+1) {
       if (x[particles_index] > (Lx-3.0*dx) ) {
         del_pack(particles_index,&nplast);
@@ -899,7 +898,7 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
                 theta = 2.0*M_PI*harvest;
                 w[particles_index] = w0 + wth*prob*cos(theta);
                 if (TrackParticleID)
-                  ParticleID[particles_index]= particles_index*(unsigned long)pow(10.0,BirthRank[1])+BirthRank[0];
+                  ParticleID[particles_index]= particles_index*(long long)pow(10.0,BirthRank[1])+BirthRank[0];
 
                 particles_index++ ;
               }
@@ -912,8 +911,8 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
   srand (vct->getCartesian_rank()+1+ns+(int(MPI_Wtime()))%10000);
   if (vct->getYright_neighbor() == MPI_PROC_NULL  && bcPfaceYright == 2)
   {
-    long long particles_index=0;
-    long long nplast = nop-1;
+    int particles_index=0;
+    int nplast = nop-1;
     while (particles_index < nplast+1) {
       if (y[particles_index] > (Ly-3.0*dy) ) {
         del_pack(particles_index,&nplast);
@@ -955,7 +954,7 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
                 theta = 2.0*M_PI*harvest;
                 w[particles_index] = w0 + wth*prob*cos(theta);
                 if (TrackParticleID)
-                  ParticleID[particles_index]= particles_index*(unsigned long)pow(10.0,BirthRank[1])+BirthRank[0];
+                  ParticleID[particles_index]= particles_index*(long long)pow(10.0,BirthRank[1])+BirthRank[0];
 
                 particles_index++ ;
               }
@@ -968,8 +967,8 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
   srand (vct->getCartesian_rank()+1+ns+(int(MPI_Wtime()))%10000);
   if (vct->getZright_neighbor() == MPI_PROC_NULL  && bcPfaceZright == 2)
   {
-    long long particles_index=0;
-    long long nplast = nop-1;
+    int particles_index=0;
+    int nplast = nop-1;
     while (particles_index < nplast+1) {
       if (z[particles_index] > (Lz-3.0*dz) ) {
         del_pack(particles_index,&nplast);
@@ -1011,7 +1010,7 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
                 theta = 2.0*M_PI*harvest;
                 w[particles_index] = w0 + wth*prob*cos(theta);
                 if (TrackParticleID)
-                  ParticleID[particles_index]= particles_index*(unsigned long)pow(10.0,BirthRank[1])+BirthRank[0];
+                  ParticleID[particles_index]= particles_index*(long long)pow(10.0,BirthRank[1])+BirthRank[0];
 
                 particles_index++ ;
               }
@@ -1046,7 +1045,7 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
 void Particles3D::linear_perturbation(double deltaBoB, double kx, double ky, double angle, double omega_r, double omega_i, double Ex_mod, double Ex_phase, double Ey_mod, double Ey_phase, double Ez_mod, double Ez_phase, double Bx_mod, double Bx_phase, double By_mod, double By_phase, double Bz_mod, double Bz_phase, Grid * grid, Field * EMf, VirtualTopology3D * vct) {
 
   double value1 = 0.0, value2 = 0.0, max_value = 0.0, min_value = 0.0, phi, n;
-  long long counter = 0, total_generated = 0;
+  int counter = 0, total_generated = 0;
   bool rejected;
   double harvest, prob, theta;
   // rescaling of amplitudes according to deltaBoB //
@@ -1125,7 +1124,7 @@ void Particles3D::linear_perturbation(double deltaBoB, double kx, double ky, dou
 
           }
           if (TrackParticleID)
-            ParticleID[counter] = counter * (unsigned long) pow(10.0, BirthRank[1]) + BirthRank[0];
+            ParticleID[counter] = counter * (long long) pow(10.0, BirthRank[1]) + BirthRank[0];
           counter++;
         }
   nop = counter + 1;
@@ -1216,7 +1215,7 @@ double Particles3D::f0(double vpar, double vperp) {
 
 void Particles3D::RotatePlaneXY(double theta) {
   double temp, temp2;
-  for (register long long s = 0; s < nop; s++) {
+  for (int s = 0; s < nop; s++) {
     temp = u[s];
     temp2 = v[s];
     u[s] = temp * cos(theta) + v[s] * sin(theta);
@@ -1227,8 +1226,8 @@ void Particles3D::RotatePlaneXY(double theta) {
 /*! Delete the particles inside the sphere with radius R and center x_center y_center and return the total charge removed */
 double Particles3D::deleteParticlesInsideSphere(double R, double x_center, double y_center, double z_center){
 
-  long long np_current = 0;
-  long long nplast     = nop-1;
+  int np_current = 0;
+  int nplast     = nop-1;
 
   while (np_current < nplast+1){
 
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 1120de1c..f1c0c5c5 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -146,7 +146,7 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   q = new double[npmax];
   // ID
   if (TrackParticleID) {
-    ParticleID = new unsigned long[npmax];
+    ParticleID = new long long[npmax];
     BirthRank[0] = vct->getCartesian_rank();
     if (vct->getNprocs() > 1)
       BirthRank[1] = (int) ceil(log10((double) (vct->getNprocs())));  // Number of digits needed for # of process in ID
@@ -154,7 +154,7 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
       BirthRank[1] = 1;
     if (BirthRank[1] + (int) ceil(log10((double) (npmax))) > 10 && BirthRank[0] == 0) {
       cerr << "Error: can't Track particles in Particles3Dcomm::allocate" << endl;
-      cerr << "Unsigned long 'ParticleID' cannot store all the particles" << endl;
+      cerr << "long long 'ParticleID' cannot store all the particles" << endl;
       return;
     }
   }
@@ -271,8 +271,8 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
       if (dataset_id > 0)
-        status = H5Dread(dataset_id, H5T_NATIVE_ULONG, H5S_ALL, H5S_ALL, H5P_DEFAULT, ParticleID);
+        status = H5Dread(dataset_id, H5T_NATIVE_LLONG, H5S_ALL, H5S_ALL, H5P_DEFAULT, ParticleID);
       else {
-        for (register long long counter = 0; counter < nop; counter++)
-          ParticleID[counter] = counter * (unsigned long) pow(10.0, BirthRank[1]) + BirthRank[0];
+        for (int counter = 0; counter < nop; counter++)
+          ParticleID[counter] = counter * (long long) pow(10.0, BirthRank[1]) + BirthRank[0];
       }
     }
     // close the hdf file
@@ -291,7 +291,7 @@ void Particles3Dcomm::interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vc
   const double nxn = grid->getNXN();
   const double nyn = grid->getNYN();
   const double nzn = grid->getNZN();
-  assert_le(nop,(long long)INT_MAX); // else would need to use long long
+  // assert_le(nop,(long long)INT_MAX); // else would need to use long long
   // to make memory use scale to a large number of threads we
   // could first apply an efficient parallel sorting algorithm
   // to the particles and then accumulate moments in smaller
@@ -395,7 +395,7 @@ int Particles3Dcomm::communicate(VirtualTopology3D * ptVCT) {
     b_Z_LEFT[i] = MIN_VAL;
   }
   npExitXright = 0, npExitXleft = 0, npExitYright = 0, npExitYleft = 0, npExitZright = 0, npExitZleft = 0, npExit = 0, rightDomain = 0;
-  long long np_current = 0, nplast = nop - 1;
+  int np_current = 0, nplast = nop - 1;
 
   while (np_current < nplast+1){
 
@@ -647,7 +647,7 @@ void Particles3Dcomm::resize_buffers(int new_buffer_size) {
   buffer_size = new_buffer_size;
 }
 /** put a particle exiting to X-LEFT in the bufferXLEFT for communication and check if you're sending the particle to the right subdomain*/
-void Particles3Dcomm::bufferXleft(double *b_, long long np_current, VirtualTopology3D * vct) {
+void Particles3Dcomm::bufferXleft(double *b_, int np_current, VirtualTopology3D * vct) {
   if (x[np_current] < 0)
     b_[npExitXleft * nVar] = x[np_current] + Lx;  // this applies to the the leftmost processor
   else
@@ -662,7 +662,7 @@ void Particles3Dcomm::bufferXleft(double *b_, long long np_current, VirtualTopol
     b_[npExitXleft * nVar + 7] = ParticleID[np_current];
 }
 /** put a particle exiting to X-RIGHT in the bufferXRIGHT for communication and check if you're sending the particle to the right subdomain*/
-void Particles3Dcomm::bufferXright(double *b_, long long np_current, VirtualTopology3D * vct) {
+void Particles3Dcomm::bufferXright(double *b_, int np_current, VirtualTopology3D * vct) {
   if (x[np_current] > Lx)
     b_[npExitXright * nVar] = x[np_current] - Lx; // this applies to the right most processor
   else
@@ -677,7 +677,7 @@ void Particles3Dcomm::bufferXright(double *b_, long long np_current, VirtualTopo
     b_[npExitXright * nVar + 7] = ParticleID[np_current];
 }
 /** put a particle exiting to Y-LEFT in the bufferYLEFT for communication and check if you're sending the particle to the right subdomain*/
-inline void Particles3Dcomm::bufferYleft(double *b_, long long np_current, VirtualTopology3D * vct) {
+inline void Particles3Dcomm::bufferYleft(double *b_, int np_current, VirtualTopology3D * vct) {
   b_[npExitYleft * nVar] = x[np_current];
   if (y[np_current] < 0)
     b_[npExitYleft * nVar + 1] = y[np_current] + Ly;
@@ -692,7 +692,7 @@ inline void Particles3Dcomm::bufferYleft(double *b_, long long np_current, Virtu
     b_[npExitYleft * nVar + 7] = ParticleID[np_current];
 }
 /** put a particle exiting to Y-RIGHT in the bufferYRIGHT for communication and check if you're sending the particle to the right subdomain*/
-inline void Particles3Dcomm::bufferYright(double *b_, long long np_current, VirtualTopology3D * vct) {
+inline void Particles3Dcomm::bufferYright(double *b_, int np_current, VirtualTopology3D * vct) {
   b_[npExitYright * nVar] = x[np_current];
   if (y[np_current] > Ly)
     b_[npExitYright * nVar + 1] = y[np_current] - Ly;
@@ -707,7 +707,7 @@ inline void Particles3Dcomm::bufferYright(double *b_, long long np_current, Virt
     b_[npExitYright * nVar + 7] = ParticleID[np_current];
 }
 /** put a particle exiting to Z-LEFT in the bufferZLEFT for communication and check if you're sending the particle to the right subdomain*/
-inline void Particles3Dcomm::bufferZleft(double *b_, long long np_current, VirtualTopology3D * vct) {
+inline void Particles3Dcomm::bufferZleft(double *b_, int np_current, VirtualTopology3D * vct) {
   b_[npExitZleft * nVar] = x[np_current];
   b_[npExitZleft * nVar + 1] = y[np_current];
   if (z[np_current] < 0)
@@ -722,7 +722,7 @@ inline void Particles3Dcomm::bufferZleft(double *b_, long long np_current, Virtu
     b_[npExitZleft * nVar + 7] = ParticleID[np_current];
 }
 /** put a particle exiting to Z-RIGHT in the bufferZRIGHT for communication and check if you're sending the particle to the right subdomain*/
-inline void Particles3Dcomm::bufferZright(double *b_, long long np_current, VirtualTopology3D * vct) {
+inline void Particles3Dcomm::bufferZright(double *b_, int np_current, VirtualTopology3D * vct) {
   b_[npExitZright * nVar] = x[np_current];
   b_[npExitZright * nVar + 1] = y[np_current];
   if (z[np_current] > Lz)
@@ -738,7 +738,7 @@ inline void Particles3Dcomm::bufferZright(double *b_, long long np_current, Virt
 }
 /** This unbuffer the last communication */
 int Particles3Dcomm::unbuffer(double *b_) {
-  long long np_current = 0;
+  int np_current = 0;
   // put the new particles at the end of the array, and update the number of particles
   while (b_[np_current * nVar] != MIN_VAL) {
     x[nop] = b_[nVar * np_current];
@@ -749,7 +749,7 @@ int Particles3Dcomm::unbuffer(double *b_) {
     w[nop] = b_[nVar * np_current + 5];
     q[nop] = b_[nVar * np_current + 6];
     if (TrackParticleID)
-      ParticleID[nop] = (unsigned long) b_[nVar * np_current + 7];
+      ParticleID[nop] = (long long) b_[nVar * np_current + 7];
     np_current++;
     // these particles need further communication
     if (x[nop] < xstart || x[nop] > xend || y[nop] < ystart || y[nop] > yend || z[nop] < zstart || z[nop] > zend)
@@ -770,7 +770,7 @@ int Particles3Dcomm::unbuffer(double *b_) {
  * @param np = the index of the particle that must be deleted
  * @param nplast = the index of the last particle in the array
  */
-void Particles3Dcomm::del_pack(long long np_current, long long *nplast) {
+void Particles3Dcomm::del_pack(int np_current, int *nplast) {
   x[np_current] = x[*nplast];
   y[np_current] = y[*nplast];
   z[np_current] = z[*nplast];
@@ -834,7 +834,7 @@ double *Particles3Dcomm::getWall()  const {
   return (w);
 }
 /**get ID of particle with label indexPart */
-unsigned long *Particles3Dcomm::getParticleIDall()  const {
+long long *Particles3Dcomm::getParticleIDall()  const {
   return (ParticleID);
 }
 /**get charge of particle with label indexPart */
@@ -842,45 +842,46 @@ double *Particles3Dcomm::getQall()  const {
   return (q);
 }
 /** return X-coordinate of particle with index indexPart */
-double Particles3Dcomm::getX(long long indexPart)  const {
+double Particles3Dcomm::getX(int indexPart)  const {
   return (x[indexPart]);
 }
 /** return Y-coordinate  of particle with index indexPart */
-double Particles3Dcomm::getY(long long indexPart)  const {
+double Particles3Dcomm::getY(int indexPart)  const {
   return (y[indexPart]);
 }
 /** return Y-coordinate  of particle with index indexPart */
-double Particles3Dcomm::getZ(long long indexPart)  const {
+double Particles3Dcomm::getZ(int indexPart)  const {
   return (z[indexPart]);
 }
 /** get u (X-velocity) of particle with label indexPart */
-double Particles3Dcomm::getU(long long indexPart)  const {
+double Particles3Dcomm::getU(int indexPart)  const {
   return (u[indexPart]);
 }
 /** get v (Y-velocity) of particle with label indexPart */
-double Particles3Dcomm::getV(long long indexPart)  const {
+double Particles3Dcomm::getV(int indexPart)  const {
   return (v[indexPart]);
 }
 /**get w (Z-velocity) of particle with label indexPart */
-double Particles3Dcomm::getW(long long indexPart)  const {
+double Particles3Dcomm::getW(int indexPart)  const {
   return (w[indexPart]);
 }
 /**get ID of particle with label indexPart */
-unsigned long Particles3Dcomm::getParticleID(long long indexPart)  const {
+long long Particles3Dcomm::getParticleID(int indexPart)  const {
   return (ParticleID[indexPart]);
 }
 /**get charge of particle with label indexPart */
-double Particles3Dcomm::getQ(long long indexPart)  const {
+double Particles3Dcomm::getQ(int indexPart)  const {
   return (q[indexPart]);
 }
-/** return the number of particles */ long long Particles3Dcomm::getNOP()  const {
+/** return the number of particles */
+int Particles3Dcomm::getNOP()  const {
   return (nop);
 }
 /** return the Kinetic energy */
 double Particles3Dcomm::getKe() {
   double localKe = 0.0;
   double totalKe = 0.0;
-  for (register long long i = 0; i < nop; i++)
+  for (register int i = 0; i < nop; i++)
     localKe += .5 * (q[i] / qom) * (u[i] * u[i] + v[i] * v[i] + w[i] * w[i]);
   MPI_Allreduce(&localKe, &totalKe, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   return (totalKe);
@@ -889,7 +890,7 @@ double Particles3Dcomm::getKe() {
 double Particles3Dcomm::getP() {
   double localP = 0.0;
   double totalP = 0.0;
-  for (register long long i = 0; i < nop; i++)
+  for (register int i = 0; i < nop; i++)
     localP += (q[i] / qom) * sqrt(u[i] * u[i] + v[i] * v[i] + w[i] * w[i]);
   MPI_Allreduce(&localP, &totalP, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   return (totalP);
@@ -899,7 +900,7 @@ double Particles3Dcomm::getP() {
 double Particles3Dcomm::getMaxVelocity() {
   double localVel = 0.0;
   double maxVel = 0.0;
-  for (long long i = 0; i < nop; i++)
+  for (int i = 0; i < nop; i++)
     localVel = max(localVel, sqrt(u[i] * u[i] + v[i] * v[i] + w[i] * w[i]));
   MPI_Allreduce(&localVel, &maxVel, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
   return (maxVel);
@@ -907,14 +908,14 @@ double Particles3Dcomm::getMaxVelocity() {
 
 
 /** get energy spectrum */
-unsigned long *Particles3Dcomm::getVelocityDistribution(int nBins, double maxVel) {
-  unsigned long *f = new unsigned long[nBins];
+long long *Particles3Dcomm::getVelocityDistribution(int nBins, double maxVel) {
+  long long *f = new long long[nBins];
   for (int i = 0; i < nBins; i++)
     f[i] = 0;
   double Vel = 0.0;
   double dv = maxVel / nBins;
   int bin = 0;
-  for (long long i = 0; i < nop; i++) {
+  for (int i = 0; i < nop; i++) {
     Vel = sqrt(u[i] * u[i] + v[i] * v[i] + w[i] * w[i]);
     bin = int (floor(Vel / dv));
     if (bin >= nBins)
@@ -922,8 +923,8 @@ unsigned long *Particles3Dcomm::getVelocityDistribution(int nBins, double maxVel
     else
       f[bin] += 1;
   }
-  unsigned long localN = 0;
-  unsigned long totalN = 0;
+  long long localN = 0;
+  long long totalN = 0;
   for (int i = 0; i < nBins; i++) {
     localN = f[i];
     MPI_Allreduce(&localN, &totalN, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
@@ -942,7 +943,7 @@ void Particles3Dcomm::Print(VirtualTopology3D * ptVCT) const {
   cout << "Yin = " << ystart << "; Yfin = " << yend << endl;
   cout << "Zin = " << zstart << "; Zfin = " << zend << endl;
   cout << "Number of species = " << ns << endl;
-  for (long long i = 0; i < nop; i++)
+  for (int i = 0; i < nop; i++)
     cout << "Particles #" << i << " x=" << x[i] << " y=" << y[i] << " z=" << z[i] << " u=" << u[i] << " v=" << v[i] << " w=" << w[i] << endl;
   cout << endl;
 }

From e4ba3d54c607cb915f86e751599651f091766041 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 8 Oct 2013 15:31:41 +0200
Subject: [PATCH 054/118] made VelocityDist a local variable (follow-up to
 93b043f5093)

---
 include/iPic3D.h   | 1 -
 main/iPic3Dlib.cpp | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/iPic3D.h b/include/iPic3D.h
index 67a72e1a..9a79c131 100644
--- a/include/iPic3D.h
+++ b/include/iPic3D.h
@@ -55,7 +55,6 @@ namespace iPic3D {
     double        *Ke;
     double        *momentum;
     double        *Qremoved;
-    long long     *VelocityDist;
     Timing        *my_clock;
 
     PSK::OutputManager < PSK::OutputAdaptor > output_mgr; // Create an Output Manager
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 7a0dd291..96216d78 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -145,7 +145,6 @@ int c_Solver::Init(int argc, char **argv) {
   }
   // Distribution functions
   nDistributionBins = 1000;
-  long long *VelocityDist = new long long[nDistributionBins];
   ds = SaveDirName + "/DistributionFunctions.txt";
   if (myrank == 0) {
     ofstream my_file(ds.c_str());
@@ -307,7 +306,7 @@ void c_Solver::WriteConserved(int cycle) {
     // Velocity distribution
     for (int is = 0; is < ns; is++) {
       double maxVel = part[is].getMaxVelocity();
-      VelocityDist = part[is].getVelocityDistribution(nDistributionBins, maxVel);
+      long long *VelocityDist = part[is].getVelocityDistribution(nDistributionBins, maxVel);
       if (myrank == 0) {
         ofstream my_file(ds.c_str(), fstream::app);
         my_file << cycle << "\t" << is << "\t" << maxVel;

From 3522db61d3d96d639df3c41417f0e025e938c55e Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 8 Oct 2013 17:29:55 +0200
Subject: [PATCH 055/118] corrected MPI_UNSIGNED_LONG_LONG to MPI_LONG_LONG in
 getVelocityDistribution

---
 particles/Particles3Dcomm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index f1c0c5c5..13a443c0 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -927,7 +927,7 @@ long long *Particles3Dcomm::getVelocityDistribution(int nBins, double maxVel) {
   long long totalN = 0;
   for (int i = 0; i < nBins; i++) {
     localN = f[i];
-    MPI_Allreduce(&localN, &totalN, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&localN, &totalN, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
     f[i] = totalN;
   }
   return f;

From 90334a2f978d01cb0d16a9a69e6530aa518587bb Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 10 Oct 2013 12:17:27 +0200
Subject: [PATCH 056/118] issue #54: TimeTasks now supports non-exclusive tasks
 and

---
 communication/ComNodes3D.cpp |  50 +++-----
 fields/EMfields3D.cpp        |  10 +-
 iPic3D.cpp                   |   7 ++
 include/ComNodes3D.h         |   3 -
 include/TimeTasks.h          | 182 ++++++++++++++++++++++++-----
 include/iPic3D.h             |   4 +-
 main/iPic3Dlib.cpp           |  60 +++++-----
 particles/Particles3D.cpp    |   3 +-
 utility/TimeTasks.cpp        | 220 +++++++++++++++++++++--------------
 9 files changed, 343 insertions(+), 196 deletions(-)

diff --git a/communication/ComNodes3D.cpp b/communication/ComNodes3D.cpp
index 0d97ce89..6b0bb169 100644
--- a/communication/ComNodes3D.cpp
+++ b/communication/ComNodes3D.cpp
@@ -7,7 +7,7 @@
 
 /** communicate ghost cells (FOR NODES) */
 void communicateNode(int nx, int ny, int nz, arr3_double _vector, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ***vector=_vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
@@ -106,12 +106,10 @@ void communicateNode(int nx, int ny, int nz, arr3_double _vector, VirtualTopolog
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-
-  timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR NODES) */
 void communicateNodeBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ***vector = _vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -212,12 +210,10 @@ void communicateNodeBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXr
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-
-  timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR NODES) with particles BC*/
 void communicateNodeBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -318,13 +314,11 @@ void communicateNodeBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFace
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-
-  timeTasks.addto_communicate();
 }
 
 /** SPECIES: communicate ghost cells */
 void communicateNode(int nx, int ny, int nz, arr4_double _vector, int ns, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ****vector = _vector.fetch_arr4();
 
   // allocate 6 ghost cell Faces
@@ -422,14 +416,12 @@ void communicateNode(int nx, int ny, int nz, arr4_double _vector, int ns, Virtua
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-
-  timeTasks.addto_communicate();
-}                               // 
+}
 
 // PARTICLES
 /** SPECIES: communicate ghost cells */
 void communicateNode_P(int nx, int ny, int nz, arr4_double _vector, int ns, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ****vector = _vector.fetch_arr4();
 
   // allocate 6 ghost cell Faces
@@ -527,14 +519,12 @@ void communicateNode_P(int nx, int ny, int nz, arr4_double _vector, int ns, Virt
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-
-  timeTasks.addto_communicate();
 }
 
 // 
 /** communicate ghost cells (FOR CENTERS) */
 void communicateCenter(int nx, int ny, int nz, arr3_double _vector, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ***vector = _vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
@@ -631,12 +621,10 @@ void communicateCenter(int nx, int ny, int nz, arr3_double _vector, VirtualTopol
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-
-  timeTasks.addto_communicate();
 }
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
 void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -664,12 +652,11 @@ void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double _vector,
   delete[]ghostYleftFace;
   delete[]ghostZrightFace;
   delete[]ghostZleftFace;
-  timeTasks.addto_communicate();
 }
 // particles
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
 void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -697,14 +684,13 @@ void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double _vector
   delete[]ghostYleftFace;
   delete[]ghostZrightFace;
   delete[]ghostZleftFace;
-  timeTasks.addto_communicate();
 }
 
 // 
 
 
 void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -732,11 +718,10 @@ void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double _vector, in
   delete[]ghostYleftFace;
   delete[]ghostZrightFace;
   delete[]ghostZleftFace;
-  timeTasks.addto_communicate();
 }
 
 void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -764,14 +749,13 @@ void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double _vector,
   delete[]ghostYleftFace;
   delete[]ghostZrightFace;
   delete[]ghostZleftFace;
-  timeTasks.addto_communicate();
 }
 
 
 
 /** SPECIES: communicate ghost cells */
 void communicateCenter(int nx, int ny, int nz, arr4_double _vector, int ns, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ****vector=_vector.fetch_arr4();
 
   // allocate 6 ghost cell Faces
@@ -867,12 +851,10 @@ void communicateCenter(int nx, int ny, int nz, arr4_double _vector, int ns, Virt
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-
-  timeTasks.addto_communicate();
 }
 // /////////// communication + BC ////////////////////////////
 void communicateCenterBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ***vector=_vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
@@ -971,12 +953,10 @@ void communicateCenterBC(int nx, int ny, int nz, arr3_double _vector, int bcFace
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-
-  timeTasks.addto_communicate();
 }
 // /////////// communication + BC ////////////////////////////
 void communicateCenterBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
   double ***vector=_vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
@@ -1075,6 +1055,4 @@ void communicateCenterBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFa
   delete[]ghostXrightYrightZsameEdge;
   delete[]ghostXleftYleftZsameEdge;
   delete[]ghostXleftYrightZsameEdge;
-
-  timeTasks.addto_communicate();
 }
diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 71872157..4abc4585 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -250,6 +250,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   #pragma omp parallel
   {
     int thread_num = omp_get_thread_num();
+    if(!thread_num) { timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION); }
     #ifdef TENMOMENTS
     TenMoments& speciesMoments = fetch_momentsArray(thread_num);
     speciesMoments.set_to_zero();
@@ -570,6 +571,11 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
       }
       #endif // TENMOMENTS
     }
+    if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION);
+
+    // reduction
+    if(!thread_num) timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION);
+
     // split up the reduction tasks.
     //
     //{
@@ -646,6 +652,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }}
     }
+    if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_REDUCTION);
   }
   communicateGhostP2G(is, 0, 0, 0, 0, vct);
 }
@@ -1570,7 +1577,7 @@ void EMfields3D::interpDensitiesN2C(VirtualTopology3D * vct, Grid * grid) {
 /*! communicate ghost for grid -> Particles interpolation */
 void EMfields3D::communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, VirtualTopology3D * vct) {
   // interpolate adding common nodes among processors
-  timeTasks.start_communicate();
+  timeTasks_set_communicating();
 
   communicateInterp(nxn, nyn, nzn, ns, rhons.fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
   communicateInterp(nxn, nyn, nzn, ns, Jxs  .fetch_arr4(), 0, 0, 0, 0, 0, 0, vct);
@@ -1585,7 +1592,6 @@ void EMfields3D::communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft,
   // calculate the correct densities on the boundaries
   adjustNonPeriodicDensities(ns, vct);
   // put the correct values on ghost cells
-  timeTasks.addto_communicate();
 
   communicateNode_P(nxn, nyn, nzn, rhons, ns, vct);
   communicateNode_P(nxn, nyn, nzn, Jxs  , ns, vct);
diff --git a/iPic3D.cpp b/iPic3D.cpp
index ec9b59ba..4e768a71 100644
--- a/iPic3D.cpp
+++ b/iPic3D.cpp
@@ -3,6 +3,7 @@
 #include <iomanip>
 #include "iPic3D.h"
 #include "debug.h"
+#include "TimeTasks.h"
 
 using namespace iPic3D;
 
@@ -19,8 +20,14 @@ int main(int argc, char **argv) {
     if (KCode.get_myrank() == 0) cout << " ======= Cycle " << i << " ======= " << endl;
 
     if (!b_err) {
+      timeTasks.resetCycle();
+      KCode.CalculateMoments();
       KCode.CalculateField();
       b_err = KCode.ParticlesMover();
+      KCode.CalculateB();
+
+      // print out total time for all tasks
+      timeTasks.print_cycle_times(i);
     }
 
     if (b_err) {
diff --git a/include/ComNodes3D.h b/include/ComNodes3D.h
index c7e86731..edd6aba1 100644
--- a/include/ComNodes3D.h
+++ b/include/ComNodes3D.h
@@ -12,9 +12,6 @@ developers           : Stefano Markidis, Giovanni Lapenta
 
 #include "arraysfwd.h"
 #include "ComBasic3D.h"
-//#include "TimeTasks.h"
-
-//extern TimeTasks timeTasks;
 
 // boundary condition for fields
 #include "BcFields3D.h"
diff --git a/include/TimeTasks.h b/include/TimeTasks.h
index 8427bee5..84b9d230 100644
--- a/include/TimeTasks.h
+++ b/include/TimeTasks.h
@@ -1,45 +1,90 @@
 #ifndef __TimeTasks_H__
 #define __TimeTasks_H__
 
-class TimeTasks {
+/* Avoid direct use of this class.
+   Instead, use and add to the macros at the bottom
+   so that we can redefine the macros when desired
+   (e.g. defining them to the empty string to
+   remove performance penalty).
+ */
+
+class TimeTasks
+{
+ public:
 
-public:
   // legitimate active subcycle values
-  enum Tasks {
+  //
+  // timeTasks_begin_task(0) and timeTasks_end_task(0) are no-ops
+  // (both macros test "if(task)"), so MOMENT_REDUCTION=0
+  // would prevent monitoring of this task.
+  //
+  enum Tasks // order must agree with taskNames in TimeTasks.cpp
+  {
     NONE = 0,
     MOMENTS,
     FIELDS,
     PARTICLES,
+    LAST, // no more exclusive tasks
     BFIELD,
-    LAST,
-  };
-  enum Modes {
-    COMPUTATION = 0,
-    COMMUNICATION,
+    MOMENT_ACCUMULATION,
+    MOMENT_REDUCTION,
+    NUMBER_OF_TASKS // this line should be last
   };
 
-public:
-  void setActiveTask(int arg) {
-    active_task = arg;
-  } void setActiveMode(int in) {
-    t_start_communicate = in;
-  }
-  void resetCycle();
-  void start(int taskid);
-  void end(int taskid);
-  void start_communicate();
-  void addto_communicate();
-  void print_cycle_times();
+ private:
+  //enum Modes // for exclusive tasks
+  //{
+  //  COMPUTATION = 0,
+  //  COMMUNICATION,
+  //};
+
+ public: // methods
+
   TimeTasks() {
     resetCycle();
   }
+
+  // monitoring
+  //
+  void resetCycle();
+  //
+  // provide start_time on ending call
+  //
+  void end_communicating(double start_time);
+  void start_main_task(TimeTasks::Tasks taskid);
+  void end_main_task(TimeTasks::Tasks taskid, double start_time);
+  void start_task(TimeTasks::Tasks taskid);
+  void end_task(TimeTasks::Tasks taskid, double start_time);
+  //
+  // provide start_time at starting call
+  //
+  void start_task(TimeTasks::Tasks taskid, double start_time);
+  void end_task(TimeTasks::Tasks taskid);
+
+  // accessors
+  //
+  bool is_active(Tasks taskid){ return active[taskid]; }
+  bool get_communicating() { return communicating; }
+  void set_communicating(bool val) { communicating = val; }
+  int get_stack_depth(TimeTasks::Tasks taskid) { return stack_depth[taskid]; }
+
+  // reporting
+  //
+  void print_cycle_times(int cycle);
+
+ private:
+
+  // is task exclusive?
+  bool is_exclusive(Tasks taskid) { return (taskid < LAST); }
+
+  // reporting
+  //
   double get_time(int arg) {
     return task_duration[arg];
   }
   double get_communicate(int arg) {
     return communicate[arg];
   }
-
   double get_communicate() {
     double total = 0.;
     for (int i = NONE + 1; i < LAST; i++) {
@@ -47,7 +92,6 @@ class TimeTasks {
     }
     return total;
   }
-
   double get_time() {
     double total = 0.;
     for (int i = NONE + 1; i < LAST; i++) {
@@ -55,25 +99,101 @@ class TimeTasks {
     }
     return total;
   }
-
   double get_compute(int arg) {
     return get_time(arg) - get_communicate(arg);
   }
   double get_compute() {
     return get_time() - get_communicate();
   }
+  const char* get_taskname(int arg);
 
-private:
+ private:
   int active_task;
-  int active_mode;
-  double t_start_communicate;
-  double start_times[LAST];
-  double task_duration[LAST];
-  double communicate[LAST];
-  double compute[LAST];
-
+  bool active[NUMBER_OF_TASKS];
+  bool communicating;
+  double task_duration[NUMBER_OF_TASKS];
+  double communicate[NUMBER_OF_TASKS];
+  double compute[NUMBER_OF_TASKS];
+  int stack_depth[NUMBER_OF_TASKS];
+  double start_times[NUMBER_OF_TASKS];
 };
 
 extern TimeTasks timeTasks;
 
+// Scope guard: starts a main (exclusive) task on construction and ends it on destruction.
+class TimeTasks_caller_to_set_main_task_for_scope
+{
+  double start_time;
+  TimeTasks::Tasks task;
+ public:
+  TimeTasks_caller_to_set_main_task_for_scope(TimeTasks::Tasks _task) :
+    task(_task)
+  {
+    start_time = MPI_Wtime();
+    timeTasks.start_main_task(task);
+  }
+  ~TimeTasks_caller_to_set_main_task_for_scope()
+  {
+    timeTasks.end_main_task(task, start_time);
+  }
+};
+
+class TimeTasks_caller_to_set_task_for_scope
+{
+  bool already_active;
+  double start_time;
+  TimeTasks::Tasks task;
+ public:
+  TimeTasks_caller_to_set_task_for_scope(TimeTasks::Tasks _task) :
+    task(_task)
+  {
+    already_active = timeTasks.is_active(task);
+    if(!already_active)
+    {
+      start_time = MPI_Wtime();
+      timeTasks.start_task(task);
+    }
+  }
+  ~TimeTasks_caller_to_set_task_for_scope()
+  {
+    if(already_active)
+    {
+      assert(timeTasks.is_active(task));
+    }
+    else
+    {
+      timeTasks.end_task(task, start_time);
+    }
+  }
+};
+
+class TimeTasks_caller_to_set_communication_mode_for_scope
+{
+ private:
+  bool already_communicating;
+  double start_time;
+ public:
+  TimeTasks_caller_to_set_communication_mode_for_scope();
+  ~TimeTasks_caller_to_set_communication_mode_for_scope();
+};
+
+// These macros could be changed to provide file and line number
+//
+// We need to create named (non-anonymous) instances so that the destructor
+// will not be called until the end of the scope, so we use the preprocessor
+// to generate the names of these instances.
+//
+#define timeTasks_set_main_task(task) \
+  TimeTasks_caller_to_set_main_task_for_scope myFunnyInstance(task);
+#define timeTasks_set_task(task) \
+  TimeTasks_caller_to_set_task_for_scope myFunnyName##__func__##__LINE__(task);
+#define timeTasks_set_communicating() \
+  TimeTasks_caller_to_set_communication_mode_for_scope myFunnyCommunicationInstance;
+//
+// The scoping trick does not work if the timeTasks call needs to be conditional,
+// so we also provide the ability to explicitly begin and end.
+#define timeTasks_begin_task(task) if(task) timeTasks.start_task(task, MPI_Wtime());
+#define timeTasks_end_task(task) if(task) timeTasks.end_task(task);
+//
+
 #endif
diff --git a/include/iPic3D.h b/include/iPic3D.h
index 9a79c131..93db7d11 100644
--- a/include/iPic3D.h
+++ b/include/iPic3D.h
@@ -34,8 +34,10 @@ namespace iPic3D {
 
   public:
     int Init(int argc, char **argv);
-    void CalculateField();
+    void CalculateMoments();
+    void CalculateField(); //! calculate Efield
     bool ParticlesMover();
+    void CalculateB();
     void WriteOutput(int cycle);
     void WriteConserved(int cycle);
     void WriteRestart(int cycle);
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 96216d78..d88692ed 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -171,11 +171,9 @@ int c_Solver::Init(int argc, char **argv) {
   return 0;
 }
 
-void c_Solver::CalculateField() {
+void c_Solver::CalculateMoments() {
 
-  timeTasks.resetCycle();
-  // interpolation
-  timeTasks.start(TimeTasks::MOMENTS);
+  timeTasks_set_main_task(TimeTasks::MOMENTS);
 
   EMf->updateInfoFields(grid,vct,col);
   EMf->setZeroDensities();                  // set to zero the densities
@@ -201,32 +199,39 @@ void c_Solver::CalculateField() {
   EMf->interpDensitiesN2C(vct, grid);       // calculate densities on centers from nodes
   EMf->calculateHatFunctions(grid, vct);    // calculate the hat quantities for the implicit method
   MPI_Barrier(MPI_COMM_WORLD);
-  timeTasks.end(TimeTasks::MOMENTS);
+}
 
-  // MAXWELL'S SOLVER
-  timeTasks.start(TimeTasks::FIELDS);
+//! MAXWELL SOLVER for Efield
+void c_Solver::CalculateField() {
+  timeTasks_set_main_task(TimeTasks::FIELDS);
   EMf->calculateE(grid, vct, col);               // calculate the E field
-  timeTasks.end(TimeTasks::FIELDS);
+}
 
+//! MAXWELL SOLVER for Bfield (assuming Efield has already been calculated)
+void c_Solver::CalculateB() {
+  timeTasks_set_main_task(TimeTasks::FIELDS);
+  timeTasks_set_task(TimeTasks::BFIELD); // subtask
+  EMf->calculateB(grid, vct, col);   // calculate the B field
 }
 
+/*  -------------- */
+/*!  Particle mover */
+/*  -------------- */
 bool c_Solver::ParticlesMover() {
 
-  /*  -------------- */
-  /*  Particle mover */
-  /*  -------------- */
-
-  timeTasks.start(TimeTasks::PARTICLES);
-  // Should change this to add background guide field
-  EMf->set_fieldForPcls();
-  for (int i = 0; i < ns; i++)  // move each species
+  // move all species of particles
   {
-    // #pragma omp task inout(part[i]) in(grid) target_device(booster)
-    //
-    // should merely pass EMf->get_fieldForPcls() rather than EMf.
-    mem_avail = part[i].mover_PC(grid, vct, EMf); // use the Predictor Corrector scheme 
+    timeTasks_set_main_task(TimeTasks::PARTICLES);
+    // Should change this to add background field
+    EMf->set_fieldForPcls();
+    for (int i = 0; i < ns; i++)  // move each species
+    {
+      // #pragma omp task inout(part[i]) in(grid) target_device(booster)
+      //
+      // should merely pass EMf->get_fieldForPcls() rather than EMf.
+      mem_avail = part[i].mover_PC(grid, vct, EMf); // use the Predictor Corrector scheme 
+    }
   }
-  timeTasks.end(TimeTasks::PARTICLES);
 
   if (mem_avail < 0) {          // not enough memory space allocated for particles: stop the simulation
     if (myrank == 0) {
@@ -263,20 +268,7 @@ bool c_Solver::ParticlesMover() {
     for (int i=0; i < ns; i++)
       Qremoved[i] = part[i].deleteParticlesInsideSphere(col->getL_square(),col->getx_center(),col->gety_center(),col->getz_center());
   }
-
-  /* --------------------- */
-  /* Calculate the B field */
-  /* This step must be taken out of here! */
-  /* --------------------- */
-
-  timeTasks.start(TimeTasks::BFIELD);
-  EMf->calculateB(grid, vct, col);   // calculate the B field
-  timeTasks.end(TimeTasks::BFIELD);
-
-  // print out total time for all tasks
-  timeTasks.print_cycle_times();
   return (false);
-
 }
 
 void c_Solver::WriteRestart(int cycle) {
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index ecb1dab7..50550ee6 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -647,7 +647,7 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
   // ********************//
   // COMMUNICATION 
   // *******************//
-  timeTasks.start_communicate();
+  timeTasks_set_communicating(); // communicating until end of scope
   const int avail = communicate(vct);
   if (avail < 0)
     return (-1);
@@ -660,7 +660,6 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       return (-1);
     MPI_Barrier(MPI_COMM_WORLD);
   }
-  timeTasks.addto_communicate();
   return (0);                   // exit succcesfully (hopefully) 
 }
 
diff --git a/utility/TimeTasks.cpp b/utility/TimeTasks.cpp
index b48bb070..15a2c578 100644
--- a/utility/TimeTasks.cpp
+++ b/utility/TimeTasks.cpp
@@ -4,121 +4,167 @@
 #include "TimeTasks.h"
 #include "asserts.h"
 #include "MPIdata.h" // for get_rank
+#include "debug.h"
 
 /** implementation of declarations in utility/TimeTasks.h **/
 
 TimeTasks timeTasks;
 
+static const char *taskNames[] = // printable task names indexed by task id; order must agree with Tasks in TimeTasks.h
+{
+  "none",
+  "moments",
+  "fields",
+  "particles",
+  "last", // print_cycle_times() reports ids below LAST as main tasks and ids above LAST as subtasks
+  "bfield",
+  "moment_accumulation",
+  "moment_reduction",
+  "number_of_tasks" // sentinel label; presumably sits at index NUMBER_OF_TASKS -- confirm against the enum
+};
+
+const char* TimeTasks::get_taskname(int arg) // returns the printable name stored in taskNames for task id arg
+{
+  assert_le(arg,NUMBER_OF_TASKS); // only the upper bound is checked; a negative arg is not caught here
+  return taskNames[arg];
+}
+
 void TimeTasks::resetCycle()
 {
-  for(int e=0;e<LAST;e++)
+  for(int e=0;e<NUMBER_OF_TASKS;e++)
   {
-    //compute[e]=0.;
-    start_times[e]=0.;
     task_duration[e]=0.;
+    compute[e]=0.;
     communicate[e]=0.;
+    active[e]=false;
+    stack_depth[e]=0;
+    start_times[e]=0.;
   }
   active_task=NONE;
-  active_mode=COMPUTATION;
-  t_start_communicate = 0.;
+  communicating=false;
 }
-void TimeTasks::start(int taskid)
+void TimeTasks::start_main_task(TimeTasks::Tasks taskid)
 {
-  assert_eq(active_task+1,taskid);
+  assert(is_exclusive(taskid));
+  assert_ne(active_task, taskid);
   active_task = taskid;
-  double now = MPI_Wtime();
-  start_times[active_task] = now;
+  assert(!active[taskid]);
+  active[taskid]=true;
+  //if(!MPIdata::get_rank())
+  //dprintf("starting task %s at time %24.16e\n", get_taskname(taskid), MPI_Wtime());
+}
+void TimeTasks::start_task(TimeTasks::Tasks taskid) // flags a non-exclusive task as active; records no start time itself
+{
+  assert(!is_exclusive(taskid)); // exclusive tasks are started via start_main_task(), which asserts the opposite
+  assert(!active[taskid]); // the task must not already be flagged active
+  active[taskid]=true;
+  //dprintf("starting task %s at time %24.16e\n", get_taskname(taskid), MPI_Wtime());
+}
+// nestable start: have to manage the task stack explicitly via stack_depth[taskid]
+void TimeTasks::start_task(TimeTasks::Tasks taskid, double start_time)
+{
+  if(stack_depth[taskid]==0) // only the outermost (depth 0) call records the time and activates the task
+  {
+    start_times[taskid]=start_time; // later read by end_task(taskid) when the depth returns to 0
+    start_task(taskid);
+  }
+  stack_depth[taskid]++; // every call counts, so each one needs a matching end_task(taskid)
+  //dprintf("starting task %s at time %24.16e\n", get_taskname(taskid), start_time);
+}
+void TimeTasks::end_main_task(TimeTasks::Tasks taskid, double start_time)
+{
+  end_task(taskid, start_time);
+  active_task = NONE;
 }
-void TimeTasks::end(int taskid)
+void TimeTasks::end_task(TimeTasks::Tasks taskid, double start_time)
 {
-  assert_eq(taskid,active_task);
+  assert(active[taskid]);
   double now = MPI_Wtime();
-  task_duration[active_task] = now - start_times[active_task];
-  compute[active_task] = task_duration[active_task]-communicate[active_task];
+  // compute time spent on task
+  task_duration[taskid] += now - start_time;
+  active[taskid] = false;
 }
-void TimeTasks::start_communicate()
+// have to manage the task stack explicitly
+void TimeTasks::end_task(TimeTasks::Tasks taskid)
 {
-  if(!active_task) return;
-  assert_eq(active_mode,COMPUTATION);
-  t_start_communicate = MPI_Wtime();
-  active_mode=COMMUNICATION;
+  stack_depth[taskid]--;
+  assert_ge(stack_depth[taskid],0);
+  if(stack_depth[taskid]==0)
+  {
+    end_task(taskid, start_times[taskid]);
+  }
 }
-void TimeTasks::addto_communicate()
+void TimeTasks::end_communicating(double start_time)
 {
-  if(!active_task) return;
-  assert_eq(active_mode,COMMUNICATION);
-  assert_ne(t_start_communicate,0.);
-  communicate[active_task] += MPI_Wtime()-t_start_communicate;
-  t_start_communicate = 0.;
-  active_mode=COMPUTATION;
+  //if(!active_task) return;
+  assert(active_task);
+  assert(communicating);
+  double additional_communication_time = MPI_Wtime()-start_time;
+  //dprint(additional_communication_time);
+  communicate[active_task] += additional_communication_time;
+  communicating=false;
 }
 #define TIMING_PREFIX "| "
-void TimeTasks::print_cycle_times()
+void TimeTasks::print_cycle_times(int cycle)
 {
+  // calculate portion of time spent computing
+  //
+  for(int e=NONE+1; e<NUMBER_OF_TASKS; e++)
+  {
+    compute[e] = task_duration[e]-communicate[e];
+  }
+
+  FILE* file = stdout;
   // we could report average for all processes
   if(!MPIdata::get_rank())
   {
-    fflush(stdout);
-    fprintf(stdout,"=== times for cycle for rank %d === \n",
+    fflush(file);
+    fprintf(file,"=== times for cycle %d for rank %d === \n",
+      cycle,
       MPIdata::get_rank());
-    fprintf(stdout, TIMING_PREFIX
-      "moms flds pcls Bfld cycl\n");
-    fprintf(stdout, TIMING_PREFIX
-      "%4.2f "
-      "%4.2f "
-      "%4.2f "
-      "%4.2f "
-      "%4.2f (total time)\n",
-      get_time(TimeTasks::MOMENTS),
-      get_time(TimeTasks::FIELDS),
-      get_time(TimeTasks::PARTICLES),
-      get_time(TimeTasks::BFIELD),
-      get_time()
-      );
-    fprintf(stdout, TIMING_PREFIX
-      "%4.2f "
-      "%4.2f "
-      "%4.2f "
-      "%4.2f "
-      "%4.2f (communication)\n",
-      get_communicate(TimeTasks::MOMENTS),
-      get_communicate(TimeTasks::FIELDS),
-      get_communicate(TimeTasks::PARTICLES),
-      get_communicate(TimeTasks::BFIELD),
-      get_communicate()
-      );
-    fprintf(stdout, TIMING_PREFIX
-      "%4.2f "
-      "%4.2f "
-      "%4.2f "
-      "%4.2f "
-      "%4.2f (computation)\n",
-      get_compute(TimeTasks::MOMENTS),
-      get_compute(TimeTasks::FIELDS),
-      get_compute(TimeTasks::PARTICLES),
-      get_compute(TimeTasks::BFIELD),
-      get_compute()
-      );
-    //fprintf(stdout, TIMING_PREFIX
-    //  "MOMS comm  FLDS comm  PCLS comm  CYCL comm\n");
-    //fprintf(stdout, TIMING_PREFIX
-    //  "%4.2f "
-    //  "%4.2f  "
-    //  "%4.2f "
-    //  "%4.2f  "
-    //  "%4.2f "
-    //  "%4.2f  "
-    //  "%4.2f "
-    //  "%4.2f\n",
-    //  get_time(TimeTasks::MOMENTS),
-    //  get_communicate(TimeTasks::MOMENTS),
-    //  get_time(TimeTasks::FIELDS),
-    //  get_communicate(TimeTasks::FIELDS),
-    //  get_time(TimeTasks::PARTICLES),
-    //  get_communicate(TimeTasks::PARTICLES),
-    //  get_time(),
-    //  get_communicate()
-    //  );
-    fflush(stdout);
+    fprintf(file, TIMING_PREFIX "total  comput commun task\n");
+    for(int e=NONE+1; e<LAST; e++)
+    {
+      fprintf(file, TIMING_PREFIX "%6.3f %6.3f %6.3f %s\n",
+      get_time(e),
+      get_compute(e),
+      get_communicate(e),
+      get_taskname(e));
+    }
+    fprintf(file, TIMING_PREFIX "%6.3f %6.3f %6.3f %s\n",
+      get_time(),
+      get_compute(),
+      get_communicate(),
+      "[total times]");
+
+    fprintf(file, TIMING_PREFIX "time  subtask\n");
+    for(int e=LAST+1; e<NUMBER_OF_TASKS; e++)
+    {
+      assert_eq(stack_depth[e],0);
+      fprintf(file, TIMING_PREFIX "%5.3f %s\n",
+      get_time(e),
+      get_taskname(e));
+    }
+    
+    fflush(file);
+  }
+}
+
+TimeTasks_caller_to_set_communication_mode_for_scope::
+TimeTasks_caller_to_set_communication_mode_for_scope() // on construction: enter communication mode unless it is already set
+{
+  already_communicating = timeTasks.get_communicating(); // remember whether the flag was already set when this object was created
+  if(!already_communicating)
+  {
+    start_time = MPI_Wtime(); // only the object that sets the flag starts a clock
+    timeTasks.set_communicating(true);
+  }
+}
+TimeTasks_caller_to_set_communication_mode_for_scope::
+~TimeTasks_caller_to_set_communication_mode_for_scope() // on destruction: undo what the constructor did, if it did anything
+{
+  if(!already_communicating)
+  {
+    timeTasks.end_communicating(start_time); // adds MPI_Wtime()-start_time to communicate[active_task] and clears the flag
   }
 }

From 1fb8825dd4768128e6e8ac9339d896a98ceb5bbe Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 10 Oct 2013 12:36:01 +0200
Subject: [PATCH 057/118] corrected compile error from previous commit

---
 include/TimeTasks.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/TimeTasks.h b/include/TimeTasks.h
index 84b9d230..8387d042 100644
--- a/include/TimeTasks.h
+++ b/include/TimeTasks.h
@@ -1,5 +1,6 @@
 #ifndef __TimeTasks_H__
 #define __TimeTasks_H__
+#include "assert.h"
 
 /* Avoid direct use of this class.
    Instead, use and add to the macros at the bottom

From e86bc65d6e444a28258ad1db32fb07812a8fb02d Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 14 Oct 2013 12:23:51 +0200
Subject: [PATCH 058/118] sumMoments now sums moments of all species

---
 fields/EMfields3D.cpp | 189 +++++++++++++++++++++++++++++++++++++++---
 fields/Moments.cpp    |  20 -----
 include/EMfields3D.h  |  16 +---
 include/Moments.h     |  72 +---------------
 main/iPic3Dlib.cpp    |  12 +--
 5 files changed, 185 insertions(+), 124 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 4abc4585..f081bb54 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -193,21 +193,15 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) :
   injFieldsRear   = new injInfoFields(nxn, nyn, nzn);
 
   sizeMomentsArray = omp_thread_count();
-  #ifdef TENMOMENTS
-  tenMomentsArray = new TenMoments*[sizeMomentsArray];
-  #endif // TENMOMENTS
   moments10Array = new Moments10*[sizeMomentsArray];
   for(int i=0;i<sizeMomentsArray;i++)
   {
-    #ifdef TENMOMENTS
-    tenMomentsArray[i] = new TenMoments(nxn,nyn,nzn);
-    #endif // TENMOMENTS
     moments10Array[i] = new Moments10(nxn,nyn,nzn);
   }
 }
 
 // This was Particles3Dcomm::interpP2G()
-void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct)
+void EMfields3D::sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct)
 {
   const double inv_dx = 1.0 / dx;
   const double inv_dy = 1.0 / dy;
@@ -656,6 +650,183 @@ void EMfields3D::sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTop
   }
   communicateGhostP2G(is, 0, 0, 0, 0, vct);
 }
+// Accumulate the ten moments (rho, J, P) of all ns species in part[] onto the grid nodes (was Particles3Dcomm::interpP2G()).
+void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct)
+{
+  const double inv_dx = 1.0 / dx;
+  const double inv_dy = 1.0 / dy;
+  const double inv_dz = 1.0 / dz;
+  const int nxn = grid->getNXN();
+  const int nyn = grid->getNYN();
+  const int nzn = grid->getNZN();
+  const double xstart = grid->getXstart();
+  const double ystart = grid->getYstart();
+  const double zstart = grid->getZstart();
+  // To make memory use scale to a large number of threads, we
+  // could first apply an efficient parallel sorting algorithm
+  // to the particles and then accumulate moments in smaller
+  // subarrays.
+  // Every thread executes the whole species loop; only the particle loop inside is work-shared.
+  #pragma omp parallel
+  for (int i = 0; i < ns; i++)
+  {
+    const Particles3Dcomm& pcls = part[i];
+    const int is = pcls.get_ns(); // species index used to select the target moment arrays
+
+    double const*const x = pcls.getXall();
+    double const*const y = pcls.getYall();
+    double const*const z = pcls.getZall();
+    double const*const u = pcls.getUall();
+    double const*const v = pcls.getVall();
+    double const*const w = pcls.getWall();
+    double const*const q = pcls.getQall();
+    const int nop = pcls.getNOP();
+    int thread_num = omp_get_thread_num();
+    if(!thread_num) { timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION); }
+    Moments10& speciesMoments10 = fetch_moments10Array(thread_num);
+    speciesMoments10.set_to_zero();
+    arr4_double moments = speciesMoments10.fetch_arr();
+    // The following loop is expensive, so it is wise to assume that the
+    // compiler is stupid.  Therefore we should on the one hand
+    // expand things out and on the other hand avoid repeating computations.
+    #pragma omp for nowait
+    for (int i = 0; i < nop; i++)
+    {
+      // compute the quadratic moments of velocity
+      //
+      const double ui=u[i];
+      const double vi=v[i];
+      const double wi=w[i];
+      const double uui=ui*ui;
+      const double uvi=ui*vi;
+      const double uwi=ui*wi;
+      const double vvi=vi*vi;
+      const double vwi=vi*wi;
+      const double wwi=wi*wi;
+      double velmoments[10];
+      velmoments[0] = 1.;
+      velmoments[1] = ui;
+      velmoments[2] = vi;
+      velmoments[3] = wi;
+      velmoments[4] = uui;
+      velmoments[5] = uvi;
+      velmoments[6] = uwi;
+      velmoments[7] = vvi;
+      velmoments[8] = vwi;
+      velmoments[9] = wwi;
+
+      //
+      // compute the weights to distribute the moments
+      //
+      const int ix = 2 + int (floor((x[i] - xstart) * inv_dx));
+      const int iy = 2 + int (floor((y[i] - ystart) * inv_dy));
+      const int iz = 2 + int (floor((z[i] - zstart) * inv_dz));
+      const double xi0   = x[i] - grid->getXN(ix-1);
+      const double eta0  = y[i] - grid->getYN(iy-1);
+      const double zeta0 = z[i] - grid->getZN(iz-1);
+      const double xi1   = grid->getXN(ix) - x[i];
+      const double eta1  = grid->getYN(iy) - y[i];
+      const double zeta1 = grid->getZN(iz) - z[i];
+      const double qi = q[i];
+      const double weight000 = qi * xi0 * eta0 * zeta0 * invVOL;
+      const double weight001 = qi * xi0 * eta0 * zeta1 * invVOL;
+      const double weight010 = qi * xi0 * eta1 * zeta0 * invVOL;
+      const double weight011 = qi * xi0 * eta1 * zeta1 * invVOL;
+      const double weight100 = qi * xi1 * eta0 * zeta0 * invVOL;
+      const double weight101 = qi * xi1 * eta0 * zeta1 * invVOL;
+      const double weight110 = qi * xi1 * eta1 * zeta0 * invVOL;
+      const double weight111 = qi * xi1 * eta1 * zeta1 * invVOL;
+      double weights[8];
+      weights[0] = weight000;
+      weights[1] = weight001;
+      weights[2] = weight010;
+      weights[3] = weight011;
+      weights[4] = weight100;
+      weights[5] = weight101;
+      weights[6] = weight110;
+      weights[7] = weight111; // was weights[0]: left slot 7 unset and clobbered the 000 weight
+
+      // add particle to moments
+      {
+        arr1_double_fetch moments000 = moments[ix  ][iy  ][iz  ];
+        arr1_double_fetch moments001 = moments[ix  ][iy  ][iz-1];
+        arr1_double_fetch moments010 = moments[ix  ][iy-1][iz  ];
+        arr1_double_fetch moments011 = moments[ix  ][iy-1][iz-1];
+        arr1_double_fetch moments100 = moments[ix-1][iy  ][iz  ];
+        arr1_double_fetch moments101 = moments[ix-1][iy  ][iz-1];
+        arr1_double_fetch moments110 = moments[ix-1][iy-1][iz  ];
+        arr1_double_fetch moments111 = moments[ix-1][iy-1][iz-1];
+
+        arr1_double_fetch momentsArray[8];
+        momentsArray[0] = moments000;
+        momentsArray[1] = moments001;
+        momentsArray[2] = moments010;
+        momentsArray[3] = moments011;
+        momentsArray[4] = moments100;
+        momentsArray[5] = moments101;
+        momentsArray[6] = moments110;
+        momentsArray[7] = moments111;
+
+        double buffer[10][8]; // buffer[m][c]: moment m of this particle weighted for cell corner c
+        // #pragma simd
+        for(int m=0;m<10;m++)
+        for(int c=0;c<8;c++)
+        {
+          buffer[m][c] = velmoments[m]*weights[c]; // m indexes the 10 moments, c the 8 corners
+        }
+        for(int c=0;c<8;c++)
+        for(int m=0;m<10;m++)
+        {
+          momentsArray[c][m] += buffer[m][c]; // accumulate; plain assignment would discard earlier particles
+        }
+      }
+    }
+    if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION);
+
+    // reduction
+    if(!thread_num) timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION);
+
+    // reduce arrays: one named critical section per target array (OpenMP requires identifier names)
+    {
+      #pragma omp critical (sumMoments_rhons)
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { rhons[is][i][j][k] += invVOL*moments[i][j][k][0]; }}
+      #pragma omp critical (sumMoments_Jxs)
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1]; }}
+      #pragma omp critical (sumMoments_Jys)
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { Jys  [is][i][j][k] += invVOL*moments[i][j][k][2]; }}
+      #pragma omp critical (sumMoments_Jzs)
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3]; }}
+      #pragma omp critical (sumMoments_pXXsn)
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4]; }}
+      #pragma omp critical (sumMoments_pXYsn)
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5]; }}
+      #pragma omp critical (sumMoments_pXZsn)
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6]; }}
+      #pragma omp critical (sumMoments_pYYsn)
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7]; }}
+      #pragma omp critical (sumMoments_pYZsn)
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8]; }}
+      #pragma omp critical (sumMoments_pZZsn)
+      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+        { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }}
+    }
+    if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_REDUCTION);
+    // uncomment this and remove the loop below
+    // when we change to use asynchronous communication.
+    // communicateGhostP2G(is, 0, 0, 0, 0, vct);
+  }
+  for (int i = 0; i < ns; i++) // ghost exchange per species, after the parallel region has joined (as sumMomentsOld does)
+    communicateGhostP2G(part[i].get_ns(), 0, 0, 0, 0, vct);
+}
 
 /*! Calculate Electric field with the implicit solver: the Maxwell solver method is called here */
 void EMfields3D::calculateE(Grid * grid, VirtualTopology3D * vct, Collective *col) {
@@ -3564,10 +3735,6 @@ EMfields3D::~EMfields3D() {
   delete injFieldsBottom;
   delete injFieldsFront;
   delete injFieldsRear;
-  #ifdef TENMOMENTS
-  for(int i=0;i<sizeMomentsArray;i++) { delete tenMomentsArray[i]; }
-  delete [] tenMomentsArray;
-  #endif // TENMOMENTS
   for(int i=0;i<sizeMomentsArray;i++) { delete moments10Array[i]; }
   delete [] moments10Array;
 }
diff --git a/fields/Moments.cpp b/fields/Moments.cpp
index 5a4c5d0e..c1518b4d 100644
--- a/fields/Moments.cpp
+++ b/fields/Moments.cpp
@@ -13,23 +13,3 @@ void Moments10::set_to_zero()
   }
 }
 
-#ifdef TENMOMENTS
-void TenMoments::set_to_zero() {
-  #pragma omp parallel for collapse(3)
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++) {
-        rho[i][j][k] = 0.0;
-        Jx[i][j][k] = 0.0;
-        Jy[i][j][k] = 0.0;
-        Jz[i][j][k] = 0.0;
-        pXX[i][j][k] = 0.0;
-        pXY[i][j][k] = 0.0;
-        pXZ[i][j][k] = 0.0;
-        pYY[i][j][k] = 0.0;
-        pYZ[i][j][k] = 0.0;
-        pZZ[i][j][k] = 0.0;
-      }
-}
-#endif // TENMOMENTS
-
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 38c62018..2aaef7c4 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -31,9 +31,6 @@ using std::endl;
 /*! Electromagnetic fields and sources defined for each local grid, and for an implicit maxwell's solver @date May 2008 @par Copyright: (C) 2008 KUL @author Stefano Markidis, Giovanni Lapenta. @version 3.0 */
 
 class Particles3Dcomm;
-#ifdef TENMOMENTS
-class TenMoments;
-#endif // TENMOMENTS
 class Moments10;
 class EMfields3D                // :public Field
 {
@@ -122,7 +119,8 @@ class EMfields3D                // :public Field
     void set_fieldForPcls();
     /*! communicate ghost for grid -> Particles interpolation */
     void communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, VirtualTopology3D * vct);
-    void sumMoments(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
+    void sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
+    void sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
     /*! add accumulated moments to the moments for a given species */
     //void addToSpeciesMoments(const TenMoments & in, int is);
     /*! add an amount of charge density to charge density field at node X,Y,Z */
@@ -262,13 +260,6 @@ class EMfields3D                // :public Field
     double getBenergy();
 
     /*! fetch array for summing moments of thread i */
-    #ifdef TENMOMENTS
-    TenMoments& fetch_momentsArray(int i){
-      assert_le(0,i);
-      assert_le(i,sizeMomentsArray);
-      return *(tenMomentsArray[i]);
-    }
-    #endif // TENMOMENTS
     Moments10& fetch_moments10Array(int i){
       assert_le(0,i);
       assert_le(i,sizeMomentsArray);
@@ -402,9 +393,6 @@ class EMfields3D                // :public Field
     array3_double divC;
     /* temporary arrays for summing moments */
     int sizeMomentsArray;
-    #ifdef TENMOMENTS
-    TenMoments **tenMomentsArray;
-    #endif // TENMOMENTS
     Moments10 **moments10Array;
 
     // *******************************************************************************
diff --git a/include/Moments.h b/include/Moments.h
index 8a4a10cf..9c6aadb8 100644
--- a/include/Moments.h
+++ b/include/Moments.h
@@ -2,6 +2,8 @@
 #define Moments_H
 #include "Alloc.h"
 
+// class to accumulate node-centered species moments
+// 
 class Moments10
 {
   private:
@@ -25,74 +27,4 @@ class Moments10
     ~Moments10(){};
 };
 
-// class to accumulate node-centered species moments
-// 
-#ifdef TENMOMENTS
-class TenMoments {
-  private:
-    arr3_double rho;
-
-    /** current density, defined on nodes */
-    arr3_double Jx;
-    arr3_double Jy;
-    arr3_double Jz;
-
-    /** pressure tensor components, defined on nodes */
-    arr3_double pXX;
-    arr3_double pXY;
-    arr3_double pXZ;
-    arr3_double pYY;
-    arr3_double pYZ;
-    arr3_double pZZ;
-    int nx;
-    int ny;
-    int nz;
-  public:
-    // get accessors (read access)
-    int get_nx() const { return nx; }
-    int get_ny() const { return ny; }
-    int get_nz() const { return nz; }
-    double get_rho(int i, int j, int k) const { return rho.get(i,j,k); }
-    double get_Jx (int i, int j, int k) const { return Jx .get(i,j,k); }
-    double get_Jy (int i, int j, int k) const { return Jy .get(i,j,k); }
-    double get_Jz (int i, int j, int k) const { return Jz .get(i,j,k); }
-    double get_pXX(int i, int j, int k) const { return pXX.get(i,j,k); }
-    double get_pXY(int i, int j, int k) const { return pXY.get(i,j,k); }
-    double get_pXZ(int i, int j, int k) const { return pXZ.get(i,j,k); }
-    double get_pYY(int i, int j, int k) const { return pYY.get(i,j,k); }
-    double get_pYZ(int i, int j, int k) const { return pYZ.get(i,j,k); }
-    double get_pZZ(int i, int j, int k) const { return pZZ.get(i,j,k); }
-    // fetch accessors (write access)
-    arr3_double fetch_rho() { return rho; }
-    arr3_double fetch_Jx () { return Jx ; }
-    arr3_double fetch_Jy () { return Jy ; }
-    arr3_double fetch_Jz () { return Jz ; }
-    arr3_double fetch_Pxx() { return pXX; }
-    arr3_double fetch_Pxy() { return pXY; }
-    arr3_double fetch_Pxz() { return pXZ; }
-    arr3_double fetch_Pyy() { return pYY; }
-    arr3_double fetch_Pyz() { return pYZ; }
-    arr3_double fetch_Pzz() { return pZZ; }
-  public:
-    TenMoments(int nxn, int nyn, int nzn) :
-      nx(nxn),
-      ny(nyn),
-      nz(nzn),
-      rho (nxn, nyn, nzn),
-      Jx  (nxn, nyn, nzn),
-      Jy  (nxn, nyn, nzn),
-      Jz  (nxn, nyn, nzn),
-      pXX (nxn, nyn, nzn),
-      pXY (nxn, nyn, nzn),
-      pXZ (nxn, nyn, nzn),
-      pYY (nxn, nyn, nzn),
-      pYZ (nxn, nyn, nzn),
-      pZZ (nxn, nyn, nzn)
-    {
-    };
-    ~TenMoments(){};
-    void set_to_zero();
-};
-#endif // TENMOMENTS
-
 #endif
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index d88692ed..0ee817c9 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -177,15 +177,9 @@ void c_Solver::CalculateMoments() {
 
   EMf->updateInfoFields(grid,vct,col);
   EMf->setZeroDensities();                  // set to zero the densities
-
-  for (int i = 0; i < ns; i++)
-  {
-    // interpolate particles to grid nodes
-    EMf->sumMoments(part[i], grid, vct);
-    //part[i].interpP2G(EMf, grid, vct); // the old, slow way.
-  }
-
-  EMf->sumOverSpecies(vct);                 // sum all over the species
+  EMf->sumMoments(part, grid, vct);
+  //EMf->sumMomentsOld(part, grid, vct);
+  //EMf->sumOverSpecies(vct);                 // sum all over the species
 
   // Fill with constant charge the planet
   if (col->getCase()=="Dipole") {

From aa96d727d90ddc3513002f6b1bafa4c897162889 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 14 Oct 2013 15:12:50 +0200
Subject: [PATCH 059/118] fix to errors in previous commit

---
 fields/EMfields3D.cpp | 258 +++++-------------------------------------
 main/iPic3Dlib.cpp    |  10 +-
 2 files changed, 38 insertions(+), 230 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index f081bb54..a652cda0 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -222,19 +222,6 @@ void EMfields3D::sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, Virtual
   //
   const int is = pcls.get_ns();
 
-  #ifdef TENMOMENTS
-  double* rhons1d = &rhons[is][0][0][0];
-  double* Jxs1d   = &Jxs  [is][0][0][0];
-  double* Jys1d   = &Jys  [is][0][0][0];
-  double* Jzs1d   = &Jzs  [is][0][0][0];
-  double* pXXsn1d = &pXXsn[is][0][0][0];
-  double* pXYsn1d = &pXYsn[is][0][0][0];
-  double* pXZsn1d = &pXZsn[is][0][0][0];
-  double* pYYsn1d = &pYYsn[is][0][0][0];
-  double* pYZsn1d = &pYZsn[is][0][0][0];
-  double* pZZsn1d = &pZZsn[is][0][0][0];
-  #endif
-  //
   const int nop = pcls.getNOP();
   // To make memory use scale to a large number of threads, we
   // could first apply an efficient parallel sorting algorithm
@@ -245,20 +232,6 @@ void EMfields3D::sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, Virtual
   {
     int thread_num = omp_get_thread_num();
     if(!thread_num) { timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION); }
-    #ifdef TENMOMENTS
-    TenMoments& speciesMoments = fetch_momentsArray(thread_num);
-    speciesMoments.set_to_zero();
-    arr3_double rho = speciesMoments.fetch_rho();
-    arr3_double Jx  = speciesMoments.fetch_Jx();
-    arr3_double Jy  = speciesMoments.fetch_Jy();
-    arr3_double Jz  = speciesMoments.fetch_Jz();
-    arr3_double Pxx = speciesMoments.fetch_Pxx();
-    arr3_double Pxy = speciesMoments.fetch_Pxy();
-    arr3_double Pxz = speciesMoments.fetch_Pxz();
-    arr3_double Pyy = speciesMoments.fetch_Pyy();
-    arr3_double Pyz = speciesMoments.fetch_Pyz();
-    arr3_double Pzz = speciesMoments.fetch_Pzz();
-    #endif // TENMOMENTS
     Moments10& speciesMoments10 = fetch_moments10Array(thread_num);
     speciesMoments10.set_to_zero();
     arr4_double moments = speciesMoments10.fetch_arr();
@@ -297,12 +270,6 @@ void EMfields3D::sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, Virtual
       const int ix = 2 + int (floor((x[i] - xstart) * inv_dx));
       const int iy = 2 + int (floor((y[i] - ystart) * inv_dy));
       const int iz = 2 + int (floor((z[i] - zstart) * inv_dz));
-      //const double xi0   = x[i] - grid->getXN(ix - 1, iy, iz);
-      //const double eta0  = y[i] - grid->getYN(ix, iy - 1, iz);
-      //const double zeta0 = z[i] - grid->getZN(ix, iy, iz - 1);
-      //const double xi1   = grid->getXN(ix, iy, iz) - x[i];
-      //const double eta1  = grid->getYN(ix, iy, iz) - y[i];
-      //const double zeta1 = grid->getZN(ix, iy, iz) - z[i];
       const double xi0   = x[i] - grid->getXN(ix-1);
       const double eta0  = y[i] - grid->getYN(iy-1);
       const double zeta0 = z[i] - grid->getZN(iz-1);
@@ -437,212 +404,42 @@ void EMfields3D::sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, Virtual
         //}
       }
 
-      #ifdef TENMOMENTS
-      {
-        // use the weight to distribute the moments
-        //
-        // add charge density
-        //speciesMoments.addRho(weight, ix, iy, iz);
-        rho[ix  ][iy  ][iz  ] += weight000;
-        rho[ix  ][iy  ][iz-1] += weight001;
-        rho[ix  ][iy-1][iz  ] += weight010;
-        rho[ix  ][iy-1][iz-1] += weight011;
-        rho[ix-1][iy  ][iz  ] += weight100;
-        rho[ix-1][iy  ][iz-1] += weight101;
-        rho[ix-1][iy-1][iz  ] += weight110;
-        rho[ix-1][iy-1][iz-1] += weight111;
-        // add current density - X
-        //speciesMoments.addJx(temp, ix, iy, iz);
-        Jx[ix  ][iy  ][iz  ] += ui*weight000;
-        Jx[ix  ][iy  ][iz-1] += ui*weight001;
-        Jx[ix  ][iy-1][iz  ] += ui*weight010;
-        Jx[ix  ][iy-1][iz-1] += ui*weight011;
-        Jx[ix-1][iy  ][iz  ] += ui*weight100;
-        Jx[ix-1][iy  ][iz-1] += ui*weight101;
-        Jx[ix-1][iy-1][iz  ] += ui*weight110;
-        Jx[ix-1][iy-1][iz-1] += ui*weight111;
-        // add current density - Y
-        //speciesMoments.addJy(temp, ix, iy, iz);
-        Jy[ix  ][iy  ][iz  ] += vi*weight000;
-        Jy[ix  ][iy  ][iz-1] += vi*weight001;
-        Jy[ix  ][iy-1][iz  ] += vi*weight010;
-        Jy[ix  ][iy-1][iz-1] += vi*weight011;
-        Jy[ix-1][iy  ][iz  ] += vi*weight100;
-        Jy[ix-1][iy  ][iz-1] += vi*weight101;
-        Jy[ix-1][iy-1][iz  ] += vi*weight110;
-        Jy[ix-1][iy-1][iz-1] += vi*weight111;
-        // add current density - Z
-        //speciesMoments.addJz(temp, ix, iy, iz);
-        Jz[ix  ][iy  ][iz  ] += wi*weight000;
-        Jz[ix  ][iy  ][iz-1] += wi*weight001;
-        Jz[ix  ][iy-1][iz  ] += wi*weight010;
-        Jz[ix  ][iy-1][iz-1] += wi*weight011;
-        Jz[ix-1][iy  ][iz  ] += wi*weight100;
-        Jz[ix-1][iy  ][iz-1] += wi*weight101;
-        Jz[ix-1][iy-1][iz  ] += wi*weight110;
-        Jz[ix-1][iy-1][iz-1] += wi*weight111;
-        // Pxx - add pressure tensor
-        //speciesMoments.addPxx(temp, ix, iy, iz);
-        Pxx[ix  ][iy  ][iz  ] += uui*weight000;
-        Pxx[ix  ][iy  ][iz-1] += uui*weight001;
-        Pxx[ix  ][iy-1][iz  ] += uui*weight010;
-        Pxx[ix  ][iy-1][iz-1] += uui*weight011;
-        Pxx[ix-1][iy  ][iz  ] += uui*weight100;
-        Pxx[ix-1][iy  ][iz-1] += uui*weight101;
-        Pxx[ix-1][iy-1][iz  ] += uui*weight110;
-        Pxx[ix-1][iy-1][iz-1] += uui*weight111;
-        // Pxy - add pressure tensor
-        //speciesMoments.addPxy(temp, ix, iy, iz);
-        Pxy[ix  ][iy  ][iz  ] += uvi*weight000;
-        Pxy[ix  ][iy  ][iz-1] += uvi*weight001;
-        Pxy[ix  ][iy-1][iz  ] += uvi*weight010;
-        Pxy[ix  ][iy-1][iz-1] += uvi*weight011;
-        Pxy[ix-1][iy  ][iz  ] += uvi*weight100;
-        Pxy[ix-1][iy  ][iz-1] += uvi*weight101;
-        Pxy[ix-1][iy-1][iz  ] += uvi*weight110;
-        Pxy[ix-1][iy-1][iz-1] += uvi*weight111;
-        // Pxz - add pressure tensor
-        //speciesMoments.addPxz(temp, ix, iy, iz);
-        Pxz[ix  ][iy  ][iz  ] += uwi*weight000;
-        Pxz[ix  ][iy  ][iz-1] += uwi*weight001;
-        Pxz[ix  ][iy-1][iz  ] += uwi*weight010;
-        Pxz[ix  ][iy-1][iz-1] += uwi*weight011;
-        Pxz[ix-1][iy  ][iz  ] += uwi*weight100;
-        Pxz[ix-1][iy  ][iz-1] += uwi*weight101;
-        Pxz[ix-1][iy-1][iz  ] += uwi*weight110;
-        Pxz[ix-1][iy-1][iz-1] += uwi*weight111;
-        // Pyy - add pressure tensor
-        //speciesMoments.addPyy(temp, ix, iy, iz);
-        Pyy[ix  ][iy  ][iz  ] += vvi*weight000;
-        Pyy[ix  ][iy  ][iz-1] += vvi*weight001;
-        Pyy[ix  ][iy-1][iz  ] += vvi*weight010;
-        Pyy[ix  ][iy-1][iz-1] += vvi*weight011;
-        Pyy[ix-1][iy  ][iz  ] += vvi*weight100;
-        Pyy[ix-1][iy  ][iz-1] += vvi*weight101;
-        Pyy[ix-1][iy-1][iz  ] += vvi*weight110;
-        Pyy[ix-1][iy-1][iz-1] += vvi*weight111;
-        // Pyz - add pressure tensor
-        //speciesMoments.addPyz(temp, ix, iy, iz);
-        Pyz[ix  ][iy  ][iz  ] += vwi*weight000;
-        Pyz[ix  ][iy  ][iz-1] += vwi*weight001;
-        Pyz[ix  ][iy-1][iz  ] += vwi*weight010;
-        Pyz[ix  ][iy-1][iz-1] += vwi*weight011;
-        Pyz[ix-1][iy  ][iz  ] += vwi*weight100;
-        Pyz[ix-1][iy  ][iz-1] += vwi*weight101;
-        Pyz[ix-1][iy-1][iz  ] += vwi*weight110;
-        Pyz[ix-1][iy-1][iz-1] += vwi*weight111;
-        // Pzz - add pressure tensor
-        //speciesMoments.addPzz(temp, ix, iy, iz);
-        Pzz[ix  ][iy  ][iz  ] += wwi*weight000;
-        Pzz[ix  ][iy  ][iz-1] += wwi*weight001;
-        Pzz[ix  ][iy-1][iz  ] += wwi*weight010;
-        Pzz[ix  ][iy-1][iz-1] += wwi*weight011;
-        Pzz[ix-1][iy  ][iz  ] += wwi*weight100;
-        Pzz[ix-1][iy  ][iz-1] += wwi*weight101;
-        Pzz[ix-1][iy-1][iz  ] += wwi*weight110;
-        Pzz[ix-1][iy-1][iz-1] += wwi*weight111;
-      }
-      #endif // TENMOMENTS
-
-      #ifdef TENMOMENTS
-      {
-        // check work
-        for(int jx=0;jx<2;jx++)
-        for(int jy=0;jy<2;jy++)
-        for(int jz=0;jz<2;jz++)
-        {
-          assert_eq(rho[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][0]);
-          assert_eq(Jx [ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][1]);
-          assert_eq(Jy [ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][2]);
-          assert_eq(Jz [ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][3]);
-          assert_eq(Pxx[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][4]);
-          assert_eq(Pxy[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][5]);
-          assert_eq(Pxz[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][6]);
-          assert_eq(Pyy[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][7]);
-          assert_eq(Pyz[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][8]);
-          assert_eq(Pzz[ix-jx][iy-jy][iz-jz], moments[ix-jx][iy-jy][iz-jz][9]);
-        }
-      }
-      #endif // TENMOMENTS
     }
     if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION);
 
     // reduction
     if(!thread_num) timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION);
 
-    // split up the reduction tasks.
-    //
-    //{
-    //  //
-    //  // One-dimensional array access is presumably
-    //  // more efficient on poor compilers.
-    //  double* rho1d = &rho[0][0][0];
-    //  double* Jx1d  = &Jx [0][0][0];
-    //  double* Jy1d  = &Jy [0][0][0];
-    //  double* Jz1d  = &Jz [0][0][0];
-    //  double* Pxx1d = &Pxx[0][0][0];
-    //  double* Pxy1d = &Pxy[0][0][0];
-    //  double* Pxz1d = &Pxz[0][0][0];
-    //  double* Pyy1d = &Pyy[0][0][0];
-    //  double* Pyz1d = &Pyz[0][0][0];
-    //  double* Pzz1d = &Pzz[0][0][0];
-    //  ////
-    //  assert_eq(speciesMoments.get_nx(), nxn);
-    //  assert_eq(speciesMoments.get_ny(), nyn);
-    //  assert_eq(speciesMoments.get_nz(), nzn);
-    //  const int numel = nxn*nyn*nzn;
-    //  #pragma omp critical
-    //  for(int i=0;i<numel;i++) rhons1d[i] += invVOL*rho1d[i];
-    //  #pragma omp critical
-    //  for(int i=0;i<numel;i++) Jxs1d  [i] += invVOL*Jx1d [i];
-    //  #pragma omp critical
-    //  for(int i=0;i<numel;i++) Jys1d  [i] += invVOL*Jy1d [i];
-    //  #pragma omp critical
-    //  for(int i=0;i<numel;i++) Jzs1d  [i] += invVOL*Jz1d [i];
-    //  #pragma omp critical
-    //  for(int i=0;i<numel;i++) pXXsn1d[i] += invVOL*Pxx1d[i];
-    //  #pragma omp critical
-    //  for(int i=0;i<numel;i++) pXYsn1d[i] += invVOL*Pxy1d[i];
-    //  #pragma omp critical
-    //  for(int i=0;i<numel;i++) pXZsn1d[i] += invVOL*Pxz1d[i];
-    //  #pragma omp critical
-    //  for(int i=0;i<numel;i++) pYYsn1d[i] += invVOL*Pyy1d[i];
-    //  #pragma omp critical
-    //  for(int i=0;i<numel;i++) pYZsn1d[i] += invVOL*Pyz1d[i];
-    //  #pragma omp critical
-    //  for(int i=0;i<numel;i++) pZZsn1d[i] += invVOL*Pzz1d[i];
-    //}
-
     // reduce arrays
     {
-      #pragma omp critical
+      #pragma omp critical (reduceMoment0)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { rhons[is][i][j][k] += invVOL*moments[i][j][k][0]; }}
-      #pragma omp critical
+      #pragma omp critical (reduceMoment1)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1]; }}
-      #pragma omp critical
+      #pragma omp critical (reduceMoment2)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { Jys  [is][i][j][k] += invVOL*moments[i][j][k][2]; }}
-      #pragma omp critical
+      #pragma omp critical (reduceMoment3)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3]; }}
-      #pragma omp critical
+      #pragma omp critical (reduceMoment4)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4]; }}
-      #pragma omp critical
+      #pragma omp critical (reduceMoment5)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5]; }}
-      #pragma omp critical
+      #pragma omp critical (reduceMoment6)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6]; }}
-      #pragma omp critical
+      #pragma omp critical (reduceMoment7)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7]; }}
-      #pragma omp critical
+      #pragma omp critical (reduceMoment8)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8]; }}
-      #pragma omp critical
+      #pragma omp critical (reduceMoment9)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }}
     }
@@ -672,6 +469,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
   {
     const Particles3Dcomm& pcls = part[i];
     const int is = pcls.get_ns();
+    assert_eq(i,is);
 
     double const*const x = pcls.getXall();
     double const*const y = pcls.getYall();
@@ -746,7 +544,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
       weights[4] = weight100;
       weights[5] = weight101;
       weights[6] = weight110;
-      weights[0] = weight111;
+      weights[7] = weight111;
 
       // add particle to moments
       {
@@ -770,11 +568,13 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
         momentsArray[7] = moments111;
 
         double buffer[10][8];
-        // #pragma simd
+        //#pragma simd
         for(int m=0;m<10;m++)
-        for(int c=0;c<8;c++)
         {
-          buffer[m][c] = velmoments[c]*weights[m];
+          for(int c=0;c<8;c++)
+          {
+            buffer[m][c] = velmoments[m]*weights[c];
+          }
         }
         for(int c=0;c<8;c++)
         for(int m=0;m<10;m++)
@@ -790,34 +590,34 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
 
     // reduce arrays
     {
-      #pragma omp critical (0)
+      #pragma omp critical (reduceMoment0)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { rhons[is][i][j][k] += invVOL*moments[i][j][k][0]; }}
-      #pragma omp critical (1)
+      #pragma omp critical (reduceMoment1)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1]; }}
-      #pragma omp critical (2)
+      #pragma omp critical (reduceMoment2)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { Jys  [is][i][j][k] += invVOL*moments[i][j][k][2]; }}
-      #pragma omp critical (3)
+      #pragma omp critical (reduceMoment3)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3]; }}
-      #pragma omp critical (4)
+      #pragma omp critical (reduceMoment4)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4]; }}
-      #pragma omp critical (5)
+      #pragma omp critical (reduceMoment5)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5]; }}
-      #pragma omp critical (6)
+      #pragma omp critical (reduceMoment6)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6]; }}
-      #pragma omp critical (7)
+      #pragma omp critical (reduceMoment7)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7]; }}
-      #pragma omp critical (8)
+      #pragma omp critical (reduceMoment8)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8]; }}
-      #pragma omp critical (9)
+      #pragma omp critical (reduceMoment9)
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }}
     }
@@ -826,6 +626,10 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
     // when we change to use asynchronous communication.
     // communicateGhostP2G(is, 0, 0, 0, 0, vct);
   }
+  for (int i = 0; i < ns; i++)
+  {
+    communicateGhostP2G(i, 0, 0, 0, 0, vct);
+  }
 }
 
 /*! Calculate Electric field with the implicit solver: the Maxwell solver method is called here */
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 0ee817c9..449c56c2 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -177,9 +177,13 @@ void c_Solver::CalculateMoments() {
 
   EMf->updateInfoFields(grid,vct,col);
   EMf->setZeroDensities();                  // set to zero the densities
-  EMf->sumMoments(part, grid, vct);
-  //EMf->sumMomentsOld(part, grid, vct);
-  //EMf->sumOverSpecies(vct);                 // sum all over the species
+
+  //EMf->sumMoments(part, grid, vct);
+  for (int i = 0; i < ns; i++)
+  {
+    EMf->sumMomentsOld(part[i], grid, vct);
+  }
+  EMf->sumOverSpecies(vct);                 // sum all over the species
 
   // Fill with constant charge the planet
   if (col->getCase()=="Dipole") {

From c950a84e53b1e90ad92ead0f2c203aeee00546a1 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 14 Oct 2013 15:22:32 +0200
Subject: [PATCH 060/118] issue #55: Sum moments for all species in one OpenMP
 parallel clause (initial commit was e86bc65d6e4)

---
 fields/EMfields3D.cpp | 185 +++++++++---------------------------------
 include/arraysfwd.h   |   2 +
 main/iPic3Dlib.cpp    |  10 +--
 3 files changed, 47 insertions(+), 150 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index a652cda0..b130ef2d 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -285,125 +285,34 @@ void EMfields3D::sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, Virtual
       const double weight101 = qi * xi1 * eta0 * zeta1 * invVOL;
       const double weight110 = qi * xi1 * eta1 * zeta0 * invVOL;
       const double weight111 = qi * xi1 * eta1 * zeta1 * invVOL;
+      double weights[8];
+      weights[0] = weight000;
+      weights[1] = weight001;
+      weights[2] = weight010;
+      weights[3] = weight011;
+      weights[4] = weight100;
+      weights[5] = weight101;
+      weights[6] = weight110;
+      weights[7] = weight111;
 
       // add particle to moments
       {
-        arr1_double_fetch moments000 = moments[ix  ][iy  ][iz  ];
-        arr1_double_fetch moments001 = moments[ix  ][iy  ][iz-1];
-        arr1_double_fetch moments010 = moments[ix  ][iy-1][iz  ];
-        arr1_double_fetch moments011 = moments[ix  ][iy-1][iz-1];
-        arr1_double_fetch moments100 = moments[ix-1][iy  ][iz  ];
-        arr1_double_fetch moments101 = moments[ix-1][iy  ][iz-1];
-        arr1_double_fetch moments110 = moments[ix-1][iy-1][iz  ];
-        arr1_double_fetch moments111 = moments[ix-1][iy-1][iz-1];
-
-        moments000[0] += velmoments[0]*weight000;
-        moments000[1] += velmoments[1]*weight000;
-        moments000[2] += velmoments[2]*weight000;
-        moments000[3] += velmoments[3]*weight000;
-        moments000[4] += velmoments[4]*weight000;
-        moments000[5] += velmoments[5]*weight000;
-        moments000[6] += velmoments[6]*weight000;
-        moments000[7] += velmoments[7]*weight000;
-        moments000[8] += velmoments[8]*weight000;
-        moments000[9] += velmoments[9]*weight000;
-
-        moments001[0] += velmoments[0]*weight001;
-        moments001[1] += velmoments[1]*weight001;
-        moments001[2] += velmoments[2]*weight001;
-        moments001[3] += velmoments[3]*weight001;
-        moments001[4] += velmoments[4]*weight001;
-        moments001[5] += velmoments[5]*weight001;
-        moments001[6] += velmoments[6]*weight001;
-        moments001[7] += velmoments[7]*weight001;
-        moments001[8] += velmoments[8]*weight001;
-        moments001[9] += velmoments[9]*weight001;
-
-        moments010[0] += velmoments[0]*weight010;
-        moments010[1] += velmoments[1]*weight010;
-        moments010[2] += velmoments[2]*weight010;
-        moments010[3] += velmoments[3]*weight010;
-        moments010[4] += velmoments[4]*weight010;
-        moments010[5] += velmoments[5]*weight010;
-        moments010[6] += velmoments[6]*weight010;
-        moments010[7] += velmoments[7]*weight010;
-        moments010[8] += velmoments[8]*weight010;
-        moments010[9] += velmoments[9]*weight010;
-
-        moments011[0] += velmoments[0]*weight011;
-        moments011[1] += velmoments[1]*weight011;
-        moments011[2] += velmoments[2]*weight011;
-        moments011[3] += velmoments[3]*weight011;
-        moments011[4] += velmoments[4]*weight011;
-        moments011[5] += velmoments[5]*weight011;
-        moments011[6] += velmoments[6]*weight011;
-        moments011[7] += velmoments[7]*weight011;
-        moments011[8] += velmoments[8]*weight011;
-        moments011[9] += velmoments[9]*weight011;
-
-        moments100[0] += velmoments[0]*weight100;
-        moments100[1] += velmoments[1]*weight100;
-        moments100[2] += velmoments[2]*weight100;
-        moments100[3] += velmoments[3]*weight100;
-        moments100[4] += velmoments[4]*weight100;
-        moments100[5] += velmoments[5]*weight100;
-        moments100[6] += velmoments[6]*weight100;
-        moments100[7] += velmoments[7]*weight100;
-        moments100[8] += velmoments[8]*weight100;
-        moments100[9] += velmoments[9]*weight100;
-
-        moments101[0] += velmoments[0]*weight101;
-        moments101[1] += velmoments[1]*weight101;
-        moments101[2] += velmoments[2]*weight101;
-        moments101[3] += velmoments[3]*weight101;
-        moments101[4] += velmoments[4]*weight101;
-        moments101[5] += velmoments[5]*weight101;
-        moments101[6] += velmoments[6]*weight101;
-        moments101[7] += velmoments[7]*weight101;
-        moments101[8] += velmoments[8]*weight101;
-        moments101[9] += velmoments[9]*weight101;
-
-        moments110[0] += velmoments[0]*weight110;
-        moments110[1] += velmoments[1]*weight110;
-        moments110[2] += velmoments[2]*weight110;
-        moments110[3] += velmoments[3]*weight110;
-        moments110[4] += velmoments[4]*weight110;
-        moments110[5] += velmoments[5]*weight110;
-        moments110[6] += velmoments[6]*weight110;
-        moments110[7] += velmoments[7]*weight110;
-        moments110[8] += velmoments[8]*weight110;
-        moments110[9] += velmoments[9]*weight110;
-
-        moments111[0] += velmoments[0]*weight111;
-        moments111[1] += velmoments[1]*weight111;
-        moments111[2] += velmoments[2]*weight111;
-        moments111[3] += velmoments[3]*weight111;
-        moments111[4] += velmoments[4]*weight111;
-        moments111[5] += velmoments[5]*weight111;
-        moments111[6] += velmoments[6]*weight111;
-        moments111[7] += velmoments[7]*weight111;
-        moments111[8] += velmoments[8]*weight111;
-        moments111[9] += velmoments[9]*weight111;
-
-        //double weight[2][2][2];
-        //weight[0][0][0]=weight000;
-        //weight[0][0][1]=weight001;
-        //weight[0][1][0]=weight010;
-        //weight[0][1][1]=weight011;
-        //weight[1][0][0]=weight100;
-        //weight[1][0][1]=weight101;
-        //weight[1][1][0]=weight110;
-        //weight[1][1][1]=weight111;
-        ////
-        //for(int jx=0;jx<2;jx++)
-        //for(int jy=0;jy<2;jy++)
-        //for(int jz=0;jz<2;jz++)
-        //for(int m=0;m<10;m++)
-        //{
-        //  moments[ix-jx][iy-jy][iz-jz][m] += velmoments[m]*weight[jx][jy][jz];
-        //}
+        arr1_double_fetch momentsArray[8];
+        momentsArray[0] = moments[ix  ][iy  ][iz  ]; // moments000 
+        momentsArray[1] = moments[ix  ][iy  ][iz-1]; // moments001 
+        momentsArray[2] = moments[ix  ][iy-1][iz  ]; // moments010 
+        momentsArray[3] = moments[ix  ][iy-1][iz-1]; // moments011 
+        momentsArray[4] = moments[ix-1][iy  ][iz  ]; // moments100 
+        momentsArray[5] = moments[ix-1][iy  ][iz-1]; // moments101 
+        momentsArray[6] = moments[ix-1][iy-1][iz  ]; // moments110 
+        momentsArray[7] = moments[ix-1][iy-1][iz-1]; // moments111 
+
+        for(int m=0; m<10; m++)
+        for(int c=0; c<8; c++)
+        {
+          momentsArray[c][m] += velmoments[m]*weights[c];
+        }
       }
-
     }
     if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION);
 
@@ -548,38 +457,24 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
 
       // add particle to moments
       {
-        arr1_double_fetch moments000 = moments[ix  ][iy  ][iz  ];
-        arr1_double_fetch moments001 = moments[ix  ][iy  ][iz-1];
-        arr1_double_fetch moments010 = moments[ix  ][iy-1][iz  ];
-        arr1_double_fetch moments011 = moments[ix  ][iy-1][iz-1];
-        arr1_double_fetch moments100 = moments[ix-1][iy  ][iz  ];
-        arr1_double_fetch moments101 = moments[ix-1][iy  ][iz-1];
-        arr1_double_fetch moments110 = moments[ix-1][iy-1][iz  ];
-        arr1_double_fetch moments111 = moments[ix-1][iy-1][iz-1];
-
         arr1_double_fetch momentsArray[8];
-        momentsArray[0] = moments000;
-        momentsArray[1] = moments001;
-        momentsArray[2] = moments010;
-        momentsArray[3] = moments011;
-        momentsArray[4] = moments100;
-        momentsArray[5] = moments101;
-        momentsArray[6] = moments110;
-        momentsArray[7] = moments111;
-
-        double buffer[10][8];
-        //#pragma simd
-        for(int m=0;m<10;m++)
-        {
-          for(int c=0;c<8;c++)
-          {
-            buffer[m][c] = velmoments[m]*weights[c];
-          }
-        }
-        for(int c=0;c<8;c++)
-        for(int m=0;m<10;m++)
+        arr2_double_fetch moments00 = moments[ix  ][iy  ];
+        arr2_double_fetch moments01 = moments[ix  ][iy-1];
+        arr2_double_fetch moments10 = moments[ix-1][iy  ];
+        arr2_double_fetch moments11 = moments[ix-1][iy-1];
+        momentsArray[0] = moments00[iz  ]; // moments000 
+        momentsArray[1] = moments00[iz-1]; // moments001 
+        momentsArray[2] = moments01[iz  ]; // moments010 
+        momentsArray[3] = moments01[iz-1]; // moments011 
+        momentsArray[4] = moments10[iz  ]; // moments100 
+        momentsArray[5] = moments10[iz-1]; // moments101 
+        momentsArray[6] = moments11[iz  ]; // moments110 
+        momentsArray[7] = moments11[iz-1]; // moments111 
+
+        for(int m=0; m<10; m++)
+        for(int c=0; c<8; c++)
         {
-          momentsArray[c][m] = buffer[m][c];
+          momentsArray[c][m] += velmoments[m]*weights[c];
         }
       }
     }
diff --git a/include/arraysfwd.h b/include/arraysfwd.h
index 30bda425..889c950d 100644
--- a/include/arraysfwd.h
+++ b/include/arraysfwd.h
@@ -57,10 +57,12 @@ typedef iPic3D::array4<pfloat> array4_pfloat;
 typedef iPic3D::array_fetch1<double> arr1_double_fetch;
 typedef iPic3D::array_get1<double> arr1_double_get;
 typedef iPic3D::array_get1<pfloat> arr1_pfloat_get;
+typedef iPic3D::array_fetch2<double> arr2_double_fetch;
 #else
 typedef double* arr1_double_fetch;
 typedef double* arr1_double_get;
 typedef pfloat* arr1_pfloat_get;
+typedef double** arr2_double_fetch;
 #endif
 
 #endif
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 449c56c2..5faaf366 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -178,11 +178,11 @@ void c_Solver::CalculateMoments() {
   EMf->updateInfoFields(grid,vct,col);
   EMf->setZeroDensities();                  // set to zero the densities
 
-  //EMf->sumMoments(part, grid, vct);
-  for (int i = 0; i < ns; i++)
-  {
-    EMf->sumMomentsOld(part[i], grid, vct);
-  }
+  EMf->sumMoments(part, grid, vct);
+  //for (int i = 0; i < ns; i++)
+  //{
+  //  EMf->sumMomentsOld(part[i], grid, vct);
+  //}
   EMf->sumOverSpecies(vct);                 // sum all over the species
 
   // Fill with constant charge the planet

From 3837df216981f32893e506811290ac121c09e9a6 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 14 Oct 2013 16:51:28 +0200
Subject: [PATCH 061/118] create OpenMP threads only once to push particles

---
 include/Particles3D.h     |  4 +++-
 main/iPic3Dlib.cpp        |  7 ++++++-
 particles/Particles3D.cpp | 37 ++++++++-----------------------------
 3 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/include/Particles3D.h b/include/Particles3D.h
index fd89edd0..74cfbf37 100644
--- a/include/Particles3D.h
+++ b/include/Particles3D.h
@@ -57,7 +57,9 @@ class Particles3D:public Particles3Dcomm {
     /** mover with the esplicit non relativistic scheme */
     void mover_explicit(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** mover with a Predictor-Corrector Scheme */
-    int mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf);
+    void mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf);
+    /** communicate particle after moving them */
+    int communicate_particles(VirtualTopology3D * vct);
     /** relativistic mover with a Predictor-Corrector scheme */
     int mover_relativistic(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** particle repopulator */
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 5faaf366..77bd3120 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -222,12 +222,17 @@ bool c_Solver::ParticlesMover() {
     timeTasks_set_main_task(TimeTasks::PARTICLES);
     // Should change this to add background field
     EMf->set_fieldForPcls();
+    #pragma omp parallel
     for (int i = 0; i < ns; i++)  // move each species
     {
       // #pragma omp task inout(part[i]) in(grid) target_device(booster)
       //
       // should merely pass EMf->get_fieldForPcls() rather than EMf.
-      mem_avail = part[i].mover_PC(grid, vct, EMf); // use the Predictor Corrector scheme 
+      part[i].mover_PC(grid, vct, EMf); // use the Predictor Corrector scheme 
+    }
+    for (int i = 0; i < ns; i++)  // move each species
+    {
+      mem_avail = part[i].communicate_particles(vct);
     }
   }
 
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 50550ee6..7edbf634 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -311,41 +311,19 @@ void Particles3D::mover_explicit(Grid * grid, VirtualTopology3D * vct, Field * E
 
 }
 /** mover with a Predictor-Corrector scheme */
-int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
+void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
+  #pragma omp master
   if (vct->getCartesian_rank() == 0) {
     cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
   }
-  double start_mover_PC = MPI_Wtime();
-  #if 0
-  const_arr3_double Ex = EMf->getEx();
-  const_arr3_double Ey = EMf->getEy();
-  const_arr3_double Ez = EMf->getEz();
-  const_arr3_double Bx = EMf->getBx();
-  const_arr3_double By = EMf->getBy();
-  const_arr3_double Bz = EMf->getBz();
-  #endif
   const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
 
-  #if 0
-  for(int i=0;i<nxn;i++)
-  for(int j=0;j<nyn;j++)
-  for(int k=0;k<nzn;k++)
-  {
-    assert_eq(fieldForPcls[i][j][k][0], (pfloat) Bx[i][j][k]);
-    assert_eq(fieldForPcls[i][j][k][1], (pfloat) By[i][j][k]);
-    assert_eq(fieldForPcls[i][j][k][2], (pfloat) Bz[i][j][k]);
-    assert_eq(fieldForPcls[i][j][k][3], (pfloat) Ex[i][j][k]);
-    assert_eq(fieldForPcls[i][j][k][4], (pfloat) Ey[i][j][k]);
-    assert_eq(fieldForPcls[i][j][k][5], (pfloat) Ez[i][j][k]);
-  }
-  #endif
-
   const pfloat dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
   const pfloat inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
   // don't bother trying to push any particles simultaneously;
   // MIC already does vectorization automatically, and trying
   // to do it by hand only hurts performance.
-  #pragma omp parallel for
+  #pragma omp for
   // why does single precision make no difference in execution speed?
   //#pragma simd vectorlength(VECTOR_WIDTH)
   for (int rest = 0; rest < nop; rest++) {
@@ -643,10 +621,11 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
     v[rest] = vp;
     w[rest] = wp;
   }                             // END OF ALL THE PARTICLES
+}
 
-  // ********************//
-  // COMMUNICATION 
-  // *******************//
+/** communicate particle after moving them */
+int Particles3D::communicate_particles(VirtualTopology3D * vct)
+{
   timeTasks_set_communicating(); // communicating until end of scope
   const int avail = communicate(vct);
   if (avail < 0)
@@ -660,7 +639,7 @@ int Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       return (-1);
     MPI_Barrier(MPI_COMM_WORLD);
   }
-  return (0);                   // exit succcesfully (hopefully) 
+  return 0; // exit successfully
 }
 
 /** relativistic mover with a Predictor-Corrector scheme */

From 517c03a69a688fba82ead1ea7aec1856c8ce1630 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 15 Oct 2013 07:06:14 +0200
Subject: [PATCH 062/118] issue #56: mover_PC(): iterate NiterMover times

---
 particles/Particles3D.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 7edbf634..bef9b682 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -341,7 +341,7 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
     pfloat vptilde;
     pfloat wptilde;
     // calculate the average velocity iteratively
-    for (int innter = 0; innter < 1; innter++) {
+    for (int innter = 0; innter < NiterMover; innter++) {
       // interpolation G-->P
       const pfloat ixd = floor((xp - xstart) * inv_dx);
       const pfloat iyd = floor((yp - ystart) * inv_dy);

From d51cb740ad30adc41b5b8467c26cc54d602a43b8 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 15 Oct 2013 07:25:08 +0200
Subject: [PATCH 063/118] issue #54: TimeTasks now averaged across threads

---
 fields/EMfields3D.cpp | 18 ++++++----
 include/TimeTasks.h   | 24 ++++---------
 include/ompdefs.h     | 15 ++------
 utility/TimeTasks.cpp | 81 ++++++++++++++++++++++++++++++++-----------
 4 files changed, 81 insertions(+), 57 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index b130ef2d..37104107 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -192,7 +192,7 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) :
   injFieldsFront  = new injInfoFields(nxn, nyn, nzn);
   injFieldsRear   = new injInfoFields(nxn, nyn, nzn);
 
-  sizeMomentsArray = omp_thread_count();
+  sizeMomentsArray = omp_get_max_threads();
   moments10Array = new Moments10*[sizeMomentsArray];
   for(int i=0;i<sizeMomentsArray;i++)
   {
@@ -228,10 +228,11 @@ void EMfields3D::sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, Virtual
   // to the particles and then accumulate moments in smaller
   // subarrays.
   //#ifdef _OPENMP
-  #pragma omp parallel
+  TimeTasks timeTasksAcc;
+  #pragma omp parallel private(timeTasks)
   {
     int thread_num = omp_get_thread_num();
-    if(!thread_num) { timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION); }
+    timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION);
     Moments10& speciesMoments10 = fetch_moments10Array(thread_num);
     speciesMoments10.set_to_zero();
     arr4_double moments = speciesMoments10.fetch_arr();
@@ -314,10 +315,10 @@ void EMfields3D::sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, Virtual
         }
       }
     }
-    if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION);
+    timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION);
 
     // reduction
-    if(!thread_num) timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION);
+    timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION);
 
     // reduce arrays
     {
@@ -352,8 +353,13 @@ void EMfields3D::sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, Virtual
       for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
         { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }}
     }
-    if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_REDUCTION);
+    timeTasks_end_task(TimeTasks::MOMENT_REDUCTION);
+    #pragma omp critical
+    timeTasksAcc += timeTasks;
   }
+  // reset timeTasks to be its average value for all threads
+  timeTasksAcc /= omp_get_max_threads();
+  timeTasks = timeTasksAcc;
   communicateGhostP2G(is, 0, 0, 0, 0, vct);
 }
 // This was Particles3Dcomm::interpP2G()
diff --git a/include/TimeTasks.h b/include/TimeTasks.h
index 8387d042..fba54430 100644
--- a/include/TimeTasks.h
+++ b/include/TimeTasks.h
@@ -49,6 +49,12 @@ class TimeTasks
   //
   void resetCycle();
   //
+  // hack to support averaging timeTasks copies of all threads.
+  //
+  void operator+=(const TimeTasks& arg);
+  void operator/=(int num);
+  void operator=(const TimeTasks& arg);
+  //
   // provide start_time on ending call
   //
   void end_communicating(double start_time);
@@ -86,26 +92,9 @@ class TimeTasks
   double get_communicate(int arg) {
     return communicate[arg];
   }
-  double get_communicate() {
-    double total = 0.;
-    for (int i = NONE + 1; i < LAST; i++) {
-      total += communicate[i];
-    }
-    return total;
-  }
-  double get_time() {
-    double total = 0.;
-    for (int i = NONE + 1; i < LAST; i++) {
-      total += task_duration[i];
-    }
-    return total;
-  }
   double get_compute(int arg) {
     return get_time(arg) - get_communicate(arg);
   }
-  double get_compute() {
-    return get_time() - get_communicate();
-  }
   const char* get_taskname(int arg);
 
  private:
@@ -114,7 +103,6 @@ class TimeTasks
   bool communicating;
   double task_duration[NUMBER_OF_TASKS];
   double communicate[NUMBER_OF_TASKS];
-  double compute[NUMBER_OF_TASKS];
   int stack_depth[NUMBER_OF_TASKS];
   double start_times[NUMBER_OF_TASKS];
 };
diff --git a/include/ompdefs.h b/include/ompdefs.h
index 5c5fab00..2c16779f 100644
--- a/include/ompdefs.h
+++ b/include/ompdefs.h
@@ -7,19 +7,8 @@
 #ifdef _OPENMP
 #include <omp.h>
 #else
-inline int omp_get_thread_num() {
-    return 0;
-}
+inline int omp_get_thread_num() { return 0;}
+inline int omp_get_max_threads(){ return 1;}
 #endif
 
-inline int omp_thread_count() {
-    int n = 0;
-    #pragma omp parallel reduction(+:n)
-    n += 1;
-    #ifndef _OPENMP // USING_OMP
-    assert_eq(n,1);
-    #endif
-    return n;
-}
-
 #endif
diff --git a/utility/TimeTasks.cpp b/utility/TimeTasks.cpp
index 15a2c578..dd114ffc 100644
--- a/utility/TimeTasks.cpp
+++ b/utility/TimeTasks.cpp
@@ -34,7 +34,6 @@ void TimeTasks::resetCycle()
   for(int e=0;e<NUMBER_OF_TASKS;e++)
   {
     task_duration[e]=0.;
-    compute[e]=0.;
     communicate[e]=0.;
     active[e]=false;
     stack_depth[e]=0;
@@ -50,15 +49,12 @@ void TimeTasks::start_main_task(TimeTasks::Tasks taskid)
   active_task = taskid;
   assert(!active[taskid]);
   active[taskid]=true;
-  //if(!MPIdata::get_rank())
-  //dprintf("starting task %s at time %24.16e\n", get_taskname(taskid), MPI_Wtime());
 }
 void TimeTasks::start_task(TimeTasks::Tasks taskid)
 {
   assert(!is_exclusive(taskid));
   assert(!active[taskid]);
   active[taskid]=true;
-  //dprintf("starting task %s at time %24.16e\n", get_taskname(taskid), MPI_Wtime());
 }
 // have to manage the task stack explicitly
 void TimeTasks::start_task(TimeTasks::Tasks taskid, double start_time)
@@ -69,7 +65,6 @@ void TimeTasks::start_task(TimeTasks::Tasks taskid, double start_time)
     start_task(taskid);
   }
   stack_depth[taskid]++;
-  //dprintf("starting task %s at time %24.16e\n", get_taskname(taskid), start_time);
 }
 void TimeTasks::end_main_task(TimeTasks::Tasks taskid, double start_time)
 {
@@ -96,24 +91,15 @@ void TimeTasks::end_task(TimeTasks::Tasks taskid)
 }
 void TimeTasks::end_communicating(double start_time)
 {
-  //if(!active_task) return;
   assert(active_task);
   assert(communicating);
   double additional_communication_time = MPI_Wtime()-start_time;
-  //dprint(additional_communication_time);
   communicate[active_task] += additional_communication_time;
   communicating=false;
 }
 #define TIMING_PREFIX "| "
 void TimeTasks::print_cycle_times(int cycle)
 {
-  // calculate portion of time spent computing
-  //
-  for(int e=NONE+1; e<NUMBER_OF_TASKS; e++)
-  {
-    compute[e] = task_duration[e]-communicate[e];
-  }
-
   FILE* file = stdout;
   // we could report average for all processes
   if(!MPIdata::get_rank())
@@ -128,20 +114,36 @@ void TimeTasks::print_cycle_times(int cycle)
       fprintf(file, TIMING_PREFIX "%6.3f %6.3f %6.3f %s\n",
       get_time(e),
       get_compute(e),
-      get_communicate(e),
+      communicate[e],
       get_taskname(e));
     }
+
+    // report total times
+    //
+    // get total time spent on exclusive tasks
+    //
+    double total_task_duration = 0.;
+    for (int i = NONE + 1; i < LAST; i++) {
+      total_task_duration += task_duration[i];
+    }
+    // get total time that exclusive tasks spent communicating
+    //
+    double total_communicate = 0.;
+    for (int i = NONE + 1; i < LAST; i++) {
+      total_communicate += communicate[i];
+    }
+    const double total_computing_time = total_task_duration - total_communicate;
     fprintf(file, TIMING_PREFIX "%6.3f %6.3f %6.3f %s\n",
-      get_time(),
-      get_compute(),
-      get_communicate(),
+      total_task_duration,
+      total_computing_time,
+      total_communicate,
       "[total times]");
 
-    fprintf(file, TIMING_PREFIX "time  subtask\n");
+    fprintf(file, TIMING_PREFIX "time   subtask\n");
     for(int e=LAST+1; e<NUMBER_OF_TASKS; e++)
     {
       assert_eq(stack_depth[e],0);
-      fprintf(file, TIMING_PREFIX "%5.3f %s\n",
+      fprintf(file, TIMING_PREFIX "%6.3f %s\n",
       get_time(e),
       get_taskname(e));
     }
@@ -150,6 +152,45 @@ void TimeTasks::print_cycle_times(int cycle)
   }
 }
 
+// The following three methods are a hack by which the per-thread
+// copies of timeTasks are averaged.  They skip index NONE, and both
+// += and = overwrite the task-activity state with that of arg.
+void TimeTasks::operator/=(int num)
+{
+  for(int e=NONE+1;e<NUMBER_OF_TASKS;e++)
+  {
+    task_duration[e]/=num;
+    start_times[e]/=num;
+    communicate[e]/=num;
+  }
+}
+void TimeTasks::operator+=(const TimeTasks& arg)
+{
+  active_task = arg.active_task;
+  communicating = arg.communicating;
+  for(int e=NONE+1;e<NUMBER_OF_TASKS;e++)
+  {
+    active[e] = arg.active[e];
+    task_duration[e]+=arg.task_duration[e];
+    stack_depth[e] = arg.stack_depth[e];
+    start_times[e]+=arg.start_times[e];
+    communicate[e]+=arg.communicate[e];
+  }
+}
+void TimeTasks::operator=(const TimeTasks& arg)
+{
+  active_task = arg.active_task;
+  communicating = arg.communicating;
+  for(int e=NONE+1;e<NUMBER_OF_TASKS;e++)
+  {
+    active[e] = arg.active[e];
+    task_duration[e]=arg.task_duration[e];
+    stack_depth[e] = arg.stack_depth[e];
+    start_times[e]=arg.start_times[e];
+    communicate[e]=arg.communicate[e];
+  }
+}
+
 TimeTasks_caller_to_set_communication_mode_for_scope::
 TimeTasks_caller_to_set_communication_mode_for_scope()
 {

From 0fc81473c383a89bae29cb53d51c2b6aebf8bdee Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 15 Oct 2013 11:46:52 +0200
Subject: [PATCH 064/118] implemented CallFinalize input file option

---
 include/Collective.h | 3 +++
 inputfiles/GEM.inp   | 2 ++
 main/iPic3Dlib.cpp   | 5 ++++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/Collective.h b/include/Collective.h
index 415baa98..382611b6 100644
--- a/include/Collective.h
+++ b/include/Collective.h
@@ -138,6 +138,7 @@ class Collective
     int getParticlesOutputCycle()const{ return (ParticlesOutputCycle); }
     int getRestartOutputCycle()const{ return (RestartOutputCycle); }
     int getDiagnosticsOutputCycle()const{ return (DiagnosticsOutputCycle); }
+    bool getCallFinalize()const{ return (CallFinalize); }
 
     /*! Boundary condition selection for BCFace for the electric field components */
     int bcEx[6], bcEy[6], bcEz[6];
@@ -328,6 +329,8 @@ class Collective
     int RestartOutputCycle;
     /*! Output for diagnostics */
     int DiagnosticsOutputCycle;
+    /*! If false, Finalize() skips writing the final restart file (true by default) */
+    bool CallFinalize;
 };
 typedef Collective CollectiveIO;
 
diff --git a/inputfiles/GEM.inp b/inputfiles/GEM.inp
index f3eb5aab..f8602707 100644
--- a/inputfiles/GEM.inp
+++ b/inputfiles/GEM.inp
@@ -149,3 +149,5 @@ w0 = 0.00325	-0.01624
    RestartOutputCycle = 4000
 # Diagnostics cycle
    DiagnosticsOutputCycle = 1
+# 1 (true) by default
+#CallFinalize = 0
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 77bd3120..f476f01f 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -360,8 +360,11 @@ void c_Solver::WriteOutput(int cycle) {
 }
 
 void c_Solver::Finalize() {
-  if (mem_avail == 0)           // write the restart only if the simulation finished succesfully
+  if (mem_avail == 0 // write the restart only if the simulation finished successfully
+   && col->getCallFinalize())
+  {
     writeRESTART(RestartDirName, myrank, (col->getNcycles() + first_cycle) - 1, ns, mpi, vct, col, grid, EMf, part, 0);
+  }
 
   // stop profiling
   my_clock->stopTiming();

From 489b6707b0bb6be786a70eab36a6a1043602672b Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 15 Oct 2013 12:14:13 +0200
Subject: [PATCH 065/118] cleaned up mover_PC() and sumMoments() methods

---
 fields/EMfields3D.cpp      |  39 +++--
 inputoutput/Collective.cpp |   1 +
 particles/Particles3D.cpp  | 301 +++++++++----------------------------
 3 files changed, 91 insertions(+), 250 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 37104107..972361e2 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -443,23 +443,30 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
       const double eta1  = grid->getYN(iy) - y[i];
       const double zeta1 = grid->getZN(iz) - z[i];
       const double qi = q[i];
-      const double weight000 = qi * xi0 * eta0 * zeta0 * invVOL;
-      const double weight001 = qi * xi0 * eta0 * zeta1 * invVOL;
-      const double weight010 = qi * xi0 * eta1 * zeta0 * invVOL;
-      const double weight011 = qi * xi0 * eta1 * zeta1 * invVOL;
-      const double weight100 = qi * xi1 * eta0 * zeta0 * invVOL;
-      const double weight101 = qi * xi1 * eta0 * zeta1 * invVOL;
-      const double weight110 = qi * xi1 * eta1 * zeta0 * invVOL;
-      const double weight111 = qi * xi1 * eta1 * zeta1 * invVOL;
+      const double invVOLqi = invVOL*qi;
+      const double weight0 = invVOLqi * xi0;
+      const double weight1 = invVOLqi * xi1;
+      const double weight00 = weight0*eta0;
+      const double weight01 = weight0*eta1;
+      const double weight10 = weight1*eta0;
+      const double weight11 = weight1*eta1;
       double weights[8];
-      weights[0] = weight000;
-      weights[1] = weight001;
-      weights[2] = weight010;
-      weights[3] = weight011;
-      weights[4] = weight100;
-      weights[5] = weight101;
-      weights[6] = weight110;
-      weights[7] = weight111;
+      weights[0] = weight00*zeta0; // weight000
+      weights[1] = weight00*zeta1; // weight001
+      weights[2] = weight01*zeta0; // weight010
+      weights[3] = weight01*zeta1; // weight011
+      weights[4] = weight10*zeta0; // weight100
+      weights[5] = weight10*zeta1; // weight101
+      weights[6] = weight11*zeta0; // weight110
+      weights[7] = weight11*zeta1; // weight111
+      //weights[0] = xi0 * eta0 * zeta0 * qi * invVOL; // weight000
+      //weights[1] = xi0 * eta0 * zeta1 * qi * invVOL; // weight001
+      //weights[2] = xi0 * eta1 * zeta0 * qi * invVOL; // weight010
+      //weights[3] = xi0 * eta1 * zeta1 * qi * invVOL; // weight011
+      //weights[4] = xi1 * eta0 * zeta0 * qi * invVOL; // weight100
+      //weights[5] = xi1 * eta0 * zeta1 * qi * invVOL; // weight101
+      //weights[6] = xi1 * eta1 * zeta0 * qi * invVOL; // weight110
+      //weights[7] = xi1 * eta1 * zeta1 * qi * invVOL; // weight111
 
       // add particle to moments
       {
diff --git a/inputoutput/Collective.cpp b/inputoutput/Collective.cpp
index 708be195..5f0b7bbe 100644
--- a/inputoutput/Collective.cpp
+++ b/inputoutput/Collective.cpp
@@ -91,6 +91,7 @@ void Collective::ReadInput(string inputfile) {
     ParticlesOutputCycle = config.read < int >("ParticlesOutputCycle");
     RestartOutputCycle = config.read < int >("RestartOutputCycle");
     DiagnosticsOutputCycle = config.read < int >("DiagnosticsOutputCycle", FieldOutputCycle);
+    CallFinalize = config.read < bool >("CallFinalize", true);
   }
 
   if (RESTART1) {               // you are restarting
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index bef9b682..55d98962 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -320,23 +320,23 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
 
   const pfloat dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
   const pfloat inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
-  // don't bother trying to push any particles simultaneously;
-  // MIC already does vectorization automatically, and trying
-  // to do it by hand only hurts performance.
   #pragma omp for
   // why does single precision make no difference in execution speed?
   //#pragma simd vectorlength(VECTOR_WIDTH)
-  for (int rest = 0; rest < nop; rest++) {
+  for (int pidx = 0; pidx < nop; pidx++) {
     // copy the particle
-    const pfloat xptilde = x[rest];
-    const pfloat yptilde = y[rest];
-    const pfloat zptilde = z[rest];
+    const pfloat xptilde = x[pidx];
+    const pfloat yptilde = y[pidx];
+    const pfloat zptilde = z[pidx];
+    const pfloat up_orig = u[pidx];
+    const pfloat vp_orig = v[pidx];
+    const pfloat wp_orig = w[pidx];
     pfloat xp = xptilde;
     pfloat yp = yptilde;
     pfloat zp = zptilde;
-    pfloat up = u[rest];
-    pfloat vp = v[rest];
-    pfloat wp = w[rest];
+    pfloat up = up_orig;
+    pfloat vp = vp_orig;
+    pfloat wp = wp_orig;
     pfloat uptilde;
     pfloat vptilde;
     pfloat wptilde;
@@ -362,15 +362,12 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       if (iz > nzn - 1)
         iz = nzn - 1;
 
-      pfloat xi[2];
-      pfloat eta[2];
-      pfloat zeta[2];
-      xi[0]   = xp - grid->get_pfloat_XN(ix-1);
-      eta[0]  = yp - grid->get_pfloat_YN(iy-1);
-      zeta[0] = zp - grid->get_pfloat_ZN(iz-1);
-      xi[1]   = grid->get_pfloat_XN(ix) - xp;
-      eta[1]  = grid->get_pfloat_YN(iy) - yp;
-      zeta[1] = grid->get_pfloat_ZN(iz) - zp;
+      const pfloat xi0   = xp - grid->get_pfloat_XN(ix-1);
+      const pfloat eta0  = yp - grid->get_pfloat_YN(iy-1);
+      const pfloat zeta0 = zp - grid->get_pfloat_ZN(iz-1);
+      const pfloat xi1   = grid->get_pfloat_XN(ix) - xp;
+      const pfloat eta1  = grid->get_pfloat_YN(iy) - yp;
+      const pfloat zeta1 = grid->get_pfloat_ZN(iz) - zp;
 
       pfloat Exl = 0.0;
       pfloat Eyl = 0.0;
@@ -379,216 +376,52 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       pfloat Byl = 0.0;
       pfloat Bzl = 0.0;
 
-      // MIC refuses to vectorize this ...
-      // 
-      // pfloat weight[2][2][2];
-      // for (int ii = 0; ii < 2; ii++)
-      // for (int jj = 0; jj < 2; jj++)
-      // for (int kk = 0; kk < 2; kk++)
-      // weight[ii][jj][kk] = xi[ii] * eta[jj] * zeta[kk] * invVOL;
-      // for (int ii = 0; ii < 2; ii++)
-      // for (int jj = 0; jj < 2; jj++)
-      // for (int kk = 0; kk < 2; kk++) {
-      // const pfloat Exlp = weight[ii][jj][kk] * Ex.get(ix - ii, iy - jj, iz - kk);
-      // const pfloat Eylp = weight[ii][jj][kk] * Ey.get(ix - ii, iy - jj, iz - kk);
-      // const pfloat Ezlp = weight[ii][jj][kk] * Ez.get(ix - ii, iy - jj, iz - kk);
-      // const pfloat Bxlp = weight[ii][jj][kk] * Bx.get(ix - ii, iy - jj, iz - kk);
-      // const pfloat Bylp = weight[ii][jj][kk] * By.get(ix - ii, iy - jj, iz - kk);
-      // const pfloat Bzlp = weight[ii][jj][kk] * Bz.get(ix - ii, iy - jj, iz - kk);
-      // Exl += Exlp;
-      // Eyl += Eylp;
-      // Ezl += Ezlp;
-      // Bxl += Bxlp;
-      // Byl += Bylp;
-      // Bzl += Bzlp;
-      // }
-
-      // ... so we expand things out instead
-      // 
-      const pfloat weight000 = xi[0] * eta[0] * zeta[0] * invVOL;
-      const pfloat weight001 = xi[0] * eta[0] * zeta[1] * invVOL;
-      const pfloat weight010 = xi[0] * eta[1] * zeta[0] * invVOL;
-      const pfloat weight011 = xi[0] * eta[1] * zeta[1] * invVOL;
-      const pfloat weight100 = xi[1] * eta[0] * zeta[0] * invVOL;
-      const pfloat weight101 = xi[1] * eta[0] * zeta[1] * invVOL;
-      const pfloat weight110 = xi[1] * eta[1] * zeta[0] * invVOL;
-      const pfloat weight111 = xi[1] * eta[1] * zeta[1] * invVOL;
+      pfloat weights[8];
+      const pfloat weight0 = invVOL*xi0;
+      const pfloat weight1 = invVOL*xi1;
+      const pfloat weight00 = weight0*eta0;
+      const pfloat weight01 = weight0*eta1;
+      const pfloat weight10 = weight1*eta0;
+      const pfloat weight11 = weight1*eta1;
+      weights[0] = weight00*zeta0; // weight000
+      weights[1] = weight00*zeta1; // weight001
+      weights[2] = weight01*zeta0; // weight010
+      weights[3] = weight01*zeta1; // weight011
+      weights[4] = weight10*zeta0; // weight100
+      weights[5] = weight10*zeta1; // weight101
+      weights[6] = weight11*zeta0; // weight110
+      weights[7] = weight11*zeta1; // weight111
+      //weights[0] = xi0 * eta0 * zeta0 * invVOL; // weight000
+      //weights[1] = xi0 * eta0 * zeta1 * invVOL; // weight001
+      //weights[2] = xi0 * eta1 * zeta0 * invVOL; // weight010
+      //weights[3] = xi0 * eta1 * zeta1 * invVOL; // weight011
+      //weights[4] = xi1 * eta0 * zeta0 * invVOL; // weight100
+      //weights[5] = xi1 * eta0 * zeta1 * invVOL; // weight101
+      //weights[6] = xi1 * eta1 * zeta0 * invVOL; // weight110
+      //weights[7] = xi1 * eta1 * zeta1 * invVOL; // weight111
 
       // creating these aliases seems to accelerate this method by about 30%
       // on the Xeon host, processor, suggesting deficiency in the optimizer.
       //
-      arr1_pfloat_get field000 = fieldForPcls[ix  ][iy  ][iz  ];
-      arr1_pfloat_get field001 = fieldForPcls[ix  ][iy  ][iz-1];
-      arr1_pfloat_get field010 = fieldForPcls[ix  ][iy-1][iz  ];
-      arr1_pfloat_get field011 = fieldForPcls[ix  ][iy-1][iz-1];
-      arr1_pfloat_get field100 = fieldForPcls[ix-1][iy  ][iz  ];
-      arr1_pfloat_get field101 = fieldForPcls[ix-1][iy  ][iz-1];
-      arr1_pfloat_get field110 = fieldForPcls[ix-1][iy-1][iz  ];
-      arr1_pfloat_get field111 = fieldForPcls[ix-1][iy-1][iz-1];
-      // 
-      #if 0 // (takes same time as other order)
-      Bxl += weight000 * field000[0];
-      Bxl += weight001 * field001[0];
-      Bxl += weight010 * field010[0];
-      Bxl += weight011 * field011[0];
-      Bxl += weight100 * field100[0];
-      Bxl += weight101 * field101[0];
-      Bxl += weight110 * field110[0];
-      Bxl += weight111 * field111[0];
-      Byl += weight000 * field000[1];
-      Byl += weight001 * field001[1];
-      Byl += weight010 * field010[1];
-      Byl += weight011 * field011[1];
-      Byl += weight100 * field100[1];
-      Byl += weight101 * field101[1];
-      Byl += weight110 * field110[1];
-      Byl += weight111 * field111[1];
-      Bzl += weight000 * field000[2];
-      Bzl += weight001 * field001[2];
-      Bzl += weight010 * field010[2];
-      Bzl += weight011 * field011[2];
-      Bzl += weight100 * field100[2];
-      Bzl += weight101 * field101[2];
-      Bzl += weight110 * field110[2];
-      Bzl += weight111 * field111[2];
-      Exl += weight000 * field000[3];
-      Exl += weight001 * field001[3];
-      Exl += weight010 * field010[3];
-      Exl += weight011 * field011[3];
-      Exl += weight100 * field100[3];
-      Exl += weight101 * field101[3];
-      Exl += weight110 * field110[3];
-      Exl += weight111 * field111[3];
-      Eyl += weight000 * field000[4];
-      Eyl += weight001 * field001[4];
-      Eyl += weight010 * field010[4];
-      Eyl += weight011 * field011[4];
-      Eyl += weight100 * field100[4];
-      Eyl += weight101 * field101[4];
-      Eyl += weight110 * field110[4];
-      Eyl += weight111 * field111[4];
-      Ezl += weight000 * field000[5];
-      Ezl += weight001 * field001[5];
-      Ezl += weight010 * field010[5];
-      Ezl += weight011 * field011[5];
-      Ezl += weight100 * field100[5];
-      Ezl += weight101 * field101[5];
-      Ezl += weight110 * field110[5];
-      Ezl += weight111 * field111[5];
-      #endif
-
-      Bxl += weight000 * field000[0];
-      Byl += weight000 * field000[1];
-      Bzl += weight000 * field000[2];
-      Exl += weight000 * field000[3];
-      Eyl += weight000 * field000[4];
-      Ezl += weight000 * field000[5];
-
-      Bxl += weight001 * field001[0];
-      Byl += weight001 * field001[1];
-      Bzl += weight001 * field001[2];
-      Exl += weight001 * field001[3];
-      Eyl += weight001 * field001[4];
-      Ezl += weight001 * field001[5];
-
-      Bxl += weight010 * field010[0];
-      Byl += weight010 * field010[1];
-      Bzl += weight010 * field010[2];
-      Exl += weight010 * field010[3];
-      Eyl += weight010 * field010[4];
-      Ezl += weight010 * field010[5];
-
-      Bxl += weight011 * field011[0];
-      Byl += weight011 * field011[1];
-      Bzl += weight011 * field011[2];
-      Exl += weight011 * field011[3];
-      Eyl += weight011 * field011[4];
-      Ezl += weight011 * field011[5];
-
-      Bxl += weight100 * field100[0];
-      Byl += weight100 * field100[1];
-      Bzl += weight100 * field100[2];
-      Exl += weight100 * field100[3];
-      Eyl += weight100 * field100[4];
-      Ezl += weight100 * field100[5];
-
-      Bxl += weight101 * field101[0];
-      Byl += weight101 * field101[1];
-      Bzl += weight101 * field101[2];
-      Exl += weight101 * field101[3];
-      Eyl += weight101 * field101[4];
-      Ezl += weight101 * field101[5];
-
-      Bxl += weight110 * field110[0];
-      Byl += weight110 * field110[1];
-      Bzl += weight110 * field110[2];
-      Exl += weight110 * field110[3];
-      Eyl += weight110 * field110[4];
-      Ezl += weight110 * field110[5];
-
-      Bxl += weight111 * field111[0];
-      Byl += weight111 * field111[1];
-      Bzl += weight111 * field111[2];
-      Exl += weight111 * field111[3];
-      Eyl += weight111 * field111[4];
-      Ezl += weight111 * field111[5];
-
-      #if 0
-      Bxl += weight000 * Bx[ix][iy][iz];
-      Bxl += weight000 * Bx[ix][iy][iz];
-      Bxl += weight001 * Bx[ix][iy][iz - 1];
-      Bxl += weight010 * Bx[ix][iy - 1][iz];
-      Bxl += weight011 * Bx[ix][iy - 1][iz - 1];
-      Bxl += weight100 * Bx[ix - 1][iy][iz];
-      Bxl += weight101 * Bx[ix - 1][iy][iz - 1];
-      Bxl += weight110 * Bx[ix - 1][iy - 1][iz];
-      Bxl += weight111 * Bx[ix - 1][iy - 1][iz - 1];
-      // 
-      Byl += weight000 * By[ix][iy][iz];
-      Byl += weight001 * By[ix][iy][iz - 1];
-      Byl += weight010 * By[ix][iy - 1][iz];
-      Byl += weight011 * By[ix][iy - 1][iz - 1];
-      Byl += weight100 * By[ix - 1][iy][iz];
-      Byl += weight101 * By[ix - 1][iy][iz - 1];
-      Byl += weight110 * By[ix - 1][iy - 1][iz];
-      Byl += weight111 * By[ix - 1][iy - 1][iz - 1];
-      // 
-      Bzl += weight000 * Bz[ix][iy][iz];
-      Bzl += weight001 * Bz[ix][iy][iz - 1];
-      Bzl += weight010 * Bz[ix][iy - 1][iz];
-      Bzl += weight011 * Bz[ix][iy - 1][iz - 1];
-      Bzl += weight100 * Bz[ix - 1][iy][iz];
-      Bzl += weight101 * Bz[ix - 1][iy][iz - 1];
-      Bzl += weight110 * Bz[ix - 1][iy - 1][iz];
-      Bzl += weight111 * Bz[ix - 1][iy - 1][iz - 1];
-      // 
-      Exl += weight000 * Ex[ix][iy][iz];
-      Exl += weight001 * Ex[ix][iy][iz - 1];
-      Exl += weight010 * Ex[ix][iy - 1][iz];
-      Exl += weight011 * Ex[ix][iy - 1][iz - 1];
-      Exl += weight100 * Ex[ix - 1][iy][iz];
-      Exl += weight101 * Ex[ix - 1][iy][iz - 1];
-      Exl += weight110 * Ex[ix - 1][iy - 1][iz];
-      Exl += weight111 * Ex[ix - 1][iy - 1][iz - 1];
-      // 
-      Eyl += weight000 * Ey[ix][iy][iz];
-      Eyl += weight001 * Ey[ix][iy][iz - 1];
-      Eyl += weight010 * Ey[ix][iy - 1][iz];
-      Eyl += weight011 * Ey[ix][iy - 1][iz - 1];
-      Eyl += weight100 * Ey[ix - 1][iy][iz];
-      Eyl += weight101 * Ey[ix - 1][iy][iz - 1];
-      Eyl += weight110 * Ey[ix - 1][iy - 1][iz];
-      Eyl += weight111 * Ey[ix - 1][iy - 1][iz - 1];
-      // 
-      Ezl += weight000 * Ez[ix][iy][iz];
-      Ezl += weight001 * Ez[ix][iy][iz - 1];
-      Ezl += weight010 * Ez[ix][iy - 1][iz];
-      Ezl += weight011 * Ez[ix][iy - 1][iz - 1];
-      Ezl += weight100 * Ez[ix - 1][iy][iz];
-      Ezl += weight101 * Ez[ix - 1][iy][iz - 1];
-      Ezl += weight110 * Ez[ix - 1][iy - 1][iz];
-      Ezl += weight111 * Ez[ix - 1][iy - 1][iz - 1];
-      #endif
+      arr1_pfloat_get field_components[8];
+      field_components[0] = fieldForPcls[ix  ][iy  ][iz  ]; // field000
+      field_components[1] = fieldForPcls[ix  ][iy  ][iz-1]; // field001
+      field_components[2] = fieldForPcls[ix  ][iy-1][iz  ]; // field010
+      field_components[3] = fieldForPcls[ix  ][iy-1][iz-1]; // field011
+      field_components[4] = fieldForPcls[ix-1][iy  ][iz  ]; // field100
+      field_components[5] = fieldForPcls[ix-1][iy  ][iz-1]; // field101
+      field_components[6] = fieldForPcls[ix-1][iy-1][iz  ]; // field110
+      field_components[7] = fieldForPcls[ix-1][iy-1][iz-1]; // field111
+
+      for(int c=0; c<8; c++)
+      {
+        Bxl += weights[c] * field_components[c][0];
+        Byl += weights[c] * field_components[c][1];
+        Bzl += weights[c] * field_components[c][2];
+        Exl += weights[c] * field_components[c][3];
+        Eyl += weights[c] * field_components[c][4];
+        Ezl += weights[c] * field_components[c][5];
+      }
 
       // end interpolation
       const pfloat omdtsq = qomdt2 * qomdt2 * (Bxl * Bxl + Byl * Byl + Bzl * Bzl);
@@ -608,18 +441,18 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       zp = zptilde + wptilde * dto2;
     }                           // end of iteration
     // update the final position and velocity
-    up = 2.0 * uptilde - u[rest];
-    vp = 2.0 * vptilde - v[rest];
-    wp = 2.0 * wptilde - w[rest];
+    up = 2.0 * uptilde - up_orig;
+    vp = 2.0 * vptilde - vp_orig;
+    wp = 2.0 * wptilde - wp_orig;
     xp = xptilde + uptilde * dt;
     yp = yptilde + vptilde * dt;
     zp = zptilde + wptilde * dt;
-    x[rest] = xp;
-    y[rest] = yp;
-    z[rest] = zp;
-    u[rest] = up;
-    v[rest] = vp;
-    w[rest] = wp;
+    x[pidx] = xp;
+    y[pidx] = yp;
+    z[pidx] = zp;
+    u[pidx] = up;
+    v[pidx] = vp;
+    w[pidx] = wp;
   }                             // END OF ALL THE PARTICLES
 }
 

From e1e32ae8bacb6ad68680d1e7a154d7b01b095410 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 17 Oct 2013 04:09:08 +0200
Subject: [PATCH 066/118] mover_PC: renamed vars, used Om:=B*q*dt/(2*m*c) to
 reduce multiplications

---
 particles/Particles3D.cpp | 95 +++++++++++++++++++--------------------
 1 file changed, 45 insertions(+), 50 deletions(-)

diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 55d98962..5f7bb915 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -318,34 +318,31 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
   }
   const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
 
-  const pfloat dto2 = .5 * dt, qomdt2 = qom * dto2 / c;
+  const pfloat dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
   const pfloat inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
   #pragma omp for
   // why does single precision make no difference in execution speed?
   //#pragma simd vectorlength(VECTOR_WIDTH)
   for (int pidx = 0; pidx < nop; pidx++) {
     // copy the particle
-    const pfloat xptilde = x[pidx];
-    const pfloat yptilde = y[pidx];
-    const pfloat zptilde = z[pidx];
-    const pfloat up_orig = u[pidx];
-    const pfloat vp_orig = v[pidx];
-    const pfloat wp_orig = w[pidx];
-    pfloat xp = xptilde;
-    pfloat yp = yptilde;
-    pfloat zp = zptilde;
-    pfloat up = up_orig;
-    pfloat vp = vp_orig;
-    pfloat wp = wp_orig;
-    pfloat uptilde;
-    pfloat vptilde;
-    pfloat wptilde;
+    const pfloat xorig = x[pidx];
+    const pfloat yorig = y[pidx];
+    const pfloat zorig = z[pidx];
+    const pfloat uorig = u[pidx];
+    const pfloat vorig = v[pidx];
+    const pfloat worig = w[pidx];
+    pfloat xavg = xorig;
+    pfloat yavg = yorig;
+    pfloat zavg = zorig;
+    pfloat uavg;
+    pfloat vavg;
+    pfloat wavg;
     // calculate the average velocity iteratively
     for (int innter = 0; innter < NiterMover; innter++) {
       // interpolation G-->P
-      const pfloat ixd = floor((xp - xstart) * inv_dx);
-      const pfloat iyd = floor((yp - ystart) * inv_dy);
-      const pfloat izd = floor((zp - zstart) * inv_dz);
+      const pfloat ixd = floor((xavg - xstart) * inv_dx);
+      const pfloat iyd = floor((yavg - ystart) * inv_dy);
+      const pfloat izd = floor((zavg - zstart) * inv_dz);
       int ix = 2 + int (ixd);
       int iy = 2 + int (iyd);
       int iz = 2 + int (izd);
@@ -362,12 +359,12 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       if (iz > nzn - 1)
         iz = nzn - 1;
 
-      const pfloat xi0   = xp - grid->get_pfloat_XN(ix-1);
-      const pfloat eta0  = yp - grid->get_pfloat_YN(iy-1);
-      const pfloat zeta0 = zp - grid->get_pfloat_ZN(iz-1);
-      const pfloat xi1   = grid->get_pfloat_XN(ix) - xp;
-      const pfloat eta1  = grid->get_pfloat_YN(iy) - yp;
-      const pfloat zeta1 = grid->get_pfloat_ZN(iz) - zp;
+      const pfloat xi0   = xavg - grid->get_pfloat_XN(ix-1);
+      const pfloat eta0  = yavg - grid->get_pfloat_YN(iy-1);
+      const pfloat zeta0 = zavg - grid->get_pfloat_ZN(iz-1);
+      const pfloat xi1   = grid->get_pfloat_XN(ix) - xavg;
+      const pfloat eta1  = grid->get_pfloat_YN(iy) - yavg;
+      const pfloat zeta1 = grid->get_pfloat_ZN(iz) - zavg;
 
       pfloat Exl = 0.0;
       pfloat Eyl = 0.0;
@@ -422,37 +419,35 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
         Eyl += weights[c] * field_components[c][4];
         Ezl += weights[c] * field_components[c][5];
       }
+      const double Omx = qdto2mc*Bxl;
+      const double Omy = qdto2mc*Byl;
+      const double Omz = qdto2mc*Bzl;
 
       // end interpolation
-      const pfloat omdtsq = qomdt2 * qomdt2 * (Bxl * Bxl + Byl * Byl + Bzl * Bzl);
-      const pfloat denom = 1.0 / (1.0 + omdtsq);
+      const pfloat omsq = (Omx * Omx + Omy * Omy + Omz * Omz);
+      const pfloat denom = 1.0 / (1.0 + omsq);
       // solve the position equation
-      const pfloat ut = up + qomdt2 * Exl;
-      const pfloat vt = vp + qomdt2 * Eyl;
-      const pfloat wt = wp + qomdt2 * Ezl;
-      const pfloat udotb = ut * Bxl + vt * Byl + wt * Bzl;
+      const pfloat ut = uorig + qdto2mc * Exl;
+      const pfloat vt = vorig + qdto2mc * Eyl;
+      const pfloat wt = worig + qdto2mc * Ezl;
+      //const pfloat udotb = ut * Bxl + vt * Byl + wt * Bzl;
+      const pfloat udotOm = ut * Omx + vt * Omy + wt * Omz;
       // solve the velocity equation 
-      uptilde = (ut + qomdt2 * (vt * Bzl - wt * Byl + qomdt2 * udotb * Bxl)) * denom;
-      vptilde = (vt + qomdt2 * (wt * Bxl - ut * Bzl + qomdt2 * udotb * Byl)) * denom;
-      wptilde = (wt + qomdt2 * (ut * Byl - vt * Bxl + qomdt2 * udotb * Bzl)) * denom;
-      // update position
-      xp = xptilde + uptilde * dto2;
-      yp = yptilde + vptilde * dto2;
-      zp = zptilde + wptilde * dto2;
+      uavg = (ut + (vt * Omz - wt * Omy + udotOm * Omx)) * denom;
+      vavg = (vt + (wt * Omx - ut * Omz + udotOm * Omy)) * denom;
+      wavg = (wt + (ut * Omy - vt * Omx + udotOm * Omz)) * denom;
+      // update average position
+      xavg = xorig + uavg * dto2;
+      yavg = yorig + vavg * dto2;
+      zavg = zorig + wavg * dto2;
     }                           // end of iteration
     // update the final position and velocity
-    up = 2.0 * uptilde - up_orig;
-    vp = 2.0 * vptilde - vp_orig;
-    wp = 2.0 * wptilde - wp_orig;
-    xp = xptilde + uptilde * dt;
-    yp = yptilde + vptilde * dt;
-    zp = zptilde + wptilde * dt;
-    x[pidx] = xp;
-    y[pidx] = yp;
-    z[pidx] = zp;
-    u[pidx] = up;
-    v[pidx] = vp;
-    w[pidx] = wp;
+    x[pidx] = xorig + uavg * dt;
+    y[pidx] = yorig + vavg * dt;
+    z[pidx] = zorig + wavg * dt;
+    u[pidx] = 2.0 * uavg - uorig;
+    v[pidx] = 2.0 * vavg - vorig;
+    w[pidx] = 2.0 * wavg - worig;
   }                             // END OF ALL THE PARTICLES
 }
 

From ca89d599706cee9ac5a255284054bb08f6eaf916 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 17 Oct 2013 09:42:12 +0200
Subject: [PATCH 067/118] enforcing that field and particle topology coincide
 (assumed in subsequent execution)

---
 communication/VCtopology3D.cpp | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/communication/VCtopology3D.cpp b/communication/VCtopology3D.cpp
index b88d0878..7b6db581 100644
--- a/communication/VCtopology3D.cpp
+++ b/communication/VCtopology3D.cpp
@@ -3,6 +3,8 @@
 #include "Collective.h"
 #include "VCtopology3D.h"
 #include <iostream>
+#include "MPIdata.h"
+#include "debug.h"
 
 using std::cout;
 using std::endl;
@@ -62,6 +64,11 @@ void VCtopology3D::setup_vctopology(MPI_Comm old_comm) {
   MPI_Cart_create(old_comm, 3, divisions, periods, reorder, &CART_COMM);
   // create a matrix with ranks, and neighbours for Particles
   MPI_Cart_create(old_comm, 3, divisions, periods_P, reorder, &CART_COMM_P);
+  // Why not the following line instead of the previous?  Was
+  // this written in anticipation that a different number of MPI
+  // processes would be used for fields versus for particles?
+  // But the code has not been consistently written this way...
+  //MPI_Cart_create(CART_COMM, 3, divisions, periods_P, 0, &CART_COMM_P);
   // field Communicator
   if (CART_COMM != MPI_COMM_NULL) {
     MPI_Comm_rank(CART_COMM, &cartesian_rank);
@@ -72,21 +79,32 @@ void VCtopology3D::setup_vctopology(MPI_Comm old_comm) {
     MPI_Cart_shift(CART_COMM, ZDIR, RIGHT, &zleft_neighbor, &zright_neighbor);
   }
   else {
-    // EXCEPTION
-    cout << "A process is trown away from the new topology for fields. VCtopology3D.h" << endl;
+    // previous check that nprocs = XLEN*YLEN*ZLEN should prevent reaching this line.
+    eprintf("A process is thrown away from the new topology for fields.");
   }
   // Particles Communicator
   if (CART_COMM_P != MPI_COMM_NULL) {
-    MPI_Comm_rank(CART_COMM_P, &cartesian_rank);
-    MPI_Cart_coords(CART_COMM_P, cartesian_rank, 3, coordinates);
+    int pcl_coordinates[3];
+    int pcl_cartesian_rank;
+    MPI_Comm_rank(CART_COMM_P, &pcl_cartesian_rank);
+    MPI_Cart_coords(CART_COMM_P, pcl_cartesian_rank, 3, pcl_coordinates);
+    
+    // This seems to be assumed elsewhere in the code.
+    assert_eq(cartesian_rank, MPIdata::get_rank());
+    // should agree
+    assert_eq(cartesian_rank,pcl_cartesian_rank);
+    for(int dim=0;dim<3;dim++)
+    {
+      assert_eq(coordinates[dim],pcl_coordinates[dim]);
+    }
 
     MPI_Cart_shift(CART_COMM_P, XDIR, RIGHT, &xleft_neighbor_P, &xright_neighbor_P);
     MPI_Cart_shift(CART_COMM_P, YDIR, RIGHT, &yleft_neighbor_P, &yright_neighbor_P);
     MPI_Cart_shift(CART_COMM_P, ZDIR, RIGHT, &zleft_neighbor_P, &zright_neighbor_P);
   }
   else {
-    // EXCEPTION
-    cout << "A process is trown away from the new topology for Particles. VCtopology3D.h" << endl;
+    // previous check that nprocs = XLEN*YLEN*ZLEN should prevent reaching this line.
+    eprintf("A process is thrown away from the new topology for Particles.");
   }
 
 }

From 78e9e16555ebe0cfe8ef7f322c28d557f929f986 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 17 Oct 2013 09:44:01 +0200
Subject: [PATCH 068/118] print error once only when nxc/XLEN is non-integer

---
 grids/Grid3DCU.cpp | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index fa6bdd49..e4b7d166 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -1,11 +1,11 @@
 
 #include <mpi.h>
 #include "Grid3DCU.h"
+#include "MPIdata.h"
 
 /*! constructor */
 Grid3DCU::Grid3DCU(CollectiveIO * col, VirtualTopology3D * vct) {
-  // int get_rank();
-  // if(!get_rank())
+  if(!MPIdata::get_rank())
   {
     fflush(stdout);
     bool xerror = false;
@@ -37,17 +37,18 @@ Grid3DCU::Grid3DCU(CollectiveIO * col, VirtualTopology3D * vct) {
   invdz = 1.0 / dz;
 
   // local grid dimensions and boundaries of active nodes
-  xStart = vct->getCoordinates(0) * (col->getLx() / (double) vct->getXLEN());
-
-  xEnd = xStart + (col->getLx() / (double) vct->getXLEN());
-
-  yStart = vct->getCoordinates(1) * (col->getLy() / (double) vct->getYLEN());
-
-  yEnd = yStart + (col->getLy() / (double) vct->getYLEN());
-
-  zStart = vct->getCoordinates(2) * (col->getLz() / (double) vct->getZLEN());
-
-  zEnd = zStart + (col->getLz() / (double) vct->getZLEN());
+  //
+  const double xWidth = (col->getLx() / (double) vct->getXLEN());
+  const double yWidth = (col->getLy() / (double) vct->getYLEN());
+  const double zWidth = (col->getLz() / (double) vct->getZLEN());
+  //
+  xStart = vct->getCoordinates(0) * xWidth;
+  yStart = vct->getCoordinates(1) * yWidth;
+  zStart = vct->getCoordinates(2) * zWidth;
+  //
+  xEnd = xStart + xWidth;
+  yEnd = yStart + yWidth;
+  zEnd = zStart + zWidth;
 
   // arrays allocation: nodes ---> the first node has index 1, the last has index nxn-2!
   pfloat_node_xcoord = new pfloat[nxn];

From b106afac26df54c48ff267cecfaf8f93764f0f65 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 17 Oct 2013 09:47:25 +0200
Subject: [PATCH 069/118] eliminate Particles abstract base class (cf. iss #18,
 #41)

---
 include/Particles.h           | 113 +++++++++++++++++-----------------
 include/Particles3Dcomm.h     |  63 +++++++++----------
 particles/Particles3Dcomm.cpp |  69 +--------------------
 3 files changed, 86 insertions(+), 159 deletions(-)

diff --git a/include/Particles.h b/include/Particles.h
index cca553c0..2abe6c4a 100644
--- a/include/Particles.h
+++ b/include/Particles.h
@@ -21,60 +21,61 @@ developers: Stefano Markidis, Giovanni Lapenta
  *
  */
 
-class Particles {
-public:
-  /** allocate particles */
-  virtual void allocate(int species, CollectiveIO * col, VirtualTopology3D * vct, Grid * grid) = 0;
-  /** interpolation Particle -> grid */
-  virtual void interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vct) = 0;
-
-
-  /** get X-position array for all the particles */
-  virtual double *getXall() const = 0;
-  /** get Y-position array for all the particles */
-  virtual double *getYall() const = 0;
-  /** get Z-position array for all the particles */
-  virtual double *getZall() const = 0;
-  /** get u (X-velocity) array for all the particles */
-  virtual double *getUall() const = 0;
-  /** get v (Y-velocity) array for all the particles */
-  virtual double *getVall() const = 0;
-  /** get w (Z-velocity) array for all the particles */
-  virtual double *getWall() const = 0;
-  /** get ID array for all the particles */
-  virtual long long *getParticleIDall() const = 0;
-  /**get charge of particle array */
-  virtual double *getQall() const = 0;
-  /** get X-position of particle with label indexPart */
-  virtual double getX(int indexPart) const = 0;
-  /** get Y-position of particle with label indexPart */
-  virtual double getY(int indexPart) const = 0;
-  /** get Z-position of particle with label indexPart */
-  virtual double getZ(int indexPart) const = 0;
-  /** get u (X-velocity) of particle with label indexPart */
-  virtual double getU(int indexPart) const = 0;
-  /** get v (Y-velocity) of particle with label indexPart */
-  virtual double getV(int indexPart) const = 0;
-  /** get w (Z-velocity) of particle with label indexPart */
-  virtual double getW(int indexPart) const = 0;
-  /** get ID of particle with label indexPart */
-  virtual long long getParticleID(int indexPart) const = 0;
-  /**get charge of particle with label indexPart */
-  virtual double getQ(int indexPart) const = 0;
-  /** get the number of particles of this subdomain */
-  virtual int getNOP() const = 0;
-  /** return the Kinetic energy */
-  virtual double getKe() = 0;
-  /** return the maximum kinetic energy */
-  virtual double getMaxVelocity() = 0;
-  /** return energy distribution*/
-  virtual long long *getVelocityDistribution(int nBins, double maxVel) = 0;
-  /** retturn the momentum */
-  virtual double getP() = 0;
-  /** Print particles info: positions, velocities */
-  virtual void Print(VirtualTopology3D * ptVCT) const = 0;
-  /** Print the number of particles of this subdomain */
-  virtual void PrintNp(VirtualTopology3D * ptVCT) const = 0;
-
-};
+#include "Particles3Dcomm.h"
+//class Particles {
+//public:
+//  /** allocate particles */
+//  virtual void allocate(int species, CollectiveIO * col, VirtualTopology3D * vct, Grid * grid) = 0;
+//  /** interpolation Particle -> grid */
+//  virtual void interpP2G(Field * EMf, Grid * grid, VirtualTopology3D * vct) = 0;
+//
+//
+//  /** get X-position array for all the particles */
+//  virtual double *getXall() const = 0;
+//  /** get Y-position array for all the particles */
+//  virtual double *getYall() const = 0;
+//  /** get Z-position array for all the particles */
+//  virtual double *getZall() const = 0;
+//  /** get u (X-velocity) array for all the particles */
+//  virtual double *getUall() const = 0;
+//  /** get v (Y-velocity) array for all the particles */
+//  virtual double *getVall() const = 0;
+//  /** get w (Z-velocity) array for all the particles */
+//  virtual double *getWall() const = 0;
+//  /** get ID array for all the particles */
+//  virtual long long *getParticleIDall() const = 0;
+//  /**get charge of particle array */
+//  virtual double *getQall() const = 0;
+//  /** get X-position of particle with label indexPart */
+//  virtual double getX(int indexPart) const = 0;
+//  /** get Y-position of particle with label indexPart */
+//  virtual double getY(int indexPart) const = 0;
+//  /** get Z-position of particle with label indexPart */
+//  virtual double getZ(int indexPart) const = 0;
+//  /** get u (X-velocity) of particle with label indexPart */
+//  virtual double getU(int indexPart) const = 0;
+//  /** get v (Y-velocity) of particle with label indexPart */
+//  virtual double getV(int indexPart) const = 0;
+//  /** get w (Z-velocity) of particle with label indexPart */
+//  virtual double getW(int indexPart) const = 0;
+//  /** get ID of particle with label indexPart */
+//  virtual long long getParticleID(int indexPart) const = 0;
+//  /**get charge of particle with label indexPart */
+//  virtual double getQ(int indexPart) const = 0;
+//  /** get the number of particles of this subdomain */
+//  virtual int getNOP() const = 0;
+//  /** return the Kinetic energy */
+//  virtual double getKe() = 0;
+//  /** return the maximum kinetic energy */
+//  virtual double getMaxVelocity() = 0;
+//  /** return energy distribution*/
+//  virtual long long *getVelocityDistribution(int nBins, double maxVel) = 0;
+//  /** retturn the momentum */
+//  virtual double getP() = 0;
+//  /** Print particles info: positions, velocities */
+//  virtual void Print(VirtualTopology3D * ptVCT) const = 0;
+//  /** Print the number of particles of this subdomain */
+//  virtual void PrintNp(VirtualTopology3D * ptVCT) const = 0;
+//
+//};
 #endif
diff --git a/include/Particles3Dcomm.h b/include/Particles3Dcomm.h
index 1e646681..8ebae324 100644
--- a/include/Particles3Dcomm.h
+++ b/include/Particles3Dcomm.h
@@ -16,7 +16,8 @@ developers: Stefano Markidis, Giovanni Lapenta
  * @version 2.0
  *
  */
-class Particles3Dcomm:public Particles {
+class Particles3Dcomm // :public Particles
+{
 public:
   /** constructor */
   Particles3Dcomm();
@@ -57,40 +58,31 @@ class Particles3Dcomm:public Particles {
   int maxNpExiting();
   /** calculate the weights given the position of particles */
   // void calculateWeights(double*** weight, double xp, double yp, double zp,int ix, int iy, int iz, Grid* grid);
-  /** get X-position array for all the particles */
-  double *getXall() const;
-  /** get Y-position array for all the particles */
-  double *getYall() const;
-  /** get Z-position array for all the particles */
-  double *getZall() const;
-  /** get u (X-velocity) array for all the particles */
-  double *getUall() const;
-  /** get v (Y-velocity) array for all the particles */
-  double *getVall() const;
-  /** get w (Z-velocity) array for all the particles */
-  double *getWall() const;
-  /** get the ID array   */
-  long long *getParticleIDall() const;
-  /** get X-position of particle with label indexPart */
-  double getX(int indexPart) const;
-  /** get Y-position of particle with label indexPart */
-  double getY(int indexPart) const;
-  /** get Z-position of particle with label indexPart */
-  double getZ(int indexPart) const;
-  /** get u (X-velocity) of particle with label indexPart */
-  double getU(int indexPart) const;
-  /** get v (Y-velocity) of particle with label indexPart */
-  double getV(int indexPart) const;
-  /** get w (Z-velocity) of particle with label indexPart */
-  double getW(int indexPart) const;
-  /** get ID of particle with label indexPart */
-  long long getParticleID(int indexPart) const;
-  /**get charge of particle with label indexPart */
-  double getQ(int indexPart) const;
-  /** get charge of array for ID particles */
-  double *getQall() const;
-  /** get the number of particles of this subdomain */
-  int getNOP() const;
+
+  // inline get accessors
+  //
+  double *getXall()  const { return (x); }
+  double *getYall()  const { return (y); }
+  double *getZall()  const { return (z); }
+  double *getUall()  const { return (u); }
+  double *getVall()  const { return (v); }
+  double *getWall()  const { return (w); }
+  long long *getParticleIDall()  const { return (ParticleID); }
+  double *getQall()  const { return (q); }
+  // accessors for particle with index indexPart
+  double getX(int indexPart)  const { return (x[indexPart]); }
+  double getY(int indexPart)  const { return (y[indexPart]); }
+  double getZ(int indexPart)  const { return (z[indexPart]); }
+  double getU(int indexPart)  const { return (u[indexPart]); }
+  double getV(int indexPart)  const { return (v[indexPart]); }
+  double getW(int indexPart)  const { return (w[indexPart]); }
+  long long getParticleID(int indexPart)  const
+    { return (ParticleID[indexPart]); }
+  double getQ(int indexPart)  const { return (q[indexPart]); }
+  int getNOP()  const { return (nop); }
+
+  // computed get access
+  //
   /** return the Kinetic energy */
   double getKe();
   /** return the maximum kinetic energy */
@@ -262,5 +254,6 @@ class Particles3Dcomm:public Particles {
   double Ninj;
 };
 
+typedef Particles3Dcomm Particles;
 
 #endif
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 13a443c0..acdccdab 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -809,74 +809,7 @@ int Particles3Dcomm::maxNpExiting() {
     maxNp = npExitZleft;
   return (maxNp);
 }
-/** return X-coordinate of particle array */
-double *Particles3Dcomm::getXall()  const {
-  return (x);
-}
-/** return Y-coordinate  of particle array */
-double *Particles3Dcomm::getYall()  const {
-  return (y);
-}
-/** return Z-coordinate  of particle array*/
-double *Particles3Dcomm::getZall()  const {
-  return (z);
-}
-/** get X-velocity of particle with label indexPart */
-double *Particles3Dcomm::getUall()  const {
-  return (u);
-}
-/** get Y-velocity of particle with label indexPart */
-double *Particles3Dcomm::getVall()  const {
-  return (v);
-}
-/**get Z-velocity of particle with label indexPart */
-double *Particles3Dcomm::getWall()  const {
-  return (w);
-}
-/**get ID of particle with label indexPart */
-long long *Particles3Dcomm::getParticleIDall()  const {
-  return (ParticleID);
-}
-/**get charge of particle with label indexPart */
-double *Particles3Dcomm::getQall()  const {
-  return (q);
-}
-/** return X-coordinate of particle with index indexPart */
-double Particles3Dcomm::getX(int indexPart)  const {
-  return (x[indexPart]);
-}
-/** return Y-coordinate  of particle with index indexPart */
-double Particles3Dcomm::getY(int indexPart)  const {
-  return (y[indexPart]);
-}
-/** return Y-coordinate  of particle with index indexPart */
-double Particles3Dcomm::getZ(int indexPart)  const {
-  return (z[indexPart]);
-}
-/** get u (X-velocity) of particle with label indexPart */
-double Particles3Dcomm::getU(int indexPart)  const {
-  return (u[indexPart]);
-}
-/** get v (Y-velocity) of particle with label indexPart */
-double Particles3Dcomm::getV(int indexPart)  const {
-  return (v[indexPart]);
-}
-/**get w (Z-velocity) of particle with label indexPart */
-double Particles3Dcomm::getW(int indexPart)  const {
-  return (w[indexPart]);
-}
-/**get ID of particle with label indexPart */
-long long Particles3Dcomm::getParticleID(int indexPart)  const {
-  return (ParticleID[indexPart]);
-}
-/**get charge of particle with label indexPart */
-double Particles3Dcomm::getQ(int indexPart)  const {
-  return (q[indexPart]);
-}
-/** return the number of particles */
-int Particles3Dcomm::getNOP()  const {
-  return (nop);
-}
+
 /** return the Kinetic energy */
 double Particles3Dcomm::getKe() {
   double localKe = 0.0;

From 64541ffccb4677dac465620b73cc1797ca88472e Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 10 Jan 2014 21:22:21 +0100
Subject: [PATCH 070/118] exclude unused files from ctags file

---
 scripts/ipic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ipic.py b/scripts/ipic.py
index 90948c4b..1c9d90ad 100755
--- a/scripts/ipic.py
+++ b/scripts/ipic.py
@@ -16,7 +16,7 @@
 def ipic_ctags(args):
     # create tags file using ctags
     create_tags_command = \
-        '''find . -name '*.cpp' -or -name '*.h' | xargs ctags --extra=+qf'''
+        '''find . -name '*.cpp' -or -name '*.h' | grep -v unused | xargs ctags --extra=+qf'''
     print create_tags_command
     os.system(create_tags_command)
     # sort tags file

From 4b97b5fbf95fd7a680d2a62448d76866e260e30c Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 10 Jan 2014 21:32:53 +0100
Subject: [PATCH 071/118] vectorized mover and summing moments at cost of
 sorting serially

---
 CMakeLists.txt                |   2 +-
 fields/EMfields3D.cpp         | 177 ++++++++++++++++++
 include/EMfields3D.h          |   4 +-
 include/Grid3DCU.h            |   3 +
 include/Particles3D.h         |   2 +
 include/Particles3Dcomm.h     | 112 +++++++++++-
 include/TimeTasks.h           |   3 +
 include/arraysfwd.h           |  14 +-
 main/Parameters.cpp           |  16 ++
 main/iPic3Dlib.cpp            |  30 +++-
 particles/Particles3D.cpp     | 235 ++++++++++++++++++++++--
 particles/Particles3Dcomm.cpp | 326 +++++++++++++++++++++++++++++++++-
 utility/TimeTasks.cpp         |   3 +
 13 files changed, 893 insertions(+), 34 deletions(-)
 create mode 100644 main/Parameters.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c6b9473e..f70f8451 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,7 +95,7 @@ file(
         processtopology/*.cpp
         solvers/*.cpp
         utility/*.cpp
-        main/iPic3Dlib.cpp
+        main/*.cpp
 )
 
 #
diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 972361e2..00240a67 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -540,6 +540,183 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
   }
 }
 
+void EMfields3D::sumMoments_vectorized(
+  const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct)
+{
+  const double inv_dx = grid->get_invdx();
+  const double inv_dy = grid->get_invdy();
+  const double inv_dz = grid->get_invdz();
+  const int nxn = grid->getNXN();
+  const int nyn = grid->getNYN();
+  const int nzn = grid->getNZN();
+  const double xstart = grid->getXstart();
+  const double ystart = grid->getYstart();
+  const double zstart = grid->getZstart();
+  #pragma omp parallel
+  for (int species_idx = 0; species_idx < ns; species_idx++)
+  {
+    const Particles3Dcomm& pcls = part[species_idx];
+    const int is = pcls.get_ns();
+    assert_eq(species_idx,is);
+
+    double const*const x = pcls.getXall();
+    double const*const y = pcls.getYall();
+    double const*const z = pcls.getZall();
+    double const*const u = pcls.getUall();
+    double const*const v = pcls.getVall();
+    double const*const w = pcls.getWall();
+    double const*const q = pcls.getQall();
+
+    const int nop = pcls.getNOP();
+    #pragma omp master
+    timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION);
+    Moments10& speciesMoments10 = fetch_moments10Array(0);
+    speciesMoments10.set_to_zero();
+    arr4_double moments = speciesMoments10.fetch_arr();
+    #pragma omp for collapse(2) // schedule(static)
+    for(int cx=0;cx<nxc;cx++)
+    for(int cy=0;cy<nyc;cy++)
+    for(int cz=0;cz<nzc;cz++)
+    {
+      const int numpcls_in_cell = pcls.get_numpcls_in_bucket(cx,cy,cz);
+      const int bucket_offset = pcls.get_bucket_offset(cx,cy,cz);
+      const int bucket_end = bucket_offset+numpcls_in_cell;
+      #pragma simd
+      for(int i=bucket_offset; i<bucket_end; i++)
+      {
+        // compute the quadratic moments of velocity
+        //
+        const double ui=u[i];
+        const double vi=v[i];
+        const double wi=w[i];
+        const double uui=ui*ui;
+        const double uvi=ui*vi;
+        const double uwi=ui*wi;
+        const double vvi=vi*vi;
+        const double vwi=vi*wi;
+        const double wwi=wi*wi;
+        double velmoments[10];
+        velmoments[0] = 1.;
+        velmoments[1] = ui;
+        velmoments[2] = vi;
+        velmoments[3] = wi;
+        velmoments[4] = uui;
+        velmoments[5] = uvi;
+        velmoments[6] = uwi;
+        velmoments[7] = vvi;
+        velmoments[8] = vwi;
+        velmoments[9] = wwi;
+
+        // compute the weights to distribute the moments
+        //
+        double weights[8];
+        const double abs_xpos = x[i];
+        const double abs_ypos = y[i];
+        const double abs_zpos = z[i];
+        const double rel_xpos = abs_xpos - xstart;
+        const double rel_ypos = abs_ypos - ystart;
+        const double rel_zpos = abs_zpos - zstart;
+        const double cxm1_pos = rel_xpos * inv_dx;
+        const double cym1_pos = rel_ypos * inv_dy;
+        const double czm1_pos = rel_zpos * inv_dz;
+        if(false)
+        {
+          const int cx_inf = int(floor(cxm1_pos));
+          const int cy_inf = int(floor(cym1_pos));
+          const int cz_inf = int(floor(czm1_pos));
+          assert_eq(cx-1,cx_inf);
+          assert_eq(cy-1,cy_inf);
+          assert_eq(cz-1,cz_inf);
+        }
+        // index of interface to right of cell
+        const int ix = cx + 1;
+        const int iy = cy + 1;
+        const int iz = cz + 1;
+        // fraction of the distance from the right of the cell
+        const double w1x = cx - cxm1_pos;
+        const double w1y = cy - cym1_pos;
+        const double w1z = cz - czm1_pos;
+        // fraction of distance from the left
+        const double w0x = 1-w1x;
+        const double w0y = 1-w1y;
+        const double w0z = 1-w1z;
+        // we are calculating a charge moment.
+        const double qi=q[i];
+        const double weight0 = qi*w0x;
+        const double weight1 = qi*w1x;
+        const double weight00 = weight0*w0y;
+        const double weight01 = weight0*w1y;
+        const double weight10 = weight1*w0y;
+        const double weight11 = weight1*w1y;
+        weights[0] = weight00*w0z; // weight000
+        weights[1] = weight00*w1z; // weight001
+        weights[2] = weight01*w0z; // weight010
+        weights[3] = weight01*w1z; // weight011
+        weights[4] = weight10*w0z; // weight100
+        weights[5] = weight10*w1z; // weight101
+        weights[6] = weight11*w0z; // weight110
+        weights[7] = weight11*w1z; // weight111
+
+        // add particle to moments
+        {
+          arr1_double_fetch momentsArray[8];
+          arr2_double_fetch moments00 = moments[ix][iy];
+          arr2_double_fetch moments01 = moments[ix][cy];
+          arr2_double_fetch moments10 = moments[cx][iy];
+          arr2_double_fetch moments11 = moments[cx][cy];
+          momentsArray[0] = moments00[iz]; // moments000 
+          momentsArray[1] = moments00[cz]; // moments001 
+          momentsArray[2] = moments01[iz]; // moments010 
+          momentsArray[3] = moments01[cz]; // moments011 
+          momentsArray[4] = moments10[iz]; // moments100 
+          momentsArray[5] = moments10[cz]; // moments101 
+          momentsArray[6] = moments11[iz]; // moments110 
+          momentsArray[7] = moments11[cz]; // moments111 
+
+          for(int m=0; m<10; m++)
+          for(int c=0; c<8; c++)
+          {
+            momentsArray[c][m] += velmoments[m]*weights[c];
+          }
+        }
+      }
+    }
+    #pragma omp master
+    timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION);
+
+    // reduction
+    #pragma omp master
+    timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION);
+    {
+      #pragma omp for collapse(2)
+      for(int i=0;i<nxn;i++)
+      for(int j=0;j<nyn;j++)
+      for(int k=0;k<nzn;k++)
+      {
+        rhons[is][i][j][k] = invVOL*moments[i][j][k][0];
+        Jxs  [is][i][j][k] = invVOL*moments[i][j][k][1];
+        Jys  [is][i][j][k] = invVOL*moments[i][j][k][2];
+        Jzs  [is][i][j][k] = invVOL*moments[i][j][k][3];
+        pXXsn[is][i][j][k] = invVOL*moments[i][j][k][4];
+        pXYsn[is][i][j][k] = invVOL*moments[i][j][k][5];
+        pXZsn[is][i][j][k] = invVOL*moments[i][j][k][6];
+        pYYsn[is][i][j][k] = invVOL*moments[i][j][k][7];
+        pYZsn[is][i][j][k] = invVOL*moments[i][j][k][8];
+        pZZsn[is][i][j][k] = invVOL*moments[i][j][k][9];
+      }
+    }
+    #pragma omp master
+    timeTasks_end_task(TimeTasks::MOMENT_REDUCTION);
+    // uncomment this and remove the loop below
+    // when we change to use asynchronous communication.
+    // communicateGhostP2G(is, 0, 0, 0, 0, vct);
+  }
+  for (int i = 0; i < ns; i++)
+  {
+    communicateGhostP2G(i, 0, 0, 0, 0, vct);
+  }
+}
+
 /*! Calculate Electric field with the implicit solver: the Maxwell solver method is called here */
 void EMfields3D::calculateE(Grid * grid, VirtualTopology3D * vct, Collective *col) {
   if (vct->getCartesian_rank() == 0)
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 2aaef7c4..ab0cc1df 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -119,8 +119,10 @@ class EMfields3D                // :public Field
     void set_fieldForPcls();
     /*! communicate ghost for grid -> Particles interpolation */
     void communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, VirtualTopology3D * vct);
-    void sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
+    /*! sum moments (interp_P2G) versions */
     void sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
+    void sumMoments_vectorized(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
+    void sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
     /*! add accumulated moments to the moments for a given species */
     //void addToSpeciesMoments(const TenMoments & in, int is);
     /*! add an amount of charge density to charge density field at node X,Y,Z */
diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index b7eb7a6b..03b56c1d 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -163,6 +163,9 @@ class Grid3DCU                  // :public Grid
   double getDX() { return (dx); }
   double getDY() { return (dy); }
   double getDZ() { return (dz); }
+  double get_invdx() { return (invdx); }
+  double get_invdy() { return (invdy); }
+  double get_invdz() { return (invdz); }
   //
   // coordinate accessors
   //
diff --git a/include/Particles3D.h b/include/Particles3D.h
index 74cfbf37..ccd210b4 100644
--- a/include/Particles3D.h
+++ b/include/Particles3D.h
@@ -58,6 +58,8 @@ class Particles3D:public Particles3Dcomm {
     void mover_explicit(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** mover with a Predictor-Corrector Scheme */
     void mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf);
+    /** vectorized version of mover_PC **/
+    void mover_PC_vectorized(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** communicate particle after moving them */
     int communicate_particles(VirtualTopology3D * vct);
     /** relativistic mover with a Predictor-Corrector scheme */
diff --git a/include/Particles3Dcomm.h b/include/Particles3Dcomm.h
index 8ebae324..5d40cca8 100644
--- a/include/Particles3Dcomm.h
+++ b/include/Particles3Dcomm.h
@@ -59,6 +59,53 @@ class Particles3Dcomm // :public Particles
   /** calculate the weights given the position of particles */
   // void calculateWeights(double*** weight, double xp, double yp, double zp,int ix, int iy, int iz, Grid* grid);
 
+  /*! sort particles for vectorized push (needs to be parallelized) */
+  void sort_particles_serial(Grid * grid, VirtualTopology3D * vct);
+  /*! sort particles with respect to provided position data */
+  void sort_particles_serial(
+    pfloat *xpos, pfloat *ypos, pfloat *zpos,
+    Grid * grid, VirtualTopology3D * vct);
+  void get_safe_cell_for_pos(
+    int& cx, int& cy, int& cz, 
+    pfloat xpos, pfloat ypos, pfloat zpos)
+  {
+    // xstart is left edge of domain excluding ghost cells
+    // cx=0 for ghost cell layer.
+    cx = 1 + int(floor((xpos - xstart) * inv_dx));
+    cy = 1 + int(floor((ypos - ystart) * inv_dy));
+    cz = 1 + int(floor((zpos - zstart) * inv_dz));
+    //
+    // if the cell is outside the domain, then treat it as
+    // in the nearest ghost cell.
+    //
+    if (cx < 0) cx = 0;
+    if (cy < 0) cy = 0;
+    if (cz < 0) cz = 0;
+    // number of cells in x direction including ghosts is nxc
+    if (cx >= nxc) cx = nxc-1;
+    if (cy >= nyc) cy = nyc-1;
+    if (cz >= nzc) cz = nzc-1;
+  }
+
+  /*! cell index of a position assumed to lie in this subdomain (asserted, not clamped) */
+  void get_cell_for_pos_in_domain(
+    int& cx, int& cy, int& cz, 
+    pfloat xpos, pfloat ypos, pfloat zpos)
+  {
+    // xstart is left edge of domain excluding ghost cells
+    // cx=0 for ghost cell layer.
+    cx = 1 + int(floor((xpos - xstart) * inv_dx));
+    cy = 1 + int(floor((ypos - ystart) * inv_dy));
+    cz = 1 + int(floor((zpos - zstart) * inv_dz));
+    // valid cell indices are 0..nxc-1 (ghosts included), so the upper bound is strict
+    assert_le(0,cx);
+    assert_le(0,cy);
+    assert_le(0,cz);
+    assert_lt(cx,nxc);
+    assert_lt(cy,nyc);
+    assert_lt(cz,nzc);
+  }
+
   // inline get accessors
   //
   double *getXall()  const { return (x); }
@@ -99,6 +146,10 @@ class Particles3Dcomm // :public Particles
 public:
   // accessors
   int get_ns()const{return ns;}
+  int get_numpcls_in_bucket(int cx, int cy, int cz)const
+  { return (*numpcls_in_bucket)[cx][cy][cz]; }
+  int get_bucket_offset(int cx, int cy, int cz)const
+  { return (*bucket_offset)[cx][cy][cz]; }
 
 protected:
   /** number of this species */
@@ -133,7 +184,10 @@ class Particles3Dcomm // :public Particles
   double v0;
   /** w0 Drift velocity - Direction Z */
   double w0;
-  /** Positions arra - X component */
+
+  // particles data
+  //
+  /** Positions array - X component */
   double *x;
   /** Positions array - Y component */
   double *y;
@@ -145,16 +199,58 @@ class Particles3Dcomm // :public Particles
   double *v;
   /** Velocities array - Z component */
   double *w;
+  /** Charge array */
+  double *q;
   /** TrackParticleID */
   bool TrackParticleID;
   /** ParticleID */
   long long *ParticleID;
+  /** Average position data (used during particle push) **/
+  double *xavg;
+  double *yavg;
+  double *zavg;
+
+  // structures for sorting particles
+  //
+  // alternate storage for sorting particles
+  //
+  double *xtmp;
+  double *ytmp;
+  double *ztmp;
+  double *utmp;
+  double *vtmp;
+  double *wtmp;
+  double *qtmp;
+  long long *ParticleIDtmp;
+  double *xavgtmp;
+  double *yavgtmp;
+  double *zavgtmp;
+  //int *xcell;
+  //int *ycell;
+  //int *zcell;
+
+  // references for buckets
+  //
+  array3_int* numpcls_in_bucket;
+  array3_int* numpcls_in_bucket_now; // accumulator used during sorting
+  //array3_int* bucket_size; // maximum number of particles in bucket
+  array3_int* bucket_offset;
+  // 
+  // bucket totals per thread
+  //
+  //int num_threads;
+  //array3_int* numpcls_in_bucket_thr;
+  //arr3_int fetch_numpcls_in_bucket_thr(int i)
+  //{
+  //  assert_le(0,i);
+  //  assert_lt(i,num_threads);
+  //  return *(numpcls_in_bucket_thr[i]);
+  //};
+
   /** rank of processor in which particle is created (for ID) */
   int BirthRank[2];
   /** number of variables to be stored in buffer for communication for each particle  */
   int nVar;
-  /** Charge array */
-  double *q;
   /** Simulation domain lengths */
   double xstart, xend, ystart, yend, zstart, zend, invVOL;
   /** time step */
@@ -167,9 +263,10 @@ class Particles3Dcomm // :public Particles
   double Lz;
   /** grid spacings */
   double dx, dy, dz;
-  /** number of grid 
-          nodes */
+  /** number of grid nodes */
   int nxn, nyn, nzn;
+  /** number of grid cells */
+  int nxc, nyc, nzc;
   /** buffers for communication */
   /** size of sending buffers for exiting particles, DEFINED IN METHOD "COMMUNICATE" */
   int buffer_size;
@@ -252,6 +349,11 @@ class Particles3Dcomm // :public Particles
   double Q_removed;
   /** density of the injection of the particles */
   double Ninj;
+
+  // convenience values from grid
+  double inv_dx;
+  double inv_dy;
+  double inv_dz;
 };
 
 typedef Particles3Dcomm Particles;
diff --git a/include/TimeTasks.h b/include/TimeTasks.h
index fba54430..3ac67b23 100644
--- a/include/TimeTasks.h
+++ b/include/TimeTasks.h
@@ -27,8 +27,11 @@ class TimeTasks
     PARTICLES,
     LAST, // no more exclusive tasks
     BFIELD,
+    MOMENT_PCL_SORTING,
     MOMENT_ACCUMULATION,
     MOMENT_REDUCTION,
+    MOVER_PCL_SORTING,
+    MOVER_PCL_MOVING,
     NUMBER_OF_TASKS // this line should be last
   };
 
diff --git a/include/arraysfwd.h b/include/arraysfwd.h
index 889c950d..706c057d 100644
--- a/include/arraysfwd.h
+++ b/include/arraysfwd.h
@@ -33,12 +33,10 @@ namespace iPic3D
 // - so that they can be redefined according to the user's
 //   preferred array implementation.
 //
-//typedef array_ref1<int> intArr1;
-//typedef array_ref2<int> intArr2;
-//typedef array_ref3<int> intArr3;
-//typedef array_ref4<int> intArr4;
-//typedef const_array_ref1<double> arr1_double;
-//typedef const_array_ref2<double> arr2_double;
+typedef iPic3D::array_ref1<int> arr1_int;
+typedef iPic3D::array_ref2<int> arr2_int;
+typedef iPic3D::array_ref3<int> arr3_int;
+typedef iPic3D::array_ref4<int> arr4_int;
 //
 typedef iPic3D::const_array_ref3<double> const_arr3_double;
 typedef iPic3D::const_array_ref4<double> const_arr4_double;
@@ -47,6 +45,10 @@ typedef iPic3D::array_ref1<double> arr1_double;
 typedef iPic3D::array_ref2<double> arr2_double;
 typedef iPic3D::array_ref3<double> arr3_double;
 typedef iPic3D::array_ref4<double> arr4_double;
+typedef iPic3D::array1<int> array1_int;
+typedef iPic3D::array2<int> array2_int;
+typedef iPic3D::array3<int> array3_int;
+typedef iPic3D::array4<int> array4_int;
 typedef iPic3D::array1<double> array1_double;
 typedef iPic3D::array2<double> array2_double;
 typedef iPic3D::array3<double> array3_double;
diff --git a/main/Parameters.cpp b/main/Parameters.cpp
new file mode 100644
index 00000000..740586de
--- /dev/null
+++ b/main/Parameters.cpp
@@ -0,0 +1,16 @@
+#include "Parameters.h"
+
+using namespace Parameters;
+
+static bool SORTING_PARTICLES;
+
+void Parameters::init_parameters()
+{
+  SORTING_PARTICLES = get_VECTORIZE_MOMENTS() || get_VECTORIZE_MOVER();
+}
+
+bool Parameters::get_SORTING_PARTICLES() { return SORTING_PARTICLES; }
+bool Parameters::get_VECTORIZE_MOMENTS() { return false; }
+bool Parameters::get_VECTORIZE_MOVER() { return false; }
+// this will also return true if we communicate particles per iteration
+bool Parameters::get_USING_XAVG() { return get_VECTORIZE_MOVER(); }
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index f476f01f..891a16af 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -3,6 +3,7 @@
 #include "TimeTasks.h"
 #include "ipicdefs.h"
 #include "debug.h"
+#include "Parameters.h"
 
 using namespace iPic3D;
 MPIdata* iPic3D::c_Solver::mpi=0;
@@ -16,6 +17,7 @@ int c_Solver::Init(int argc, char **argv) {
   // initialized MPI environment
   // nprocs = number of processors
   // myrank = rank of tha process*/
+  Parameters::init_parameters();
   mpi = &MPIdata::instance();
   nprocs = MPIdata::get_nprocs();
   myrank = MPIdata::get_rank();
@@ -178,7 +180,27 @@ void c_Solver::CalculateMoments() {
   EMf->updateInfoFields(grid,vct,col);
   EMf->setZeroDensities();                  // set to zero the densities
 
-  EMf->sumMoments(part, grid, vct);
+  if(Parameters::get_SORTING_PARTICLES())
+  {
+    // sort particles
+    #pragma omp master
+    timeTasks_begin_task(TimeTasks::MOMENT_PCL_SORTING);
+    for(int species_idx=0; species_idx<ns; species_idx++)
+      part[species_idx].sort_particles_serial(grid,vct);
+    #pragma omp master
+    timeTasks_end_task(TimeTasks::MOMENT_PCL_SORTING);
+  }
+
+  if(Parameters::get_VECTORIZE_MOMENTS())
+  {
+    // since particles are sorted,
+    // we can vectorize interpolation of particles to grid
+    EMf->sumMoments_vectorized(part, grid, vct);
+  }
+  else
+  {
+    EMf->sumMoments(part, grid, vct);
+  }
   //for (int i = 0; i < ns; i++)
   //{
   //  EMf->sumMomentsOld(part[i], grid, vct);
@@ -228,7 +250,11 @@ bool c_Solver::ParticlesMover() {
       // #pragma omp task inout(part[i]) in(grid) target_device(booster)
       //
       // should merely pass EMf->get_fieldForPcls() rather than EMf.
-      part[i].mover_PC(grid, vct, EMf); // use the Predictor Corrector scheme 
+      // use the Predictor Corrector scheme to move particles
+      if(Parameters::get_VECTORIZE_MOVER())
+        part[i].mover_PC_vectorized(grid, vct, EMf);
+      else
+        part[i].mover_PC(grid, vct, EMf);
     }
     for (int i = 0; i < ns; i++)  // move each species
     {
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 5f7bb915..1daed3fa 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -26,7 +26,7 @@ developers: Stefano Markidis, Giovanni Lapenta
 
 #include "Particles3D.h"
 
-
+#include "debug.h"
 #include "hdf5.h"
 #include <complex>
 
@@ -318,9 +318,10 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
   }
   const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
 
+  #pragma omp master
+  timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING);
   const pfloat dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
-  const pfloat inv_dx = 1.0 / dx, inv_dy = 1.0 / dy, inv_dz = 1.0 / dz;
-  #pragma omp for
+  #pragma omp for schedule(static)
   // why does single precision make no difference in execution speed?
   //#pragma simd vectorlength(VECTOR_WIDTH)
   for (int pidx = 0; pidx < nop; pidx++) {
@@ -343,21 +344,23 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       const pfloat ixd = floor((xavg - xstart) * inv_dx);
       const pfloat iyd = floor((yavg - ystart) * inv_dy);
       const pfloat izd = floor((zavg - zstart) * inv_dz);
-      int ix = 2 + int (ixd);
-      int iy = 2 + int (iyd);
-      int iz = 2 + int (izd);
-      if (ix < 1)
-        ix = 1;
-      if (iy < 1)
-        iy = 1;
-      if (iz < 1)
-        iz = 1;
-      if (ix > nxn - 1)
-        ix = nxn - 1;
-      if (iy > nyn - 1)
-        iy = nyn - 1;
-      if (iz > nzn - 1)
-        iz = nzn - 1;
+      // index of interface to right of cell
+      int ix = 2 + int(ixd);
+      int iy = 2 + int(iyd);
+      int iz = 2 + int(izd);
+
+      // use field data of closest cell in domain
+      //
+      if (ix < 1) ix = 1;
+      if (iy < 1) iy = 1;
+      if (iz < 1) iz = 1;
+      if (ix > nxc) ix = nxc;
+      if (iy > nyc) iy = nyc;
+      if (iz > nzc) iz = nzc;
+      // index of cell of particle;
+      //const int cx = ix - 1;
+      //const int cy = iy - 1;
+      //const int cz = iz - 1;
 
       const pfloat xi0   = xavg - grid->get_pfloat_XN(ix-1);
       const pfloat eta0  = yavg - grid->get_pfloat_YN(iy-1);
@@ -449,6 +452,202 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
     v[pidx] = 2.0 * vavg - vorig;
     w[pidx] = 2.0 * wavg - worig;
   }                             // END OF ALL THE PARTICLES
+  #pragma omp master
+  timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING);
+}
+
+/** mover with a Predictor-Corrector scheme */
+void Particles3D::mover_PC_vectorized(
+  Grid * grid, VirtualTopology3D * vct, Field * EMf)
+{
+  assert_eq(nxc,nxn-1);
+  assert_eq(nyc,nyn-1);
+  assert_eq(nzc,nzn-1);
+  #pragma omp master
+  if (vct->getCartesian_rank() == 0) {
+    cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
+  }
+  const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
+
+  // initialize average positions
+  #pragma omp for schedule(static)
+  for(int pidx = 0; pidx < nop; pidx++)
+  {
+    xavg[pidx] = x[pidx];
+    yavg[pidx] = y[pidx];
+    zavg[pidx] = z[pidx];
+  }
+
+  const pfloat dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
+  for(int niter=1; niter<=NiterMover; niter++)
+  {
+    // sort particles based on the time-averaged position
+    if(niter>1) // on first iteration already was sorted to sum moments
+    {
+      #pragma omp master
+      timeTasks_begin_task(TimeTasks::MOVER_PCL_SORTING);
+      sort_particles_serial(xavg, yavg, zavg, grid,vct);
+      #pragma omp master
+      timeTasks_end_task(TimeTasks::MOVER_PCL_SORTING);
+    }
+
+    #pragma omp master
+    timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING);
+    // move particles in parallel
+    //
+    // iterate over mesh cells
+    //const int ncells=nxc*nyc*nzc;
+    //int *numpcls_in_bucket_1d = &numpcls_in_bucket[0][0][0];
+    //int *bucket_offset_1d = &bucket_offset[0][0][0];
+    int serial_pidx = 0;
+    #pragma omp for collapse(2) // schedule(static)
+    for(int cx=0;cx<nxc;cx++)
+    for(int cy=0;cy<nyc;cy++)
+    for(int cz=0;cz<nzc;cz++)
+    //for(int cell=0; cell<ncells; cell++)
+    {
+      // interface to the right of cell
+      const int ix = cx+1;
+      const int iy = cy+1;
+      const int iz = cz+1;
+
+      arr1_pfloat_get field_components[8];
+      field_components[0] = fieldForPcls[ix][iy][iz]; // field000
+      field_components[1] = fieldForPcls[ix][iy][cz]; // field001
+      field_components[2] = fieldForPcls[ix][cy][iz]; // field010
+      field_components[3] = fieldForPcls[ix][cy][cz]; // field011
+      field_components[4] = fieldForPcls[cx][iy][iz]; // field100
+      field_components[5] = fieldForPcls[cx][iy][cz]; // field101
+      field_components[6] = fieldForPcls[cx][cy][iz]; // field110
+      field_components[7] = fieldForPcls[cx][cy][cz]; // field111
+
+      // push all particles in mesh cell
+      //
+      //const int numpcls_in_cell = numpcls_in_bucket_1d[cell];
+      const int numpcls_in_cell = get_numpcls_in_bucket(cx,cy,cz);
+      const int bucket_offset = get_bucket_offset(cx,cy,cz);
+      const int bucket_end = bucket_offset+numpcls_in_cell;
+      // this should vectorize, but could be faster if particle
+      // data were aligned.
+      #pragma simd
+      //for(int pidx=bucket_offset_1d[cell]; pidx<numpcls_in_cell; pidx++)
+      for(int pidx=bucket_offset; pidx<bucket_end; pidx++)
+      {
+        // serial case: check that pidx is correct
+        if(true)
+        {
+          assert_eq(pidx,serial_pidx++);
+        }
+        // confirm that particle is in correct cell
+        if(true)
+        {
+          int cx_,cy_,cz_;
+          get_safe_cell_for_pos(cx_,cy_,cz_,xavg[pidx],yavg[pidx],zavg[pidx]);
+          assert_eq(cx_,cx);
+          assert_eq(cy_,cy);
+          assert_eq(cz_,cz);
+        }
+
+        // copy the particle
+        const pfloat xorig = x[pidx];
+        const pfloat yorig = y[pidx];
+        const pfloat zorig = z[pidx];
+        const pfloat uorig = u[pidx];
+        const pfloat vorig = v[pidx];
+        const pfloat worig = w[pidx];
+
+        // compute weights for field components
+        //
+        double weights[8];
+        const double abs_xpos = xavg[pidx];
+        const double abs_ypos = yavg[pidx];
+        const double abs_zpos = zavg[pidx];
+        // xstart marks start of domain excluding ghosts
+        const double rel_xpos = abs_xpos - xstart;
+        const double rel_ypos = abs_ypos - ystart;
+        const double rel_zpos = abs_zpos - zstart;
+        // cell position minus 1 (due to ghost cells)
+        const double cxm1_pos = rel_xpos * inv_dx;
+        const double cym1_pos = rel_ypos * inv_dy;
+        const double czm1_pos = rel_zpos * inv_dz;
+        // index of interface to right of cell
+        const int ix = cx + 1;
+        const int iy = cy + 1;
+        const int iz = cz + 1;
+        // fraction of the distance from the right of the cell
+        const double w1x = cx - cxm1_pos;
+        const double w1y = cy - cym1_pos;
+        const double w1z = cz - czm1_pos;
+        // fraction of distance from the left
+        const double w0x = 1-w1x;
+        const double w0y = 1-w1y;
+        const double w0z = 1-w1z;
+        const double weight00 = w0x*w0y;
+        const double weight01 = w0x*w1y;
+        const double weight10 = w1x*w0y;
+        const double weight11 = w1x*w1y;
+        weights[0] = weight00*w0z; // weight000
+        weights[1] = weight00*w1z; // weight001
+        weights[2] = weight01*w0z; // weight010
+        weights[3] = weight01*w1z; // weight011
+        weights[4] = weight10*w0z; // weight100
+        weights[5] = weight10*w1z; // weight101
+        weights[6] = weight11*w0z; // weight110
+        weights[7] = weight11*w1z; // weight111
+
+        pfloat Exl = 0.0;
+        pfloat Eyl = 0.0;
+        pfloat Ezl = 0.0;
+        pfloat Bxl = 0.0;
+        pfloat Byl = 0.0;
+        pfloat Bzl = 0.0;
+        for(int c=0; c<8; c++)
+        {
+          Bxl += weights[c] * field_components[c][0];
+          Byl += weights[c] * field_components[c][1];
+          Bzl += weights[c] * field_components[c][2];
+          Exl += weights[c] * field_components[c][3];
+          Eyl += weights[c] * field_components[c][4];
+          Ezl += weights[c] * field_components[c][5];
+        }
+        const double Omx = qdto2mc*Bxl;
+        const double Omy = qdto2mc*Byl;
+        const double Omz = qdto2mc*Bzl;
+
+        // end interpolation
+        const pfloat omsq = (Omx * Omx + Omy * Omy + Omz * Omz);
+        const pfloat denom = 1.0 / (1.0 + omsq);
+        // solve the position equation
+        const pfloat ut = uorig + qdto2mc * Exl;
+        const pfloat vt = vorig + qdto2mc * Eyl;
+        const pfloat wt = worig + qdto2mc * Ezl;
+        //const pfloat udotb = ut * Bxl + vt * Byl + wt * Bzl;
+        const pfloat udotOm = ut * Omx + vt * Omy + wt * Omz;
+        // solve the velocity equation 
+        const pfloat uavg = (ut + (vt * Omz - wt * Omy + udotOm * Omx)) * denom;
+        const pfloat vavg = (vt + (wt * Omx - ut * Omz + udotOm * Omy)) * denom;
+        const pfloat wavg = (wt + (ut * Omy - vt * Omx + udotOm * Omz)) * denom;
+        // update average position
+        xavg[pidx] = xorig + uavg * dto2;
+        yavg[pidx] = yorig + vavg * dto2;
+        zavg[pidx] = zorig + wavg * dto2;
+
+        // if it is the last iteration, update the position and velocity
+        // (hopefully this will not compromise vectorization...)
+        if(niter==NiterMover)
+        {
+          x[pidx] = xorig + uavg * dt;
+          y[pidx] = yorig + vavg * dt;
+          z[pidx] = zorig + wavg * dt;
+          u[pidx] = 2.0 * uavg - uorig;
+          v[pidx] = 2.0 * vavg - vorig;
+          w[pidx] = 2.0 * wavg - worig;
+        }
+      }
+    }
+    #pragma omp master
+    timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING);
+  }
 }
 
 /** communicate particle after moving them */
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index acdccdab..f64e7eb1 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -9,6 +9,7 @@ developers: Stefano Markidis, Giovanni Lapenta.
 #include <math.h>
 #include <limits.h>
 #include "asserts.h"
+#include <algorithm> // for swap
 #include "VirtualTopology3D.h"
 #include "VCtopology3D.h"
 #include "CollectiveIO.h"
@@ -24,6 +25,7 @@ developers: Stefano Markidis, Giovanni Lapenta.
 #include "ompdefs.h"
 
 #include "Particles3Dcomm.h"
+#include "Parameters.h"
 
 #include "hdf5.h"
 #include <vector>
@@ -47,7 +49,7 @@ using std::endl;
  */
 
 /** constructor */
-Particles3Dcomm::Particles3Dcomm() {
+Particles3Dcomm::Particles3Dcomm(){
   // see allocate(int species, CollectiveIO* col, VirtualTopology3D* vct, Grid* grid)
 
 }
@@ -60,6 +62,22 @@ Particles3Dcomm::~Particles3Dcomm() {
   delete[]v;
   delete[]w;
   delete[]q;
+  delete[]ParticleID;
+  delete[]xavg;
+  delete[]yavg;
+  delete[]zavg;
+  // deallocate alternate storage
+  delete[]xtmp;
+  delete[]ytmp;
+  delete[]ztmp;
+  delete[]utmp;
+  delete[]vtmp;
+  delete[]wtmp;
+  delete[]qtmp;
+  delete[]ParticleIDtmp;
+  delete[]xavgtmp;
+  delete[]yavgtmp;
+  delete[]zavgtmp;
   // deallocate buffers
   delete[]b_X_RIGHT;
   delete[]b_X_LEFT;
@@ -67,6 +85,9 @@ Particles3Dcomm::~Particles3Dcomm() {
   delete[]b_Y_LEFT;
   delete[]b_Z_RIGHT;
   delete[]b_Z_LEFT;
+  delete numpcls_in_bucket;
+  delete numpcls_in_bucket_now;
+  delete bucket_offset;
 }
 /** constructors fo a single species*/
 void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3D * vct, Grid * grid) {
@@ -112,10 +133,19 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   dx = grid->getDX();
   dy = grid->getDY();
   dz = grid->getDZ();
+  inv_dx = 1/dx;
+  inv_dy = 1/dy;
+  inv_dz = 1/dz;
 
   nxn = grid->getNXN();
   nyn = grid->getNYN();
   nzn = grid->getNZN();
+  nxc = grid->getNXC();
+  nyc = grid->getNYC();
+  nzc = grid->getNZC();
+  assert_eq(nxc,nxn-1);
+  assert_eq(nyc,nyn-1);
+  assert_eq(nzc,nzn-1);
   invVOL = grid->getInvVOL();
   // info from VirtualTopology3D
   cVERBOSE = vct->getcVERBOSE();
@@ -131,6 +161,20 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   bcPfaceYleft = col->getBcPfaceYleft();
   bcPfaceZright = col->getBcPfaceZright();
   bcPfaceZleft = col->getBcPfaceZleft();
+  //
+  // allocate arrays for sorting particles
+  //
+  numpcls_in_bucket = new array3_int(nxc,nyc,nzc);
+  numpcls_in_bucket_now = new array3_int(nxc,nyc,nzc);
+  bucket_offset = new array3_int(nxc,nyc,nzc);
+  //num_threads = omp_get_max_threads();
+  //numpcls_in_bucket_thr = (arr3_int*)malloc(sizeof(void*)*num_threads);
+  //for(int i=0; i<num_threads; i++)
+  //{
+  //  numpcls_in_bucket_thr[i] = new array3_int(nxc,nyc,nzc);
+  //}
+  
+  //
   // //////////////////////////////////////////////////////////////
   // ////////////// ALLOCATE ARRAYS /////////////////////////
   // //////////////////////////////////////////////////////////////
@@ -144,9 +188,51 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   w = new double[npmax];
   // charge
   q = new double[npmax];
+  // average positions, used in iterative particle advance
+  xavg = 0;
+  yavg = 0;
+  zavg = 0;
+  if(Parameters::get_USING_XAVG())
+  {
+    xavg = new double[npmax];
+    yavg = new double[npmax];
+    zavg = new double[npmax];
+  }
+  //
+  xtmp = 0;
+  ytmp = 0;
+  ztmp = 0;
+  utmp = 0;
+  vtmp = 0;
+  wtmp = 0;
+  qtmp = 0;
+  xavgtmp = 0;
+  yavgtmp = 0;
+  zavgtmp = 0;
+  if(Parameters::get_SORTING_PARTICLES())
+  {
+    xtmp = new double[npmax];
+    ytmp = new double[npmax];
+    ztmp = new double[npmax];
+    // velocities
+    utmp = new double[npmax];
+    vtmp = new double[npmax];
+    wtmp = new double[npmax];
+    // charge
+    qtmp = new double[npmax];
+    // average positions, used in iterative particle advance
+    xavgtmp = new double[npmax];
+    yavgtmp = new double[npmax];
+    zavgtmp = new double[npmax];
+  }
+
+  ParticleID = 0;
+  ParticleIDtmp = 0;
   // ID
   if (TrackParticleID) {
     ParticleID = new long long[npmax];
+    if(Parameters::get_SORTING_PARTICLES())
+      ParticleIDtmp = new long long[npmax];
     BirthRank[0] = vct->getCartesian_rank();
     if (vct->getNprocs() > 1)
       BirthRank[1] = (int) ceil(log10((double) (vct->getNprocs())));  // Number of digits needed for # of process in ID
@@ -887,3 +973,241 @@ void Particles3Dcomm::PrintNp(VirtualTopology3D * ptVCT)  const {
   cout << "Subgrid (" << ptVCT->getCoordinates(0) << "," << ptVCT->getCoordinates(1) << "," << ptVCT->getCoordinates(2) << ")" << endl;
   cout << endl;
 }
+
+/***** particle sorting routines *****/
+
+void Particles3Dcomm::sort_particles_serial(Grid * grid, VirtualTopology3D * vct)
+{
+  sort_particles_serial(x,y,z, grid,vct);
+}
+
+// need to sort and communicate particles after each iteration
+void Particles3Dcomm::sort_particles_serial(
+  double *xpos, double *ypos, double *zpos,
+  Grid * grid, VirtualTopology3D * vct)
+{
+  #pragma omp critical (sort_particles_serial)
+  {
+    numpcls_in_bucket->setall(0);
+    // iterate through particles and count where they will go
+    for (int pidx = 0; pidx < nop; pidx++)
+    {
+      // get the cell indices of the particle
+      //
+      int cx,cy,cz;
+      get_safe_cell_for_pos(cx,cy,cz,xpos[pidx],ypos[pidx],zpos[pidx]);
+      //
+      // is it better just to recompute this?
+      //
+      //xcell[pidx]=cx;
+      //ycell[pidx]=cy;
+      //zcell[pidx]=cz;
+
+      // increment the number of particles in bucket of this particle
+      (*numpcls_in_bucket)[cx][cy][cz]++;
+    }
+
+    // compute prefix sum to determine initial position
+    // of each bucket (could parallelize this)
+    //
+    int accpcls=0;
+    for(int cx=0;cx<nxc;cx++)
+    for(int cy=0;cy<nyc;cy++)
+    for(int cz=0;cz<nzc;cz++)
+    {
+      (*bucket_offset)[cx][cy][cz] = accpcls;
+      accpcls += (*numpcls_in_bucket)[cx][cy][cz];
+    }
+
+    numpcls_in_bucket_now->setall(0);
+    // put the particles where they are supposed to go
+    for (int pidx = 0; pidx < nop; pidx++)
+    {
+      // get the cell indices of the particle
+      //
+      int cx,cy,cz;
+      get_safe_cell_for_pos(cx,cy,cz,xpos[pidx],ypos[pidx],zpos[pidx]);
+      //
+      //cx = xcell[pidx];
+      //cy = ycell[pidx];
+      //cz = zcell[pidx];
+
+      // compute where the data should go
+      const int numpcls_now = (*numpcls_in_bucket_now)[cx][cy][cz]++;
+      const int outpidx = (*bucket_offset)[cx][cy][cz] + numpcls_now;
+
+      // copy particle data to new location
+      //
+      xtmp[outpidx] = x[pidx];
+      ytmp[outpidx] = y[pidx];
+      ztmp[outpidx] = z[pidx];
+      utmp[outpidx] = u[pidx];
+      vtmp[outpidx] = v[pidx];
+      wtmp[outpidx] = w[pidx];
+      qtmp[outpidx] = q[pidx];
+      if (TrackParticleID)
+        ParticleIDtmp[outpidx] = ParticleID[pidx];
+      xavgtmp[outpidx] = xavg[pidx];
+      yavgtmp[outpidx] = yavg[pidx];
+      zavgtmp[outpidx] = zavg[pidx];
+    }
+    // swap the tmp particle memory with the official particle memory
+    {
+      swap(xtmp,x);
+      swap(ytmp,y);
+      swap(ztmp,z);
+      swap(utmp,u);
+      swap(vtmp,v);
+      swap(wtmp,w);
+      swap(qtmp,q);
+      swap(ParticleIDtmp,ParticleID);
+      swap(xavgtmp,xavg);
+      swap(yavgtmp,yavg);
+      swap(zavgtmp,zavg);
+    }
+
+    // check if the particles were sorted incorrectly
+    if(true)
+    {
+      for(int cx=0;cx<nxc;cx++)
+      for(int cy=0;cy<nyc;cy++)
+      for(int cz=0;cz<nzc;cz++)
+      {
+        assert_eq((*numpcls_in_bucket_now)[cx][cy][cz], (*numpcls_in_bucket)[cx][cy][cz]);
+      }
+    }
+  }
+}
+
+//void Particles3Dcomm::sort_particles_parallel(
+//  double *xpos, double *ypos, double *zpos,
+//  Grid * grid, VirtualTopology3D * vct)
+//{
+//  // should change this to first communicate particles so that
+//  // they are in the correct process and all particles
+//  // lie in this subdomain.
+//
+//  // count the number of particles to go in each bucket
+//  numpcls_in_bucket.setall(0);
+//  #pragma omp parallel
+//  {
+//    const int thread_num = omp_get_thread_num();
+//    arr3_int numpcls_in_bucket_thr = fetch_numpcls_in_bucket_thr(thread_num);
+//    numpcls_in_bucket_thr.setall(0);
+//    // iterate through particles and count where they will go
+//    #pragma omp for // nowait
+//    for (int pidx = 0; pidx < nop; pidx++)
+//    {
+//      // get the cell indices of the particle
+//      // (should change this to use xavg[pidx])
+//      const pfloat xpos = xpos[pidx];
+//      const pfloat ypos = ypos[pidx];
+//      const pfloat zpos = zpos[pidx];
+//      int cx,cy,cz;
+//      get_safe_cell_for_pos(cx,cy,cz,xpos,ypos,zpos);
+//
+//      // need to allocate these
+//      //
+//      //xidx[pidx]=cx;
+//      //yidx[pidx]=cy;
+//      //zidx[pidx]=cz;
+//
+//      // increment the number of particles in bucket of this particle
+//      numpcls_in_bucket_thr[cx][cy][cz]++;
+//    }
+//    // reduce the thread buckets into the main bucket
+//    // #pragma omp critical (numpcls_in_bucket_reduction)
+//    {
+//      #pragma omp for collapse(2)
+//      for(int cx=0;cx<nxc;cx++)
+//      for(int cy=0;cy<nyc;cy++)
+//      for(int th=0;th<num_threads;th++)
+//      for(int cz=0;cz<nzc;cz++)
+//      {
+//        numpcls_in_bucket[cx][cy][cz]
+//          += get_numpcls_in_bucket_thr(th)[cx][cy][cz];
+//      }
+//    }
+//
+//    // compute prefix sum to determine initial position
+//    // of each bucket (could parallelize this)
+//    //
+//    int accpcls=0;
+//    #pragma omp critical (bucket_offset_reduction)
+//    for(int cx=0;cx<nxc;cx++)
+//    for(int cy=0;cy<nyc;cy++)
+//    for(int cz=0;cz<nzc;cz++)
+//    {
+//      bucket_offset[cx][cy][cz] = accpcls;
+//      accpcls += numpcls_in_bucket[cx][cy][cz];
+//    }
+//
+//    // cycle through the mesh cells mod 3
+//    // (or mod(2*N+1), where N is number of mesh cells
+//    // that a slow particle can move).
+//    // This ensures that slow particles can be moved
+//    // to their destinations without write conflicts
+//    // among threads.  But what about cache contention?
+//    //
+//    for(int cxmod3=0; cxmod3<3; cxmod3++)
+//    #pragma omp for collapse(2)
+//    for(int cx=cxmod3; cx<nxc; cx+=3)
+//    for(int cy=0; cy<nyc; cy++)
+//    for(int cz=0; cz<nzc; cz++)
+//    {
+//      // put the slow particles where they are supposed to go and
+//      // set aside the fast particles for separate processing.
+//      // (to vectorize would need to sort separately in each
+//      // dimension of space).
+//      //
+//      // problem: particles might have to be moved not because
+//      // they are fast but because of an overall shift in the
+//      // number of particles in a location, e.g. because of
+//      // particles flowing in from a jet. Need a different
+//      // approach, where memory is allocated for each cell.
+//      _numpcls_in_bucket = numpcls_in_bucket[cx][cy][cz];
+//      for(int pidx=bucket_offset[cx][cy][cz]; pidx<_numpcls_in_bucket; pidx++)
+//      {
+//        const int outcx = xidx[pidx];
+//        const int outcy = yidx[pidx];
+//        const int outcz = zidx[pidx];
+//        const int cxlower = outcx <= 0 ? 0 : outcx-1;
+//        const int cxupper = outcx >= (nxc-1) ? nxc-1 : outcx+1;
+//        const int lowerindex = bucket_offset[cxlower][cylower][czlower];
+//        const int upperoffset = bucket_offset[cxupper][cyupper][czupper];
+//        const int upperindex = upperoffset + numpcls_in_bucket[outcx][outcy][outcz];
+//        ...
+//      }
+//    }
+//    // (1) put fast particles that must be moved more than one
+//    // mesh cell at the end of the cell's list, and
+//    // (2) put slow particles in the correct location
+//
+//    // count the number of particles that need to be moved
+//    // more than one mesh cell and allocate a special buffer for them.
+//    // (could change to count number of particles that need
+//    // to move more than N mesh cells).
+//    //
+//    int numpcls_long_move_thr = 0;
+//    #pragma omp for // nowait
+//    for (int i = 0; i < nop; i++)
+//    {
+//      const int cx = xidx[pidx];
+//      const int cy = yidx[pidx];
+//      const int cz = zidx[pidx];
+//
+//      const int cxlower = cx <= 0 ? 0 : cx-1;
+//      const int cxupper = cx >= (nxc-1) ? nxc-1 : cx+1;
+//      const int lowerindex = bucket_offset[cxlower][cylower][czlower];
+//      const int upperoffset = bucket_offset[cxupper][cyupper][czupper];
+//      const int upperindex = upperoffset + numpcls_in_bucket[cx][cy][cz];
+//      if(i < lowerindex || i > upperindex)
+//      {
+//        numpcls_long_move_thr++;
+//      }
+//    }
+//  }
+//}
+//#endif
+
+
diff --git a/utility/TimeTasks.cpp b/utility/TimeTasks.cpp
index dd114ffc..13001f88 100644
--- a/utility/TimeTasks.cpp
+++ b/utility/TimeTasks.cpp
@@ -18,8 +18,11 @@ static const char *taskNames[] = // order must agree with Tasks in TimeTasks.h
   "particles",
   "last",
   "bfield",
+  "moment_pcl_sorting",
   "moment_accumulation",
   "moment_reduction",
+  "mover_pcl_sorting",
+  "mover_pcl_moving",
   "number_of_tasks"
 };
 

From a513bb1b47e7e35ddbf31c0555d7de1168b397d4 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 10 Jan 2014 22:14:21 +0100
Subject: [PATCH 072/118] committing new file forgotten in previous commit

---
 include/Parameters.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 include/Parameters.h

diff --git a/include/Parameters.h b/include/Parameters.h
new file mode 100644
index 00000000..ba4e980d
--- /dev/null
+++ b/include/Parameters.h
@@ -0,0 +1,15 @@
+#ifndef _Parameters_h_
+#define _Parameters_h_
+
+// namespace provides a more flexible, succinct singleton via "using namespace Parameters"
+//
+namespace Parameters
+{
+  void init_parameters();
+
+  bool get_SORTING_PARTICLES();
+  bool get_VECTORIZE_MOMENTS();
+  bool get_VECTORIZE_MOVER();
+  bool get_USING_XAVG();
+}
+#endif

From b3cc7254e71b7f9363af3b03a3604503a4eb0715 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 13 Jan 2014 10:36:37 +0100
Subject: [PATCH 073/118] corrected compile error on mic introduced two commits
 earlier

---
 fields/EMfields3D.cpp     | 8 ++++----
 particles/Particles3D.cpp | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 00240a67..a059fc77 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -569,7 +569,7 @@ void EMfields3D::sumMoments_vectorized(
 
     const int nop = pcls.getNOP();
     #pragma omp master
-    timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION);
+    { timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION); }
     Moments10& speciesMoments10 = fetch_moments10Array(0);
     speciesMoments10.set_to_zero();
     arr4_double moments = speciesMoments10.fetch_arr();
@@ -682,11 +682,11 @@ void EMfields3D::sumMoments_vectorized(
       }
     }
     #pragma omp master
-    timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION);
+    { timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION); }
 
     // reduction
     #pragma omp master
-    timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION);
+    { timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION); }
     {
       #pragma omp for collapse(2)
       for(int i=0;i<nxn;i++)
@@ -706,7 +706,7 @@ void EMfields3D::sumMoments_vectorized(
       }
     }
     #pragma omp master
-    timeTasks_end_task(TimeTasks::MOMENT_REDUCTION);
+    { timeTasks_end_task(TimeTasks::MOMENT_REDUCTION); }
     // uncomment this and remove the loop below
     // when we change to use asynchronous communication.
     // communicateGhostP2G(is, 0, 0, 0, 0, vct);
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 1daed3fa..9a88d51c 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -492,7 +492,7 @@ void Particles3D::mover_PC_vectorized(
     }
 
     #pragma omp master
-    timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING);
+    { timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING); }
     // move particles in parallel
     //
     // iterate over mesh cells
@@ -646,7 +646,7 @@ void Particles3D::mover_PC_vectorized(
       }
     }
     #pragma omp master
-    timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING);
+    { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
   }
 }
 

From 504b3be37d717fb999d1fe949303ada2311fcfb3 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 13 Jan 2014 10:15:09 +0100
Subject: [PATCH 074/118] improved thread-awareness of debug and some asserts

---
 include/asserts.h           |  29 +++---
 include/debug.h             |  12 ++-
 include/errors.h            |  21 +++--
 utility/asserts.cpp         | 162 ++++++++++++++++++++++++++++---
 utility/asserts.o           | Bin 0 -> 7624 bytes
 utility/debug.cpp           | 110 +++++++++++++++++++---
 utility/errors.cpp          |  98 +++++++++++++++----
 utility/new/diagnostics.cpp | 183 ++++++++++++++++++++++++++++++++++++
 8 files changed, 546 insertions(+), 69 deletions(-)
 create mode 100644 utility/asserts.o
 create mode 100644 utility/new/diagnostics.cpp

diff --git a/include/asserts.h b/include/asserts.h
index e46b9c77..8a8a8718 100644
--- a/include/asserts.h
+++ b/include/asserts.h
@@ -38,16 +38,14 @@
 
 #else // ifndef NDEBUG
 
-// override system assert.h
-// #define assert_fileLine(e, file, line) \
-// ((void)printf ("%s:%u: failed assertion `%s'\n", file, line, e), abort())
-// void eprintf_fileLine(const char *func, const char *file, int line_number,
-// const char *format, ...);
+void eprintf_fileLine(FILE * fptr, const char *type, const char *func,
+  const char *file, int line_number, const char *format, ...);
 
 #define dassert_fileLine(e, file, line, func) \
-  (void)(printf("ERROR: file %s, line %d, function %s:\n\tfailed assertion: (%s)\n", file, line, func,e),abort())
-#define dassert_printf_fileLine(e, file, line, func, args...) \
-  (void)(printf("ERROR: file %s, line %d, function %s:\n\tfailed assertion: (%s)\n\t", file, line, func,e), printf(args), printf("\n"), abort())
+  (void)(eprintf_fileLine(stdout,"ERROR", func, file, line, \
+           "\n\tfailed assertion: (%s)", e), abort())
+//#define dassert_printf_fileLine(e, file, line, func, args...) \
+//  (void)(printf("ERROR: file %s, line %d, function %s:\n\tfailed assertion: (%s)\n\t", file, line, func,e), printf(args), printf("\n"), abort())
 
 // comment out the next line if __builtin_expect causes problems
 #define USE_GCC_OPTIMIZATION
@@ -59,16 +57,16 @@
 
 #define dassert_(e)  \
   ((void) ((e) ? (void)0 : dassert_fileLine(#e, __FILE__, __LINE__, __func__)))
-#define dassert_printf_(e, args...)  \
-  ((void) ((e) ? (void)0 : dassert_printf_fileLine(#e, __FILE__, __LINE__, __func__,##args)))
+//#define dassert_printf_(e, args...)  \
+//  ((void) ((e) ? (void)0 : dassert_printf_fileLine(#e, __FILE__, __LINE__, __func__,##args)))
 #else // ifdef USE_GCC_OPTIMIZATION
 // optimized version of preceding
 // #define assert(e) \
 // (__builtin_expect(!(e), 0) ? assert_fileLine (#e, __FILE__, __LINE__) : (void)0)
 #define dassert_(e)  \
   (__builtin_expect(!(e), 0) ? dassert_fileLine (#e, __FILE__, __LINE__, __func__) : (void)0)
-#define dassert_printf(e, args...)  \
-  (__builtin_expect(!(e), 0) ? dassert_printf_fileLine (#e, __FILE__, __LINE__, __func__,##args) : (void)0)
+//#define dassert_printf(e, args...)  \
+//  (__builtin_expect(!(e), 0) ? dassert_printf_fileLine (#e, __FILE__, __LINE__, __func__,##args) : (void)0)
 #endif // USE_GCC_OPTIMIZATION
 
 #if(MAX_ASSERT_LEVEL>=1)
@@ -126,14 +124,19 @@ extern "C" {
 #else
 #define builtin_expect(a,b) __builtin_expect(a,b)
 #endif
+// assert that two numbers differ / agree to within an absolute tolerance of 1e-14 (see fcmp)
 #define assert_not_almost_eq(lhs,rhs) \
   (fcmp(lhs, rhs, 1e-14) \
    ? (void)0 \
    : assert_error(__FILE__, __LINE__, __func__, " !=~= ", #lhs, #rhs, lhs, rhs))
 #define assert_almost_eq(lhs,rhs) \
-  (builtin_expect(fcmp(lhs, rhs, 1e-10),0) \
+  (builtin_expect(fcmp(lhs, rhs, 1e-14),0) \
    ? assert_error(__FILE__, __LINE__, __func__, " =~= ", #lhs, #rhs, lhs, rhs) \
    : (void)0)
+//#define assert_almost_eq(lhs,rhs) \
+//  (builtin_expect(fcmp((lhs-rhs)/(fabs(lhs)+fabs(rhs)),1e-14),0) \
+//   ? assert_error(__FILE__, __LINE__, __func__, " =~= ", #lhs, #rhs, lhs, rhs) \
+//   : (void)0)
 #define assert_divides(lhs,rhs) \
   (builtin_expect(rhs%lhs,0) \
    ? assert_error(__FILE__, __LINE__, __func__, "(divides)", #lhs, #rhs, lhs, rhs) \
diff --git a/include/debug.h b/include/debug.h
index 402d3819..fe1524c0 100644
--- a/include/debug.h
+++ b/include/debug.h
@@ -7,18 +7,20 @@
 #include <cstdarg>
 #include <cstdio>
 
-#include "debug.h"
+#include "errors.h"
 
-void dfprintf_fileLine(FILE * fptr, const char *func, const char *file, int line_number, const char *format, ...);
+void fprintf_fileLine(FILE * fptr, const char *type, const char *func,
+  const char *file, int line_number, const char *format, ...);
 
-#define dprintf(args...) dfprintf_fileLine(stdout, __func__, __FILE__, __LINE__,## args)
-#define dprint(var) dprintvar_fileLine(__func__,__FILE__,__LINE__,#var,var);
+#define dprintf(args...) fprintf_fileLine(stdout, "DEBUG", __func__, __FILE__, __LINE__,## args)
+#define dprint(var) printvar_fileLine(__func__, __FILE__,__LINE__,#var,var);
 #define dprint0(var) dprint(var)
 #define declare_dprintvar_fileLine(type) \
-void dprintvar_fileLine(const char*,const char*,int,const char*,type);
+void printvar_fileLine(const char*,const char*,int,const char*,type);
 
 declare_dprintvar_fileLine(int);
 declare_dprintvar_fileLine(double);
 declare_dprintvar_fileLine(const char *);
+declare_dprintvar_fileLine(const void *);
 
 #endif
diff --git a/include/errors.h b/include/errors.h
index ca80313f..ace6b5f2 100644
--- a/include/errors.h
+++ b/include/errors.h
@@ -1,16 +1,21 @@
 #ifndef ipic_errors_H
 #define ipic_errors_H
 
-void errmsg_printf_fileLine(const char *func, const char *file, int line_number, const char *format, ...);
-void eprintf_fileLine(const char *func, const char *file, int line_number, const char *format, ...);
-void Wprintf_fileLine(const char *func, const char *file, int line_number, const char *format, ...);
+//void errmsg_printf_fileLine(const char *func, const char *file, int line_number, const char *format, ...);
+//void eprintf_fileLine(const char *func, const char *file, int line_number, const char *format, ...);
+//void Wprintf_fileLine(const char *func, const char *file, int line_number, const char *format, ...);
+void eprintf_fileLine(FILE * fptr, const char *type,
+  const char *func, const char *file, int line_number,
+  const char *format, ...);
 
-#define errmsg_printf(args...) \
-  errmsg_printf_fileLine(__func__, __FILE__, __LINE__, ## args);
 #define eprintf(args...) \
-  errmsg_printf_fileLine(__func__, __FILE__, __LINE__, ## args);
-#define Wprintf(args...) \
-  Wprintf_fileLine(__func__, __FILE__, __LINE__, ## args);
+  eprintf_fileLine(stdout,"ERROR",__func__, __FILE__, __LINE__, ## args);
+#define error_printf(args...) \
+  eprintf_fileLine(stdout,"ERROR",__func__, __FILE__, __LINE__, ## args);
+//#define eprintf(args...) \
+//  eprintf_fileLine("ERROR",__func__, __FILE__, __LINE__, ## args);
+#define warning_printf(args...) \
+  eprintf_fileLine(stdout,"WARNING",__func__, __FILE__, __LINE__, ## args);
 #define declare_invalid_value_error(t1) \
   void invalid_value_error_fileLine(const char* file, int line, const char* func, \
     const char* type, const char* expr, t1 val);
diff --git a/utility/asserts.cpp b/utility/asserts.cpp
index 312fe79b..5b58a1ae 100644
--- a/utility/asserts.cpp
+++ b/utility/asserts.cpp
@@ -1,23 +1,47 @@
 
+#ifndef NO_MPI
+  #include "MPIdata.h" // for get_rank
+#endif
+#include "ompdefs.h" // for omp_get_thread_num
 #include <iostream>
 #include "asserts.h"
 
 void assert_error(const char *file, int line, const char *func, const char *op, const char *lhs_str, const char *rhs_str, double lhs, double rhs) {
-  fprintf(stdout, "ERROR in file %s, line %d, function %s" "\n\tassertion failed: %s %s %s, i.e., %24.16e %s %24.16e\n", file, line, func, lhs_str, op, rhs_str, lhs, op, rhs);
+
+  eprintf_fileLine(stdout, "ERROR", func,file,line,
+    "\n\tassertion failed: %s %s %s, i.e., %24.16e %s %24.16e\n", lhs_str, op, rhs_str, lhs, op, rhs);
   abort();
 }
 
-#define implement_assert_errmsg(t1,t2) \
-  void assert_error(const char* file, int line, const char* func, \
-    const char* op, const char* lhs_str, const char* rhs_str, \
-    t1 lhs, t2 rhs) \
-  { \
-    std::cerr<< "ERROR in file " << file << ", line " << line  \
-      << ", function " << func  \
-      <<"\n\tassertion failed: " << lhs_str << op << rhs_str \
-      << ", i.e., " << lhs << op << rhs << std::endl; \
-      abort(); \
-  }
+#ifndef NO_MPI
+  #ifdef _OPENMP
+    #define process_string \
+      std::cerr << "(" << MPIdata::get_rank() << "." <<  omp_get_thread_num() << ")";
+  #else
+    #define process_string \
+      std::cerr << "(" << MPIdata::get_rank() << ")";
+  #endif
+#else
+  #ifdef _OPENMP
+    #define process_string \
+      std::cerr << "(." << omp_get_thread_num() << ")";
+  #else
+    #define process_string 
+  #endif
+#endif
+
+ #define implement_assert_errmsg(t1,t2) \
+   void assert_error(const char* file, int line, const char* func, \
+     const char* op, const char* lhs_str, const char* rhs_str, \
+     t1 lhs, t2 rhs) \
+   { \
+     process_string \
+     std::cerr << " ERROR in file " << file << ", line " << line  \
+       << ", function " << func  \
+       <<"\n\tassertion failed: " << lhs_str << op << rhs_str \
+       << ", i.e., " << lhs << op << rhs << std::endl; \
+       abort(); \
+   }
 
 implement_assert_errmsg(size_t, size_t);
 implement_assert_errmsg(int, size_t);
@@ -25,3 +49,117 @@ implement_assert_errmsg(size_t, int);
 implement_assert_errmsg(int, int);
 implement_assert_errmsg(long long, long long);
 implement_assert_errmsg(const char *, const char *);
+
+/*
+ fcmp
+ Copyright (c) 1998-2000 Theodore C. Belding
+ University of Michigan Center for the Study of Complex Systems
+ <mailto:Ted.Belding@umich.edu>
+ <http://www-personal.umich.edu/~streak/>		
+
+ This file is part of the fcmp distribution. fcmp is free software;
+ you can redistribute and modify it under the terms of the GNU Library
+ General Public License (LGPL), version 2 or later.  This software
+ comes with absolutely no warranty. See the file COPYING for details
+ and terms of copying.
+
+ File: fcmp.h 
+
+ Description:
+ 
+ Knuth's floating point comparison operators, from:
+ Knuth, D. E. (1998). The Art of Computer Programming.
+ Volume 2: Seminumerical Algorithms. 3rd ed. Addison-Wesley.
+ Section 4.2.2, p. 233. ISBN 0-201-89684-2.
+
+ Input parameters:
+ x1, x2: numbers to be compared
+ epsilon: determines tolerance
+
+ epsilon should be carefully chosen based on the machine's precision,
+ the observed magnitude of error, the desired precision, and the
+ magnitude of the numbers to be compared. See the fcmp README file for
+ more information.
+
+ This routine may be used for both single-precision (float) and
+ double-precision (double) floating-point numbers.
+ 
+ Returns:
+ -1 if x1 < x2
+  0 if x1 == x2
+  1 if x1 > x2
+*/
+
+/*
+ fcmp
+ Copyright (c) 1998-2000 Theodore C. Belding
+ University of Michigan Center for the Study of Complex Systems
+ <mailto:Ted.Belding@umich.edu>
+ <http://www-personal.umich.edu/~streak/>		
+
+ This file is part of the fcmp distribution. fcmp is free software;
+ you can redistribute and modify it under the terms of the GNU Library
+ General Public License (LGPL), version 2 or later.  This software
+ comes with absolutely no warranty. See the file COPYING for details
+ and terms of copying.
+
+ File: fcmp.c
+
+ Description: see fcmp.h and README files.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+int fcmp(double x1, double x2, double epsilon)
+{
+  double diff = x1-x2; // active implementation: plain absolute tolerance
+  if(diff>epsilon) return 1;
+  if(diff<-epsilon) return -1;
+  return 0;
+
+  // NOTE: everything below this point is unreachable because of the return above.
+  // It is the original relative-tolerance test, which was failing for some reason. -eaj
+  int exponent;
+  double delta;
+  double difference;
+  
+  /* Get exponent(max(fabs(x1), fabs(x2))) and store it in exponent. */
+
+  /* If neither x1 nor x2 is 0, */
+  /* this is equivalent to max(exponent(x1), exponent(x2)). */
+
+  /* If either x1 or x2 is 0, its exponent returned by frexp would be 0, */
+  /* which is much larger than the exponents of numbers close to 0 in */
+  /* magnitude. But the exponent of 0 should be less than any number */
+  /* whose magnitude is greater than 0. */
+  
+  /* So we only want to set exponent to 0 if both x1 and */
+  /* x2 are 0. Hence, the following works for all x1 and x2. */
+
+  frexp(fabs(x1) > fabs(x2) ? x1 : x2, &exponent);
+
+  /* Do the comparison. */
+
+  /* delta = epsilon * pow(2, exponent) */
+
+  /* Form a neighborhood around x2 of size delta in either direction. */
+  /* If x1 is within this delta neighborhood of x2, x1 == x2. */
+  /* Otherwise x1 > x2 or x1 < x2, depending on which side of */
+  /* the neighborhood x1 is on. */
+  
+  delta = ldexp(epsilon, exponent); 
+  
+  difference = x1 - x2;
+
+  if (difference > delta)
+    return 1; /* x1 > x2 */
+  else if (difference < -delta) 
+    return -1;  /* x1 < x2 */
+  else /* -delta <= difference <= delta */
+    return 0;  /* x1 == x2 */
+}
+
diff --git a/utility/asserts.o b/utility/asserts.o
new file mode 100644
index 0000000000000000000000000000000000000000..27b794781d3db901fba768c5bda8c7f223843b9b
GIT binary patch
literal 7624
zcmdU!Z)_aZ5y0QsAvS4xb|?jcC_a!K1(a;ge@H_V=$*+WcVH*M=OhXeY}a>t&6fKU
zZa2l2Qj*9hXa%J~0*QPGpa}6#sNzFEMEq+5DER<Yf<TIps>&b~wf)dGNKmDvz|7vv
zdG6uo5c(-kzI*e2J2P+I?Ce|b-rmf}hPGI&T~UjvTUCq)qVK$Vc94q1l<HDHn1z~z
zJ{G96Uu#$`2hgtCbgZn1H^XQiAB^~d=9q=G5x<0o&3+W5HZ{wWbYwg9F_@Vfukix_
z?P?+8X1^aoY*!u4F$*t5ypxB`{%FK6ZH`$u7V$+qZ1yRTy5X4KT;w{0@vja12?IZ3
z;4cHe4D!QynSn93v+$W~-ZhY@xp>XMUo!C54Ez{yjHerVY%|ffQlAq4m&Q9FXLH4~
z1UULRMLA_ZL&RU9Q_p<6!OuOUe~?OU3&rz{L7xt*ZGS-NulN;7Z=&=ner2cM>HHfC
zT|MjZ>p_%NdOW=~y)C^xzja&qc{cp-tkGzAax^@i3;&c0KaBik!xP!?FDyomc@X}C
zr4~FAo-yz*n2*ewj)Y$^ZSj(QIOt9B5uuMf%Y4=%BBhIHeTL=Dv%0Px*z|h#UF7F)
z-UVJn>oc6c`O34a=OS9)|8@Os8x7xvoo6N;ewGVQWyAkO7b@PRRQBOpu!6JU;@**P
zq_=Z4{3r{3_xkYt##KH2P9hR$!`g@1nX$3WV+pUCD0yW!q11{**{g!EB2n61Ed*Yz
znox@_b$s8gM}v|Bdd0P%iMpQUS}P*C)lZ*Ke)@{SQJ+7Tr#?Rs{js;cfg!7ZwHv{P
zh<DkQMYrVaE(c|AVl8|ot2=9ckdSn;Y9m`cR7_ls(Lsx;*kt$Oj?RO0v$+Nsez+w`
zNXNVHj&~}x4#xQ5HbcU2y!(;1;rLa*X&;U!9$hdTU;f)m((&Z<!gPGifsS-MMS5tv
zhh*$#NWKvJ*U1LE8Imu={!6mKZieIwv0q2I`abr(V6;cr^Gz1=yD^@;Ki<5-Th^-G
zfmil|sa1{A@~y(IUCQ!<df-ezKUInPq7yjEn(%#P6@pqFy1<<blx-LNn!VGh7RxRS
z{J^vA-i)&E$_Hy~+pBq150viO1VgXp+Y^rO4&{K|)}Q&+Tzq}R?ZTvEJ1{|HoDYTy
zZoLlb{R52{Z5Y<-Tka@$`J}D=DXU(?U~vWg){wGB>;c=(kB{XxZM6EM$jW{$AN2PZ
zb~<%Cs5@TZ=L(t37_dPI&@0$bymi;9<O<_<CV!I+6ScY-h(fQG{fyUoV6b(l&3;of
z`;ZIC$`!s_1mpI25@J|wH)q_kVJu(E+Zp9d)asbq`IF*#=T)ho%BDOnr*d9p@BAv2
zit>YEZTI<uD;8VN*5E*E{*tP+tL{~UlCr9`z_m7R+PcyYoWi|Ogtbb=tp>_+ciN@8
zQ*ptQTXwiOBRQ(j25xH6br?N?%|6566HTMx4%fj-LdN59_<Z?wqb(*K)bAo0yCo#+
zey$Mwha_XyLo)iui<14PiK8F(iD-l1uunD{K7C<BKR<#G`<XxpP3OOCV1@qC4{r!H
z;=2Bn;JB%?|NX?-&s8)&AoOKCj~VzO%@1y1*q&;1--`c51OL5&NB{l9F@0SpNT1`!
z8wa-c8r`?_KQi!74IH2Pu<7%jYv3sVKPDNwuSiCHtQpR8J8kC3uN3?`;uycIlVyU-
zI{B&aBhSmQf#0tA;X2uD(BC1rtP@A`!*x;;`ZAvT417xS!*%k2K|eI`#|`{x1Am^l
zUMDXb^p6TI^YFIda=j-7m-?rP<Giv?{w(xmoqQ~~tdkfP4m7<^778xw<Vxb$$vCe!
z@EbKhTqmmx`Zo(M*PGV-aGh)r`ZAt74Sa{@hwB92K48O6`tb}rFz~$w{t$7!P98Dn
z9}*m20P^+ztl%=vR|J=Ni=I2cUaylkgubkkw*;4U@-cC}PR<A}>*U|UkBqa8-Ui`e
z$rlml{BWIg8T6M5F4udV=7;NKnb4Q>_8a)knjfx{TMha-10OeV+rWQ8ob$l<ow7lH
zQgE4vX~E@s4+<{p>Pg~yog5bWvQCZ&F6-o&@FVX##|4*la$5M2aeiXpf7kqQo&3|F
ze@<|jpV$H@Kxmw2u9Jnt_4-_5;7c_>y$!eOT7&)#20moqKQr)+#5sPhlM#dd4#DMl
zFA6U6f1lt|e~P$XCl3gHStrwi%Q`tC{K$9rzZ3dDfn4&wcS3Od-<bI+!KMEMod>+_
z$^Yk58c))ApT_yE_#ZXSZ^iki8urg`zf*LdWzKKE_i3Epawj<en&-m11enlf+hUyJ
z5nTcJixECyP{I^f{BmFz9I1iSXek{KXM5Je-;eN$*45cw*Vo5`U_0OZa<{-a_3>J=
z*MCXMQb04>guG**FKi#c2j?HafoRKVqO4#jvO7AVYCj}@F8f_%|8vbgL1VVZ-?ikE
zC99v(7pmE@9^T>E7c=bHex-yU>$=Ct9=ewHKbrp;&4Aa>%2-GGxyNINaf^U<R&%Vm
zQ}4oY-6z^TN{_=_r)Y<IoEGM%4ffBG{Zaa<6zy>*(Ct4l*x$1hl%c)Q=%T#wZi|TL
F{|1634<Y~n

literal 0
HcmV?d00001

diff --git a/utility/debug.cpp b/utility/debug.cpp
index 50ad6e01..bd4daa9f 100644
--- a/utility/debug.cpp
+++ b/utility/debug.cpp
@@ -2,35 +2,117 @@
 #ifndef NO_MPI
   #include "MPIdata.h" // for get_rank
 #endif
+#include "ompdefs.h" // for omp_get_thread_num
 #include "debug.h"
 
 #define implement_dprintvar_fileLine(code,type) \
-  void dprintvar_fileLine(const char* func, const char* file, int line, \
+  void printvar_fileLine(const char* func, const char* file, int line, \
     const char* name, type val) \
   { \
-    dfprintf_fileLine(stdout,func,file,line, code " == %s",val,name); \
+    fprintf_fileLine(stdout,"DEBUG", func,file,line, code " == %s",val,name); \
   }
 
 implement_dprintvar_fileLine("%s", const char *);
 implement_dprintvar_fileLine("%d", int);
-implement_dprintvar_fileLine("%f", double);
+implement_dprintvar_fileLine("%g", double);
+implement_dprintvar_fileLine("%p", const void *);
 
-void dfprintf_fileLine(FILE * fptr, const char *func, const char *file, int line_number, const char *format, ...) {
-  fflush(fptr);
+// void dfprintf_fileLine(FILE * fptr, const char *func, const char *file, int line_number, const char *format, ...)
+// {
+//   // writing directly to fptr would avoid limiting the length
+//   // of the output string, but by first writing to a string
+//   // we achieve thread safety.
+//   //
+//   // write the message to a string.
+//   //
+//   const int maxchars = 1024;
+//   char error_msg[maxchars+2];
+//   // identify the process and thread
+//   char process_thread_str[20];
+//   #ifndef NO_MPI
+//     #ifdef _OPENMP
+//       snprintf(process_thread_str, 20, "(%d.%d) ",
+//         MPIdata::get_rank(), omp_get_thread_num());
+//     #else
+//       snprintf(process_thread_str, 20, "(%d)",
+//         MPIdata::get_rank());
+//     #endif
+//   #else
+//     #ifdef _OPENMP
+//       snprintf(process_thread_str, 20, "(.%d) ",
+//         omp_get_thread_num());
+//     #else
+//       snprintf(process_thread_str, 20, "");
+//     #endif
+//   #endif
+//   char *sptr = error_msg;
+//   int chars_so_far=0;
+//   va_list args;
+//   va_start(args, format);
+//   chars_so_far = snprintf(sptr, maxchars,
+//     "%sDEBUG %s(), %s:%d: ",
+//     process_thread_str,
+//     func, file, // my_basename(file),
+//     line_number);
+//   /* print out remainder of message */
+//   chars_so_far += vsnprintf(sptr+chars_so_far, maxchars-chars_so_far, format, args);
+//   va_end(args);
+//   sprintf(sptr+chars_so_far, "\n");
+// 
+//   // print the message
+//   fflush(fptr);
+//     fprintf(fptr,error_msg);
+//   fflush(fptr);
+// }
+
+void fprintf_fileLine(FILE * fptr,
+  const char *type, const char *func, const char *file, int line_number,
+  const char *format, ...)
+{
+  // Format the whole message into a fixed-size buffer first and hand it to
+  // stdio in a single call (the original motivation: thread safety).
+  // Messages longer than maxchars are truncated; offsets are clamped to the buffer.
+  const int maxchars = 1024;
+  char error_msg[maxchars+2];
+  // identify the process and thread
+  char process_thread_str[20];
+  #ifndef NO_MPI
+    #ifdef _OPENMP
+      snprintf(process_thread_str, 20, "(%d.%d) ",
+        MPIdata::get_rank(), omp_get_thread_num());
+    #else
+      snprintf(process_thread_str, 20, "(%d) ",
+        MPIdata::get_rank());
+    #endif
+  #else
+    #ifdef _OPENMP
+      snprintf(process_thread_str, 20, "(.%d) ",
+        omp_get_thread_num());
+    #else
+      process_thread_str[0] = '\0';
+    #endif
+  #endif
+  char *sptr = error_msg;
+  int chars_so_far=0;
   va_list args;
   va_start(args, format);
-  fprintf(fptr,
-#ifndef NO_MPI
-    "(%d) DEBUG %s(), %s:%d: ",
-    MPIdata::get_rank(),
-#else
-    "DEBUG %s(), %s:%d: ",
-#endif
+  chars_so_far = snprintf(sptr, maxchars,
+    "%s%s %s(), %s:%d: ",
+    process_thread_str,
+    type,
     func, file, // my_basename(file),
     line_number);
   /* print out remainder of message */
-  vfprintf(fptr, format, args);
+  chars_so_far = chars_so_far<0 ? 0 : (chars_so_far<maxchars ? chars_so_far : maxchars-1); // snprintf returns the untruncated length (negative on error)
+  const int msg_chars = vsnprintf(sptr+chars_so_far, maxchars-chars_so_far, format, args);
   va_end(args);
-  fprintf(fptr, "\n");
+  if(msg_chars > 0) chars_so_far += msg_chars;
+  if(chars_so_far >= maxchars) chars_so_far = maxchars-1; // message was truncated
+  sprintf(sptr+chars_so_far, "\n");
+
+  // print the message (fputs: the text is data, not a format string)
+  fflush(fptr);
+  fputs(error_msg, fptr);
   fflush(fptr);
 }
+
diff --git a/utility/errors.cpp b/utility/errors.cpp
index 3572df99..9d5d66a9 100644
--- a/utility/errors.cpp
+++ b/utility/errors.cpp
@@ -1,4 +1,8 @@
  
+#ifndef NO_MPI
+  #include "MPIdata.h" // for get_rank
+#endif
+#include "ompdefs.h" // for omp_get_thread_num
 #include <cstdarg>
 #include <cstdio>
 #include <cstdlib>
@@ -7,25 +11,32 @@
 
 /** implementation of declarations in errors.h **/
 
-void errmsg_printf_fileLine(const char *func, const char *file, int line_number,
-  const char *format, ...)
-{
-  FILE* fptr = stdout;
-  fflush(fptr);
-  va_list args;
-  va_start(args, format);
-  fprintf(fptr, "ERROR in function %s, file %s, line %d: \n\t",
-    func, file, line_number);
-  /* print out remainder of message */
-  vfprintf(fptr, format, args);
-  va_end(args);
-  // append terminating newline so user does not have to do it
-  fprintf(fptr, "\n");
-  fflush(fptr);
+void fprintf_fileLine(FILE * fptr, const char *type, const char *func, const char *file, int line_number, const char *format, ...);
 
-  abort();
-}
+// This is not thread-safe.
+//void errmsg_printf_fileLine(const char *func, const char *file, int line_number,
+//  const char *format, ...)
+//{
+//  FILE* fptr = stdout;
+//  fflush(fptr);
+//  va_list args;
+//  va_start(args, format);
+//  fprintf(fptr, "ERROR in function %s, file %s, line %d: \n\t",
+//    func, file, line_number);
+//  /* print out remainder of message */
+//  vfprintf(fptr, format, args);
+//  va_end(args);
+//  // append terminating newline so user does not have to do it
+//  fprintf(fptr, "\n");
+//  fflush(fptr);
+//
+//  abort();
+//}
 
+// This needs to be fixed to be thread-safe like
+// eprintf_fileLine() below.  Write the message to a string and
+// then print it out as an atomic operation.
+//
 #include <iostream>
 using namespace std;
 #define implement_invalid_value_error(t1) \
@@ -42,3 +53,56 @@ implement_invalid_value_error(double);
 implement_invalid_value_error(int);
 implement_invalid_value_error(const char*);
 
+/*! a more verbose version of fprintf_fileLine for use in warnings and
+ * error messages; prints the message to fptr and then calls abort() */
+void eprintf_fileLine(FILE * fptr, const char *type,
+  const char *func, const char *file, int line_number,
+  const char *format, ...)
+{
+  // Format the whole message into a fixed-size buffer first and hand it to
+  // stdio in a single call (the original motivation: thread safety).
+  // Messages longer than maxchars are truncated; offsets are clamped to the buffer.
+  const int maxchars = 1024;
+  char error_msg[maxchars+2];
+  // identify the process and thread
+  char process_thread_str[50];
+  #ifndef NO_MPI
+    #ifdef _OPENMP
+      snprintf(process_thread_str, 50, ", process %d, thread %d",
+        MPIdata::get_rank(), omp_get_thread_num());
+    #else
+      snprintf(process_thread_str, 50, ", process %d",
+        MPIdata::get_rank());
+    #endif
+  #else
+    #ifdef _OPENMP
+      snprintf(process_thread_str, 50, ", thread %d",
+        omp_get_thread_num());
+    #else
+      process_thread_str[0] = '\0';
+    #endif
+  #endif
+  char *sptr = error_msg;
+  int chars_so_far=0;
+  va_list args;
+  va_start(args, format);
+  chars_so_far = snprintf(sptr, maxchars,
+    "%s in method %s(), file %s, line %d%s:\n\t",
+    type,
+    func, file, // my_basename(file),
+    line_number, process_thread_str);
+  /* print out remainder of message */
+  chars_so_far = chars_so_far<0 ? 0 : (chars_so_far<maxchars ? chars_so_far : maxchars-1); // snprintf returns the untruncated length (negative on error)
+  const int msg_chars = vsnprintf(sptr+chars_so_far, maxchars-chars_so_far, format, args);
+  va_end(args);
+  if(msg_chars > 0) chars_so_far += msg_chars;
+  if(chars_so_far >= maxchars) chars_so_far = maxchars-1; // message was truncated
+  sprintf(sptr+chars_so_far, "\n");
+
+  // print the message (fputs: the text is data, not a format string)
+  fflush(fptr);
+    // #pragma omp critical // need this?
+    { fputs(error_msg, fptr); }
+  fflush(fptr);
+  abort();
+}
diff --git a/utility/new/diagnostics.cpp b/utility/new/diagnostics.cpp
new file mode 100644
index 00000000..3b8d7372
--- /dev/null
+++ b/utility/new/diagnostics.cpp
@@ -0,0 +1,183 @@
+
+/** implementation of declarations in utility/debug.h **/
+
+#include <stdarg.h>
+#include "TimeTasks.h"
+#include "debug.h"
+#include "asserts.h"
+#include "../mpidata/MPIdata.h" // for rank
+
+#define implement_dprintvar_fileLine(code,type) \
+  void dprintvar_fileLine(const char* func, const char* file, int line, \
+    const char* name, type val) \
+  { \
+    dfprintf_fileLine(stderr,func,file,line, \
+      code " == %s",val,name); \
+  }
+implement_dprintvar_fileLine("%s",const char*);
+implement_dprintvar_fileLine("%d",int);
+//implement_dprintvar_fileLine("%24.16e",double);
+implement_dprintvar_fileLine("%f",double);
+
+void dfprintf_fileLine(FILE* fptr, const char *func, const char *file, int line_number,
+  const char *format, ...)
+{
+  fflush(fptr);
+  va_list args;
+  va_start(args, format);
+  fprintf(fptr, "(%d) DEBUG %s(), %s:%d: ",
+    get_rank(), func,
+    file, // my_basename(file),
+    line_number);
+  /* print out remainder of message */
+  vfprintf(fptr, format, args);
+  va_end(args);
+  fprintf(fptr,"\n");
+  fflush(fptr);
+}
+
+int get_rank() { return mpi->rank; }
+
+/** implementation of declarations in utility/assert.h **/
+
+// so that we can print doubles to desired precision
+//
+void assert_error(const char* file, int line, const char* func,
+  const char* op, const char* lhs_str, const char* rhs_str,
+  double lhs, double rhs)
+{
+  fprintf(stderr,"ERROR in file %s, line %d, function %s"
+      "\n\tassertion failed: %s %s %s, i.e., %24.16e %s %24.16e\n",
+    file, line, func, lhs_str, op, rhs_str, lhs, op, rhs);
+  abort();
+}
+
+#define implement_assert_errmsg(t1,t2) \
+  void assert_error(const char* file, int line, const char* func, \
+    const char* op, const char* lhs_str, const char* rhs_str, \
+    t1 lhs, t2 rhs) \
+  { \
+    std::cerr<< "ERROR in file " << file << ", line " << line  \
+      << ", function " << func  \
+      <<"\n\tassertion failed: " << lhs_str << op << rhs_str \
+      << ", i.e., " << lhs << op << rhs << std::endl; \
+      abort(); \
+  }
+
+implement_assert_errmsg(int,int);
+implement_assert_errmsg(const char*,const char*);
+implement_assert_errmsg(const string&,const string&);
+
+/** implementation of declarations in utility/TimeTasks.h **/
+
+void TimeTasks::resetCycle()
+{
+  for(int e=0;e<LAST;e++)
+  {
+    //compute[e]=0.;
+    start_times[e]=0.;
+    task_duration[e]=0.;
+    communicate[e]=0.;
+  }
+  active_task=NONE;
+  active_mode=COMPUTATION;
+  t_start_communicate = 0.;
+}
+void TimeTasks::start(int taskid)
+{
+  assert_eq(active_task+1,taskid);
+  active_task = taskid;
+  double now = MPI_Wtime();
+  start_times[active_task] = now;
+}
+void TimeTasks::end(int taskid)
+{
+  assert_eq(taskid,active_task);
+  double now = MPI_Wtime();
+  task_duration[active_task] = now - start_times[active_task];
+  compute[active_task] = task_duration[active_task]-communicate[active_task];
+}
+void TimeTasks::start_communicate()
+{
+  if(!active_task) return;
+  assert_eq(active_mode,COMPUTATION);
+  t_start_communicate = MPI_Wtime();
+  active_mode=COMMUNICATION;
+}
+void TimeTasks::addto_communicate()
+{
+  if(!active_task) return;
+  assert_eq(active_mode,COMMUNICATION);
+  assert_ne(t_start_communicate,0.);
+  communicate[active_task] += MPI_Wtime()-t_start_communicate;
+  t_start_communicate = 0.;
+  active_mode=COMPUTATION;
+}
+#define TIMING_PREFIX "| "
+void TimeTasks::print_cycle_times()
+{
+  if(!get_rank())
+  {
+    fflush(stdout);
+    fprintf(stdout,"=== timing information for cycle=== \n");
+    fprintf(stdout, TIMING_PREFIX
+      "moms flds pcls Bfld cycl\n");
+    fprintf(stdout, TIMING_PREFIX
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f (total time)\n",
+      get_time(TimeTasks::MOMENTS),
+      get_time(TimeTasks::FIELDS),
+      get_time(TimeTasks::PARTICLES),
+      get_time(TimeTasks::BFIELD),
+      get_time()
+      );
+    fprintf(stdout, TIMING_PREFIX
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f (communication)\n",
+      get_communicate(TimeTasks::MOMENTS),
+      get_communicate(TimeTasks::FIELDS),
+      get_communicate(TimeTasks::PARTICLES),
+      get_communicate(TimeTasks::BFIELD),
+      get_communicate()
+      );
+    fprintf(stdout, TIMING_PREFIX
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f "
+      "%4.2f (computation)\n",
+      get_compute(TimeTasks::MOMENTS),
+      get_compute(TimeTasks::FIELDS),
+      get_compute(TimeTasks::PARTICLES),
+      get_compute(TimeTasks::BFIELD),
+      get_compute()
+      );
+    fprintf(stdout, TIMING_PREFIX
+      "MOMS comm  FLDS comm  PCLS comm  CYCL comm\n");
+    fprintf(stdout, TIMING_PREFIX
+      "%4.2f "
+      "%4.2f  "
+      "%4.2f "
+      "%4.2f  "
+      "%4.2f "
+      "%4.2f  "
+      "%4.2f "
+      "%4.2f\n",
+      get_time(TimeTasks::MOMENTS),
+      get_communicate(TimeTasks::MOMENTS),
+      get_time(TimeTasks::FIELDS),
+      get_communicate(TimeTasks::FIELDS),
+      get_time(TimeTasks::PARTICLES),
+      get_communicate(TimeTasks::PARTICLES),
+      get_time(),
+      get_communicate()
+      );
+    fflush(stdout);
+  }
+}

From 44e99ea753de2510416ce2929abe129e5675d6a8 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 13 Jan 2014 14:29:43 +0100
Subject: [PATCH 075/118] fixed bug in sumMoments_vectorized()

---
 fields/EMfields3D.cpp         | 174 ++++++++++++++++++++++++++--------
 fields/Moments.cpp            |  17 ++--
 include/Alloc.h               |  11 ++-
 include/EMfields3D.h          |   4 +-
 main/Parameters.cpp           |  10 +-
 main/iPic3Dlib.cpp            |  27 ++++--
 particles/Particles3D.cpp     |  30 ++++--
 particles/Particles3Dcomm.cpp |   6 +-
 8 files changed, 208 insertions(+), 71 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index a059fc77..b7bd9b32 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -192,7 +192,8 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) :
   injFieldsFront  = new injInfoFields(nxn, nyn, nzn);
   injFieldsRear   = new injInfoFields(nxn, nyn, nzn);
 
-  sizeMomentsArray = omp_get_max_threads();
+  // EDIT: delete "+ns" before checking in
+  sizeMomentsArray = omp_get_max_threads()+ns;
   moments10Array = new Moments10*[sizeMomentsArray];
   for(int i=0;i<sizeMomentsArray;i++)
   {
@@ -380,6 +381,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
   // subarrays.
   //#ifdef _OPENMP
   #pragma omp parallel
+  {
   for (int i = 0; i < ns; i++)
   {
     const Particles3Dcomm& pcls = part[i];
@@ -399,8 +401,16 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
     int thread_num = omp_get_thread_num();
     if(!thread_num) { timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION); }
     Moments10& speciesMoments10 = fetch_moments10Array(thread_num);
-    speciesMoments10.set_to_zero();
     arr4_double moments = speciesMoments10.fetch_arr();
+    //
+    // moments.setmode(ompmode::mine);
+    // moments.setall(0.);
+    // 
+    double *moments1d = &moments[0][0][0][0];
+    int moments1dsize = moments.get_size();
+    for(int i=0; i<moments1dsize; i++) moments1d[i]=0;
+    //
+    #pragma omp barrier
     // The following loop is expensive, so it is wise to assume that the
     // compiler is stupid.  Therefore we should on the one hand
     // expand things out and on the other hand avoid repeating computations.
@@ -534,6 +544,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
     // when we change to use asynchronous communication.
     // communicateGhostP2G(is, 0, 0, 0, 0, vct);
   }
+  }
   for (int i = 0; i < ns; i++)
   {
     communicateGhostP2G(i, 0, 0, 0, 0, vct);
@@ -553,6 +564,7 @@ void EMfields3D::sumMoments_vectorized(
   const double ystart = grid->getYstart();
   const double zstart = grid->getZstart();
   #pragma omp parallel
+  {
   for (int species_idx = 0; species_idx < ns; species_idx++)
   {
     const Particles3Dcomm& pcls = part[species_idx];
@@ -570,14 +582,61 @@ void EMfields3D::sumMoments_vectorized(
     const int nop = pcls.getNOP();
     #pragma omp master
     { timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION); }
-    Moments10& speciesMoments10 = fetch_moments10Array(0);
-    speciesMoments10.set_to_zero();
+    // EDIT change arr_idx to 0 before checking this in!!
+    const int arr_idx = omp_get_max_threads()+is;
+    Moments10& speciesMoments10 = fetch_moments10Array(arr_idx);
     arr4_double moments = speciesMoments10.fetch_arr();
-    #pragma omp for collapse(2) // schedule(static)
-    for(int cx=0;cx<nxc;cx++)
-    for(int cy=0;cy<nyc;cy++)
+    //
+    // moments.setmode(ompmode::ompfor);
+    //moments.setall(0.);
+    double *moments1d = &moments[0][0][0][0];
+    int moments1dsize = moments.get_size();
+    #pragma omp for // because shared
+    for(int i=0; i<moments1dsize; i++) moments1d[i]=0;
+    //
+    #pragma omp for collapse(2)
+    for(int cx=0;cx<nxc; cx++)
+    for(int cy=0;cy<nyc; cy++)
+    for(int cz=0;cz<nzc; cz++)
+    for(int m=0;m<10;m++)
+    {
+      assert_eq(moments[cx][cy][cz][m],0.);
+      //if(moments[cx][cy][cz][m]!=0.)
+      //{
+      //  eprintf("moments[%d][%d][%d][%d]==%g", cx, cy, cz, m, moments[cx][cy][cz][m]);
+      //}
+    }
+    
+    for(int cxmod2=0; cxmod2<2; cxmod2++)
+    // threads should not be writing to the same location
+    #pragma omp for
+    for(int cx=cxmod2;cx<nxc;cx+=2) {
+    for(int cy=0;cy<nyc;cy++) {
     for(int cz=0;cz<nzc;cz++)
     {
+     //dprint(cz);
+     // index of interface to right of cell
+     const int ix = cx + 1;
+     const int iy = cy + 1;
+     const int iz = cz + 1;
+     {
+      // reference the 8 nodes to which we will
+      // write moment data for particles in this mesh cell.
+      //
+      arr1_double_fetch momentsArray[8];
+      arr2_double_fetch moments00 = moments[ix][iy];
+      arr2_double_fetch moments01 = moments[ix][cy];
+      arr2_double_fetch moments10 = moments[cx][iy];
+      arr2_double_fetch moments11 = moments[cx][cy];
+      momentsArray[0] = moments00[iz]; // moments000 
+      momentsArray[1] = moments00[cz]; // moments001 
+      momentsArray[2] = moments01[iz]; // moments010 
+      momentsArray[3] = moments01[cz]; // moments011 
+      momentsArray[4] = moments10[iz]; // moments100 
+      momentsArray[5] = moments10[cz]; // moments101 
+      momentsArray[6] = moments11[iz]; // moments110 
+      momentsArray[7] = moments11[cz]; // moments111 
+
       const int numpcls_in_cell = pcls.get_numpcls_in_bucket(cx,cy,cz);
       const int bucket_offset = pcls.get_bucket_offset(cx,cy,cz);
       const int bucket_end = bucket_offset+numpcls_in_cell;
@@ -619,19 +678,15 @@ void EMfields3D::sumMoments_vectorized(
         const double cxm1_pos = rel_xpos * inv_dx;
         const double cym1_pos = rel_ypos * inv_dy;
         const double czm1_pos = rel_zpos * inv_dz;
-        if(false)
-        {
-          const int cx_inf = int(floor(cxm1_pos));
-          const int cy_inf = int(floor(cym1_pos));
-          const int cz_inf = int(floor(czm1_pos));
-          assert_eq(cx-1,cx_inf);
-          assert_eq(cy-1,cy_inf);
-          assert_eq(cz-1,cz_inf);
-        }
-        // index of interface to right of cell
-        const int ix = cx + 1;
-        const int iy = cy + 1;
-        const int iz = cz + 1;
+        //if(true)
+        //{
+        //  const int cx_inf = int(floor(cxm1_pos));
+        //  const int cy_inf = int(floor(cym1_pos));
+        //  const int cz_inf = int(floor(czm1_pos));
+        //  assert_eq(cx-1,cx_inf);
+        //  assert_eq(cy-1,cy_inf);
+        //  assert_eq(cz-1,cz_inf);
+        //}
         // fraction of the distance from the right of the cell
         const double w1x = cx - cxm1_pos;
         const double w1y = cy - cym1_pos;
@@ -659,27 +714,17 @@ void EMfields3D::sumMoments_vectorized(
 
         // add particle to moments
         {
-          arr1_double_fetch momentsArray[8];
-          arr2_double_fetch moments00 = moments[ix][iy];
-          arr2_double_fetch moments01 = moments[ix][cy];
-          arr2_double_fetch moments10 = moments[cx][iy];
-          arr2_double_fetch moments11 = moments[cx][cy];
-          momentsArray[0] = moments00[iz]; // moments000 
-          momentsArray[1] = moments00[cz]; // moments001 
-          momentsArray[2] = moments01[iz]; // moments010 
-          momentsArray[3] = moments01[cz]; // moments011 
-          momentsArray[4] = moments10[iz]; // moments100 
-          momentsArray[5] = moments10[cz]; // moments101 
-          momentsArray[6] = moments11[iz]; // moments110 
-          momentsArray[7] = moments11[cz]; // moments111 
-
           for(int m=0; m<10; m++)
           for(int c=0; c<8; c++)
           {
             momentsArray[c][m] += velmoments[m]*weights[c];
+            assert_isnum(momentsArray[c][m]);
           }
         }
       }
+     }
+    }
+    }
     }
     #pragma omp master
     { timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION); }
@@ -689,8 +734,8 @@ void EMfields3D::sumMoments_vectorized(
     { timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION); }
     {
       #pragma omp for collapse(2)
-      for(int i=0;i<nxn;i++)
-      for(int j=0;j<nyn;j++)
+      for(int i=0;i<nxn;i++){
+      for(int j=0;j<nyn;j++){
       for(int k=0;k<nzn;k++)
       {
         rhons[is][i][j][k] = invVOL*moments[i][j][k][0];
@@ -703,7 +748,7 @@ void EMfields3D::sumMoments_vectorized(
         pYYsn[is][i][j][k] = invVOL*moments[i][j][k][7];
         pYZsn[is][i][j][k] = invVOL*moments[i][j][k][8];
         pZZsn[is][i][j][k] = invVOL*moments[i][j][k][9];
-      }
+      }}}
     }
     #pragma omp master
     { timeTasks_end_task(TimeTasks::MOMENT_REDUCTION); }
@@ -711,12 +756,67 @@ void EMfields3D::sumMoments_vectorized(
     // when we change to use asynchronous communication.
     // communicateGhostP2G(is, 0, 0, 0, 0, vct);
   }
+  }
   for (int i = 0; i < ns; i++)
   {
     communicateGhostP2G(i, 0, 0, 0, 0, vct);
   }
 }
 
+void EMfields3D::checkMoment(const Particles3Dcomm* part)
+{
+}
+
+void EMfields3D::checkMoments(const Particles3Dcomm* part)
+{
+  #pragma omp parallel
+  for (int species_idx = 0; species_idx < ns; species_idx++)
+  {
+    const Particles3Dcomm& pcls = part[species_idx];
+    const int is = pcls.get_ns();
+    assert_eq(species_idx,is);
+
+    const int nop = pcls.getNOP();
+    const int arr_idx = omp_get_max_threads()+is;
+    Moments10& speciesMoments10 = fetch_moments10Array(arr_idx);
+    arr4_double moments = speciesMoments10.fetch_arr();
+
+    //#pragma omp master
+    //eprintf("rhons[%d][1][1][1]:=%g but invVOL*moments[1][1][1][0]:=%g",
+    // is,
+    // getRHOns(is,1,1,1),
+    // // rhons[is][1][1][1],
+    // invVOL*moments[1][1][1][0]);
+    //#pragma omp barrier
+    // ghost and boundary cell values will be changed,
+    // but interior values should be the same.
+    #pragma omp for collapse(2)
+    for(int i=2;i<nxn-2;i++)
+    for(int j=2;j<nyn-2;j++)
+    for(int k=2;k<nzn-2;k++)
+    {
+      if(fcmp(rhons[is][i][j][k], invVOL*moments[i][j][k][0], 1e-14))
+      {
+        dprintf("rhons[%d][%d][%d][%d]:=%g but invVOL*moments[%d][%d][%d][0]:=%g",
+          is,i,j,k, rhons[is][i][j][k],
+          i,j,k, invVOL*moments[i][j][k][0]);
+        abort();
+      }
+      //assert_almost_eq(rhons[is][i][j][k], invVOL*moments[i][j][k][0]);
+      assert_almost_eq(rhons[is][i][j][k], invVOL*moments[i][j][k][0]);
+      assert_almost_eq(Jxs  [is][i][j][k], invVOL*moments[i][j][k][1]);
+      assert_almost_eq(Jys  [is][i][j][k], invVOL*moments[i][j][k][2]);
+      assert_almost_eq(Jzs  [is][i][j][k], invVOL*moments[i][j][k][3]);
+      assert_almost_eq(pXXsn[is][i][j][k], invVOL*moments[i][j][k][4]);
+      assert_almost_eq(pXYsn[is][i][j][k], invVOL*moments[i][j][k][5]);
+      assert_almost_eq(pXZsn[is][i][j][k], invVOL*moments[i][j][k][6]);
+      assert_almost_eq(pYYsn[is][i][j][k], invVOL*moments[i][j][k][7]);
+      assert_almost_eq(pYZsn[is][i][j][k], invVOL*moments[i][j][k][8]);
+      assert_almost_eq(pZZsn[is][i][j][k], invVOL*moments[i][j][k][9]);
+    }
+  }
+}
+
 /*! Calculate Electric field with the implicit solver: the Maxwell solver method is called here */
 void EMfields3D::calculateE(Grid * grid, VirtualTopology3D * vct, Collective *col) {
   if (vct->getCartesian_rank() == 0)
diff --git a/fields/Moments.cpp b/fields/Moments.cpp
index c1518b4d..b7b09b85 100644
--- a/fields/Moments.cpp
+++ b/fields/Moments.cpp
@@ -3,13 +3,14 @@
 
 void Moments10::set_to_zero()
 {
-  #pragma omp parallel for collapse(4)
-  for (register int i = 0; i < nx; i++)
-  for (register int j = 0; j < ny; j++)
-  for (register int k = 0; k < nz; k++)
-  for (register int m = 0; m < 10; m++)
-  {
-    arr[i][j][k][m] = 0.0;
-  }
+  arr.setall(0);
+  //#pragma omp parallel for collapse(4)
+  //for (register int i = 0; i < nx; i++)
+  //for (register int j = 0; j < ny; j++)
+  //for (register int k = 0; k < nz; k++)
+  //for (register int m = 0; m < 10; m++)
+  //{
+  //  arr[i][j][k][m] = 0.0;
+  //}
 }
 
diff --git a/include/Alloc.h b/include/Alloc.h
index e8456785..63100dc3 100644
--- a/include/Alloc.h
+++ b/include/Alloc.h
@@ -212,8 +212,10 @@ namespace iPic3D
       base_arr(size_t s) : size(s), arr(AlignedAlloc(type, s)) {}
       base_arr(type* in, size_t s) : size(s), arr(in) {}
       ~base_arr(){}
+      int get_size() { return size; }
       void free() { AlignedFree(arr); }
       void setall(type val){
+        // #pragma omp for
         for(size_t i=0;i<size;i++) arr[i]=val;
       }
       //type* fetch_arr(){return arr;}
@@ -547,6 +549,7 @@ namespace iPic3D
       void set(size_t n3,size_t n2,size_t n1, type value)
         { const_array_ref3<type>::set(n3,n2,n1, value); }
       void setall(type val){
+        // #pragma omp for
         for(size_t i=0;i<size;i++) arr[i]=val;
       }
       type*** fetch_arr3(){ return (type***) arr3; }
@@ -577,6 +580,7 @@ namespace iPic3D
         S4(s4), S3(s3), S2(s2), S1(s1),
         arr4(in)
       { }
+      int get_size() const { return size; }
     #if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
       const const_array_get3<type> operator[](size_t n4)const{
         check_bounds(n4, S4);
@@ -615,9 +619,8 @@ namespace iPic3D
     protected:
       void setall(type val)
       {
-        #pragma omp for
-        for(int i=0;i<size;i++)
-          arr[i]=val;
+        // #pragma omp for
+        for(int i=0;i<size;i++) arr[i]=val;
       }
   };
   
@@ -632,6 +635,8 @@ namespace iPic3D
       using const_array_ref4<type>::S1;
       using const_array_ref4<type>::arr4;
       using const_array_ref4<type>::getidx;
+    public: // this did not work unless I made the using statment public.
+      using const_array_ref4<type>::get_size;
     public:
       ~array_ref4(){}
       array_ref4(size_t s4, size_t s3, size_t s2, size_t s1) :
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index ab0cc1df..c55d68ce 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -122,6 +122,8 @@ class EMfields3D                // :public Field
     /*! sum moments (interp_P2G) versions */
     void sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
     void sumMoments_vectorized(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
+    void checkMoments(const Particles3Dcomm* part);
+    void checkMoment(const Particles3Dcomm* part);
     void sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
     /*! add accumulated moments to the moments for a given species */
     //void addToSpeciesMoments(const TenMoments & in, int is);
@@ -264,7 +266,7 @@ class EMfields3D                // :public Field
     /*! fetch array for summing moments of thread i */
     Moments10& fetch_moments10Array(int i){
       assert_le(0,i);
-      assert_le(i,sizeMomentsArray);
+      assert_lt(i,sizeMomentsArray);
       return *(moments10Array[i]);
     }
 
diff --git a/main/Parameters.cpp b/main/Parameters.cpp
index 740586de..b94b239d 100644
--- a/main/Parameters.cpp
+++ b/main/Parameters.cpp
@@ -9,8 +9,10 @@ void Parameters::init_parameters()
   SORTING_PARTICLES = get_VECTORIZE_MOMENTS() || get_VECTORIZE_MOVER();
 }
 
-bool Parameters::get_SORTING_PARTICLES() { return SORTING_PARTICLES; }
+//bool Parameters::get_SORTING_PARTICLES() { return SORTING_PARTICLES; }
+bool Parameters::get_SORTING_PARTICLES() { return true; }
 bool Parameters::get_VECTORIZE_MOMENTS() { return false; }
-bool Parameters::get_VECTORIZE_MOVER() { return false; }
-// this will also return true if we communicate particles per iteration
-bool Parameters::get_USING_XAVG() { return get_VECTORIZE_MOVER(); }
+bool Parameters::get_VECTORIZE_MOVER() { return true; }
+// this must also return true if we communicate particles per iteration
+//bool Parameters::get_USING_XAVG() { return get_VECTORIZE_MOVER(); }
+bool Parameters::get_USING_XAVG() { return get_SORTING_PARTICLES(); }
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 891a16af..3462c132 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -4,6 +4,9 @@
 #include "ipicdefs.h"
 #include "debug.h"
 #include "Parameters.h"
+#include "ompdefs.h"
+
+#include "Moments.h" // for debugging
 
 using namespace iPic3D;
 MPIdata* iPic3D::c_Solver::mpi=0;
@@ -178,17 +181,19 @@ void c_Solver::CalculateMoments() {
   timeTasks_set_main_task(TimeTasks::MOMENTS);
 
   EMf->updateInfoFields(grid,vct,col);
-  EMf->setZeroDensities();                  // set to zero the densities
+  EMf->setZeroDensities();
 
   if(Parameters::get_SORTING_PARTICLES())
   {
     // sort particles
-    #pragma omp master
-    timeTasks_begin_task(TimeTasks::MOMENT_PCL_SORTING);
-    for(int species_idx=0; species_idx<ns; species_idx++)
-      part[species_idx].sort_particles_serial(grid,vct);
-    #pragma omp master
-    timeTasks_end_task(TimeTasks::MOMENT_PCL_SORTING);
+    //#pragma omp master
+    {
+      //dprint(omp_get_thread_num());
+      timeTasks_begin_task(TimeTasks::MOMENT_PCL_SORTING);
+      for(int species_idx=0; species_idx<ns; species_idx++)
+        part[species_idx].sort_particles_serial(grid,vct);
+      timeTasks_end_task(TimeTasks::MOMENT_PCL_SORTING);
+    }
   }
 
   if(Parameters::get_VECTORIZE_MOMENTS())
@@ -201,6 +206,12 @@ void c_Solver::CalculateMoments() {
   {
     EMf->sumMoments(part, grid, vct);
   }
+  // do the moments calculated by the old and new code agree?
+  //EMf->setZeroDensities();
+  //EMf->sumMoments_vectorized(part, grid, vct);
+  //EMf->setZeroDensities();
+  //EMf->sumMoments(part, grid, vct);
+  //EMf->checkMoments(part);
   //for (int i = 0; i < ns; i++)
   //{
   //  EMf->sumMomentsOld(part[i], grid, vct);
@@ -245,6 +256,7 @@ bool c_Solver::ParticlesMover() {
     // Should change this to add background field
     EMf->set_fieldForPcls();
     #pragma omp parallel
+    {
     for (int i = 0; i < ns; i++)  // move each species
     {
       // #pragma omp task inout(part[i]) in(grid) target_device(booster)
@@ -256,6 +268,7 @@ bool c_Solver::ParticlesMover() {
       else
         part[i].mover_PC(grid, vct, EMf);
     }
+    }
     for (int i = 0; i < ns; i++)  // move each species
     {
       mem_avail = part[i].communicate_particles(vct);
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 9a88d51c..fde7536f 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -319,7 +319,7 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
   const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
 
   #pragma omp master
-  timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING);
+  { timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING); }
   const pfloat dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
   #pragma omp for schedule(static)
   // why does single precision make no difference in execution speed?
@@ -453,7 +453,7 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
     w[pidx] = 2.0 * wavg - worig;
   }                             // END OF ALL THE PARTICLES
   #pragma omp master
-  timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING);
+  { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
 }
 
 /** mover with a Predictor-Corrector scheme */
@@ -485,10 +485,12 @@ void Particles3D::mover_PC_vectorized(
     if(niter>1) // on first iteration already was sorted to sum moments
     {
       #pragma omp master
-      timeTasks_begin_task(TimeTasks::MOVER_PCL_SORTING);
-      sort_particles_serial(xavg, yavg, zavg, grid,vct);
-      #pragma omp master
-      timeTasks_end_task(TimeTasks::MOVER_PCL_SORTING);
+      {
+        timeTasks_begin_task(TimeTasks::MOVER_PCL_SORTING);
+        sort_particles_serial(xavg, yavg, zavg, grid,vct);
+        timeTasks_end_task(TimeTasks::MOVER_PCL_SORTING);
+      }
+      #pragma omp barrier
     }
 
     #pragma omp master
@@ -534,15 +536,23 @@ void Particles3D::mover_PC_vectorized(
       for(int pidx=bucket_offset; pidx<bucket_end; pidx++)
       {
         // serial case: check that pidx is correct
-        if(true)
-        {
-          assert_eq(pidx,serial_pidx++);
-        }
+        //assert_eq(pidx,serial_pidx++);
         // confirm that particle is in correct cell
         if(true)
         {
           int cx_,cy_,cz_;
           get_safe_cell_for_pos(cx_,cy_,cz_,xavg[pidx],yavg[pidx],zavg[pidx]);
+          //if((cx_!=cx)
+          // ||(cy_!=cy)
+          // ||(cz_!=cz))
+          //{
+          //  dprintf("\n\t cx =%d, cy =%d, cz =%d"
+          //          "\n\t cx_=%d, cy_=%d, cz_=%d"
+          //          "\n\t x=%g, y=%g, z_=%g",
+          //          cx,cy,cz,
+          //          cx_,cy_,cz_,
+          //          xavg[pidx], yavg[pidx], zavg[pidx]);
+          //}
           assert_eq(cx_,cx);
           assert_eq(cy_,cy);
           assert_eq(cz_,cz);
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index f64e7eb1..0f267a59 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -986,7 +986,6 @@ void Particles3Dcomm::sort_particles_serial(
   double *xpos, double *ypos, double *zpos,
   Grid * grid, VirtualTopology3D * vct)
 {
-  #pragma omp critical (sort_particles_serial)
   {
     numpcls_in_bucket->setall(0);
     // iterate through particles and count where they will go
@@ -1018,6 +1017,7 @@ void Particles3Dcomm::sort_particles_serial(
       (*bucket_offset)[cx][cy][cz] = accpcls;
       accpcls += (*numpcls_in_bucket)[cx][cy][cz];
     }
+    assert_eq(accpcls,nop);
 
     numpcls_in_bucket_now->setall(0);
     // put the particles where they are supposed to go
@@ -1035,6 +1035,10 @@ void Particles3Dcomm::sort_particles_serial(
       // compute where the data should go
       const int numpcls_now = (*numpcls_in_bucket_now)[cx][cy][cz]++;
       const int outpidx = (*bucket_offset)[cx][cy][cz] + numpcls_now;
+      assert_lt(outpidx, nop);
+      assert_ge(outpidx, 0);
+      assert_lt(pidx, nop);
+      assert_ge(pidx, 0);
 
       // copy particle data to new location
       //

From 4c36c2dcd0ba16d24ee02815f4dc58c9992cde9d Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 14 Jan 2014 23:34:26 +0100
Subject: [PATCH 076/118] cleanup after previous commit

---
 fields/EMfields3D.cpp | 142 +++++++++++-------------------------------
 include/EMfields3D.h  |   6 +-
 main/iPic3Dlib.cpp    |   9 +--
 3 files changed, 44 insertions(+), 113 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index b7bd9b32..534743f4 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -4,6 +4,7 @@
 #include "Particles3Dcomm.h"
 #include "TimeTasks.h"
 #include "Moments.h"
+#include "Parameters.h"
 #include "ompdefs.h"
 #include "debug.h"
 
@@ -192,8 +193,17 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) :
   injFieldsFront  = new injInfoFields(nxn, nyn, nzn);
   injFieldsRear   = new injInfoFields(nxn, nyn, nzn);
 
-  // EDIT: delete "+ns" before checking in
-  sizeMomentsArray = omp_get_max_threads()+ns;
+  if(Parameters::get_VECTORIZE_MOMENTS())
+  {
+    // In this case particles are sorted
+    // and there is no need for each thread
+    // to sum moments in a separate array.
+    sizeMomentsArray = 1;
+  }
+  else
+  {
+    sizeMomentsArray = omp_get_max_threads();
+  }
   moments10Array = new Moments10*[sizeMomentsArray];
   for(int i=0;i<sizeMomentsArray;i++)
   {
@@ -582,9 +592,7 @@ void EMfields3D::sumMoments_vectorized(
     const int nop = pcls.getNOP();
     #pragma omp master
     { timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION); }
-    // EDIT change arr_idx to 0 before checking this in!!
-    const int arr_idx = omp_get_max_threads()+is;
-    Moments10& speciesMoments10 = fetch_moments10Array(arr_idx);
+    Moments10& speciesMoments10 = fetch_moments10Array(0);
     arr4_double moments = speciesMoments10.fetch_arr();
     //
     // moments.setmode(ompmode::ompfor);
@@ -593,25 +601,13 @@ void EMfields3D::sumMoments_vectorized(
     int moments1dsize = moments.get_size();
     #pragma omp for // because shared
     for(int i=0; i<moments1dsize; i++) moments1d[i]=0;
-    //
-    #pragma omp for collapse(2)
-    for(int cx=0;cx<nxc; cx++)
-    for(int cy=0;cy<nyc; cy++)
-    for(int cz=0;cz<nzc; cz++)
-    for(int m=0;m<10;m++)
-    {
-      assert_eq(moments[cx][cy][cz][m],0.);
-      //if(moments[cx][cy][cz][m]!=0.)
-      //{
-      //  eprintf("moments[%d][%d][%d][%d]==%g", cx, cy, cz, m, moments[cx][cy][cz][m]);
-      //}
-    }
     
+    // prevent threads from writing to the same location
     for(int cxmod2=0; cxmod2<2; cxmod2++)
-    // threads should not be writing to the same location
-    #pragma omp for
-    for(int cx=cxmod2;cx<nxc;cx+=2) {
-    for(int cy=0;cy<nyc;cy++) {
+    for(int cymod2=0; cymod2<2; cymod2++)
+    #pragma omp for collapse(2)
+    for(int cx=cxmod2;cx<nxc;cx+=2)
+    for(int cy=cymod2;cy<nyc;cy+=2)
     for(int cz=0;cz<nzc;cz++)
     {
      //dprint(cz);
@@ -640,7 +636,8 @@ void EMfields3D::sumMoments_vectorized(
       const int numpcls_in_cell = pcls.get_numpcls_in_bucket(cx,cy,cz);
       const int bucket_offset = pcls.get_bucket_offset(cx,cy,cz);
       const int bucket_end = bucket_offset+numpcls_in_cell;
-      #pragma simd
+      // Why does uncommenting here cause a segmentation fault below on xeon?
+      //#pragma simd
       for(int i=bucket_offset; i<bucket_end; i++)
       {
         // compute the quadratic moments of velocity
@@ -714,18 +711,19 @@ void EMfields3D::sumMoments_vectorized(
 
         // add particle to moments
         {
+          // which is the superior order for the following loop?
           for(int m=0; m<10; m++)
           for(int c=0; c<8; c++)
           {
             momentsArray[c][m] += velmoments[m]*weights[c];
-            assert_isnum(momentsArray[c][m]);
+            // When simd above is uncommented,
+            // the following statement prevents segmentation fault
+            //assert_isnum(momentsArray[c][m]);
           }
         }
       }
      }
     }
-    }
-    }
     #pragma omp master
     { timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION); }
 
@@ -763,60 +761,6 @@ void EMfields3D::sumMoments_vectorized(
   }
 }
 
-void EMfields3D::checkMoment(const Particles3Dcomm* part)
-{
-}
-
-void EMfields3D::checkMoments(const Particles3Dcomm* part)
-{
-  #pragma omp parallel
-  for (int species_idx = 0; species_idx < ns; species_idx++)
-  {
-    const Particles3Dcomm& pcls = part[species_idx];
-    const int is = pcls.get_ns();
-    assert_eq(species_idx,is);
-
-    const int nop = pcls.getNOP();
-    const int arr_idx = omp_get_max_threads()+is;
-    Moments10& speciesMoments10 = fetch_moments10Array(arr_idx);
-    arr4_double moments = speciesMoments10.fetch_arr();
-
-    //#pragma omp master
-    //eprintf("rhons[%d][1][1][1]:=%g but invVOL*moments[1][1][1][0]:=%g",
-    // is,
-    // getRHOns(is,1,1,1),
-    // // rhons[is][1][1][1],
-    // invVOL*moments[1][1][1][0]);
-    //#pragma omp barrier
-    // ghost and boundary cell values will be changed,
-    // but interior values should be the same.
-    #pragma omp for collapse(2)
-    for(int i=2;i<nxn-2;i++)
-    for(int j=2;j<nyn-2;j++)
-    for(int k=2;k<nzn-2;k++)
-    {
-      if(fcmp(rhons[is][i][j][k], invVOL*moments[i][j][k][0], 1e-14))
-      {
-        dprintf("rhons[%d][%d][%d][%d]:=%g but invVOL*moments[%d][%d][%d][0]:=%g",
-          is,i,j,k, rhons[is][i][j][k],
-          i,j,k, invVOL*moments[i][j][k][0]);
-        abort();
-      }
-      //assert_almost_eq(rhons[is][i][j][k], invVOL*moments[i][j][k][0]);
-      assert_almost_eq(rhons[is][i][j][k], invVOL*moments[i][j][k][0]);
-      assert_almost_eq(Jxs  [is][i][j][k], invVOL*moments[i][j][k][1]);
-      assert_almost_eq(Jys  [is][i][j][k], invVOL*moments[i][j][k][2]);
-      assert_almost_eq(Jzs  [is][i][j][k], invVOL*moments[i][j][k][3]);
-      assert_almost_eq(pXXsn[is][i][j][k], invVOL*moments[i][j][k][4]);
-      assert_almost_eq(pXYsn[is][i][j][k], invVOL*moments[i][j][k][5]);
-      assert_almost_eq(pXZsn[is][i][j][k], invVOL*moments[i][j][k][6]);
-      assert_almost_eq(pYYsn[is][i][j][k], invVOL*moments[i][j][k][7]);
-      assert_almost_eq(pYZsn[is][i][j][k], invVOL*moments[i][j][k][8]);
-      assert_almost_eq(pZZsn[is][i][j][k], invVOL*moments[i][j][k][9]);
-    }
-  }
-}
-
 /*! Calculate Electric field with the implicit solver: the Maxwell solver method is called here */
 void EMfields3D::calculateE(Grid * grid, VirtualTopology3D * vct, Collective *col) {
   if (vct->getCartesian_rank() == 0)
@@ -1765,30 +1709,8 @@ void EMfields3D::communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft,
   communicateNode_P(nxn, nyn, nzn, pZZsn, ns, vct);
 }
 
-/* add moments (e.g. from an OpenMP thread) to the accumulated moments */
-//void EMfields3D::addToSpeciesMoments(const TenMoments & in, int is) {
-//  assert_eq(in.get_nx(), nxn);
-//  assert_eq(in.get_ny(), nyn);
-//  assert_eq(in.get_nz(), nzn);
-//  for (register int i = 0; i < nxn; i++) {
-//    for (register int j = 0; j < nyn; j++)
-//      for (register int k = 0; k < nzn; k++) {
-//        rhons[is][i][j][k] += invVOL*in.get_rho(i, j, k);
-//        Jxs  [is][i][j][k] += invVOL*in.get_Jx(i, j, k);
-//        Jys  [is][i][j][k] += invVOL*in.get_Jy(i, j, k);
-//        Jzs  [is][i][j][k] += invVOL*in.get_Jz(i, j, k);
-//        pXXsn[is][i][j][k] += invVOL*in.get_pXX(i, j, k);
-//        pXYsn[is][i][j][k] += invVOL*in.get_pXY(i, j, k);
-//        pXZsn[is][i][j][k] += invVOL*in.get_pXZ(i, j, k);
-//        pYYsn[is][i][j][k] += invVOL*in.get_pYY(i, j, k);
-//        pYZsn[is][i][j][k] += invVOL*in.get_pYZ(i, j, k);
-//        pZZsn[is][i][j][k] += invVOL*in.get_pZZ(i, j, k);
-//      }
-//  }
-//}
-
-/*! set to 0 all the densities fields */
-void EMfields3D::setZeroDensities() {
+void EMfields3D::setZeroDerivedMoments()
+{
   for (register int i = 0; i < nxn; i++)
     for (register int j = 0; j < nyn; j++)
       for (register int k = 0; k < nzn; k++) {
@@ -1806,6 +1728,12 @@ void EMfields3D::setZeroDensities() {
         rhoc[i][j][k] = 0.0;
         rhoh[i][j][k] = 0.0;
       }
+}
+
+void EMfields3D::setZeroPrimaryMoments() {
+
+  // set primary moments to zero
+  //
   for (register int kk = 0; kk < ns; kk++)
     for (register int i = 0; i < nxn; i++)
       for (register int j = 0; j < nyn; j++)
@@ -1823,6 +1751,12 @@ void EMfields3D::setZeroDensities() {
         }
 
 }
+/*! set to 0 all the densities fields */
+void EMfields3D::setZeroDensities() {
+  setZeroDerivedMoments();
+  setZeroPrimaryMoments();
+}
+
 /*!SPECIES: Sum the charge density of different species on NODES */
 void EMfields3D::sumOverSpecies(VirtualTopology3D * vct) {
   for (int is = 0; is < ns; is++)
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index c55d68ce..5d2f4780 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -104,6 +104,10 @@ class EMfields3D                // :public Field
     void interpDensitiesN2C(VirtualTopology3D * vct, Grid * grid);
     /*! set to 0 all the densities fields */
     void setZeroDensities();
+    /*! set to 0 primary moments */
+    void setZeroPrimaryMoments();
+    /*! set to 0 all densities derived from primary moments */
+    void setZeroDerivedMoments();
     /*! Sum rhon over species */
     void sumOverSpecies(VirtualTopology3D * vct);
     /*! Sum current over different species */
@@ -122,8 +126,6 @@ class EMfields3D                // :public Field
     /*! sum moments (interp_P2G) versions */
     void sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
     void sumMoments_vectorized(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
-    void checkMoments(const Particles3Dcomm* part);
-    void checkMoment(const Particles3Dcomm* part);
     void sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
     /*! add accumulated moments to the moments for a given species */
     //void addToSpeciesMoments(const TenMoments & in, int is);
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 3462c132..4d8c532d 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -181,7 +181,6 @@ void c_Solver::CalculateMoments() {
   timeTasks_set_main_task(TimeTasks::MOMENTS);
 
   EMf->updateInfoFields(grid,vct,col);
-  EMf->setZeroDensities();
 
   if(Parameters::get_SORTING_PARTICLES())
   {
@@ -204,18 +203,14 @@ void c_Solver::CalculateMoments() {
   }
   else
   {
+    EMf->setZeroPrimaryMoments();
     EMf->sumMoments(part, grid, vct);
   }
-  // do the moments calculated by the old and new code agree?
-  //EMf->setZeroDensities();
-  //EMf->sumMoments_vectorized(part, grid, vct);
-  //EMf->setZeroDensities();
-  //EMf->sumMoments(part, grid, vct);
-  //EMf->checkMoments(part);
   //for (int i = 0; i < ns; i++)
   //{
   //  EMf->sumMomentsOld(part[i], grid, vct);
   //}
+  EMf->setZeroDerivedMoments();
   EMf->sumOverSpecies(vct);                 // sum all over the species
 
   // Fill with constant charge the planet

From bef8049c5bbd52c6917a8f2e3c50b4939bb29b12 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 14 Jan 2014 23:36:35 +0100
Subject: [PATCH 077/118] created mechanism to restrict debug to master thread
 of main process

---
 include/parallel.h | 18 ++++++++++++++++++
 utility/debug.cpp  |  5 ++++-
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 include/parallel.h

diff --git a/include/parallel.h b/include/parallel.h
new file mode 100644
index 00000000..4c6f9ece
--- /dev/null
+++ b/include/parallel.h
@@ -0,0 +1,18 @@
+#ifndef _parallel_h_
+#define _parallel_h_
+/*********************************
+ * General header for parallelism
+ * (MPI, OpenMP, and SIMD)
+ *********************************/
+
+#include "MPIdata.h"
+#include "ompdefs.h"
+
+/*! used to restrict output to a single thread of a single process */
+//inline bool is_main_master_thread()
+inline bool is_output_thread()
+{
+  return !(MPIdata::get_rank() || omp_get_thread_num());
+}
+
+#endif
diff --git a/utility/debug.cpp b/utility/debug.cpp
index bd4daa9f..44d79626 100644
--- a/utility/debug.cpp
+++ b/utility/debug.cpp
@@ -4,6 +4,7 @@
 #endif
 #include "ompdefs.h" // for omp_get_thread_num
 #include "debug.h"
+#include "parallel.h" // temporary
 
 #define implement_dprintvar_fileLine(code,type) \
   void printvar_fileLine(const char* func, const char* file, int line, \
@@ -14,7 +15,7 @@
 
 implement_dprintvar_fileLine("%s", const char *);
 implement_dprintvar_fileLine("%d", int);
-implement_dprintvar_fileLine("%g", double);
+implement_dprintvar_fileLine("%e", double);
 implement_dprintvar_fileLine("%p", const void *);
 
 // void dfprintf_fileLine(FILE * fptr, const char *func, const char *file, int line_number, const char *format, ...)
@@ -69,6 +70,8 @@ void fprintf_fileLine(FILE * fptr,
   const char *type, const char *func, const char *file, int line_number,
   const char *format, ...)
 {
+  //if(!is_output_thread()) return; // temporary
+
   // writing directly to fptr would avoid limiting the length
   // of the output string, but by first writing to a string
   // we achieve thread safety.

From a351d13e2b856ba86405c02731f0613a4436285c Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 14 Jan 2014 23:38:33 +0100
Subject: [PATCH 078/118] implemented assert_almost_eq() with tolerance
 parameter like with fcmp

---
 include/asserts.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/asserts.h b/include/asserts.h
index 8a8a8718..7bac3452 100644
--- a/include/asserts.h
+++ b/include/asserts.h
@@ -125,12 +125,12 @@ extern "C" {
 #define builtin_expect(a,b) __builtin_expect(a,b)
 #endif
 // check whether two numbers are equal within machine precision
-#define assert_not_almost_eq(lhs,rhs) \
-  (fcmp(lhs, rhs, 1e-14) \
+#define assert_not_almost_eq(lhs,rhs,tol) \
+  (fcmp(lhs, rhs, tol) \
    ? (void)0 \
    : assert_error(__FILE__, __LINE__, __func__, " !=~= ", #lhs, #rhs, lhs, rhs))
-#define assert_almost_eq(lhs,rhs) \
-  (builtin_expect(fcmp(lhs, rhs, 1e-14),0) \
+#define assert_almost_eq(lhs,rhs,tol) \
+  (builtin_expect(fcmp(lhs, rhs, tol),0) \
    ? assert_error(__FILE__, __LINE__, __func__, " =~= ", #lhs, #rhs, lhs, rhs) \
    : (void)0)
 //#define assert_almost_eq(lhs,rhs) \

From eda80987da50efc079a8bbd2033084f36f974289 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 14 Jan 2014 23:44:45 +0100
Subject: [PATCH 079/118] setting get_VECTORIZE_MOMENTS to true

---
 main/Parameters.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main/Parameters.cpp b/main/Parameters.cpp
index b94b239d..77abb87e 100644
--- a/main/Parameters.cpp
+++ b/main/Parameters.cpp
@@ -11,7 +11,7 @@ void Parameters::init_parameters()
 
 //bool Parameters::get_SORTING_PARTICLES() { return SORTING_PARTICLES; }
 bool Parameters::get_SORTING_PARTICLES() { return true; }
-bool Parameters::get_VECTORIZE_MOMENTS() { return false; }
+bool Parameters::get_VECTORIZE_MOMENTS() { return true; }
 bool Parameters::get_VECTORIZE_MOVER() { return true; }
 // this must also return true if we communicate particles per iteration
 //bool Parameters::get_USING_XAVG() { return get_VECTORIZE_MOVER(); }

From 7b35413de496f5f6f5d038009e3b3e25dac8e4c3 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 15 Jan 2014 11:21:15 +0100
Subject: [PATCH 080/118] Restored iteration order over momentsArray[c][m] so m
 is inner.

---
 fields/EMfields3D.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 534743f4..ed247f17 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -712,8 +712,8 @@ void EMfields3D::sumMoments_vectorized(
         // add particle to moments
         {
           // which is the superior order for the following loop?
-          for(int m=0; m<10; m++)
           for(int c=0; c<8; c++)
+          for(int m=0; m<10; m++)
           {
             momentsArray[c][m] += velmoments[m]*weights[c];
             // When simd above is uncommented,

From 1a4e10bbb7acb508d41a8afe5d0ee561d0a44ac5 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 15 Jan 2014 13:52:43 +0100
Subject: [PATCH 081/118] fixed compile errors from merge

---
 fields/EMfields3D.cpp | 23 +++++++++++------------
 include/EMfields3D.h  | 22 +++++++++++-----------
 2 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index f07c4c74..d3d21653 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -120,7 +120,7 @@ EMfields3D::EMfields3D(Collective * col, Grid * grid) :
   vectY (nxn, nyn, nzn),
   vectZ (nxn, nyn, nzn),
   divC  (nxc, nyc, nzc),
-  //arr (double,nxc-2,nyc-2,nzc-2);
+  arr (nxc-2,nyc-2,nzc-2),
   // B_ext and J_ext should not be allocated unless used.
   Bx_ext(nxn,nyn,nzn),
   By_ext(nxn,nyn,nzn),
@@ -3517,8 +3517,7 @@ void EMfields3D::BoundaryConditionsE(arr3_double vectorX, arr3_double vectorY, a
 }
 
 /*! get Electric Field component X array cell without the ghost cells */
-void EMfields3D::getExc(Grid3DCU *grid) {
-
+arr3_double EMfields3D::getExc(Grid3DCU *grid) {
   array3_double tmp(nxc,nyc,nzc);
   grid->interpN2C(tmp, Ex);
 
@@ -3529,7 +3528,7 @@ void EMfields3D::getExc(Grid3DCU *grid) {
   return arr;
 }
 /*! get Electric Field component Y array cell without the ghost cells */
-double ***EMfields3D::getEyc(Grid3DCU *grid) {
+arr3_double EMfields3D::getEyc(Grid3DCU *grid) {
   array3_double tmp(nxc,nyc,nzc);
   grid->interpN2C(tmp, Ey);
 
@@ -3540,7 +3539,7 @@ double ***EMfields3D::getEyc(Grid3DCU *grid) {
   return arr;
 }
 /*! get Electric Field component Z array cell without the ghost cells */
-double ***EMfields3D::getEzc(Grid3DCU *grid) {
+arr3_double EMfields3D::getEzc(Grid3DCU *grid) {
   array3_double tmp(nxc,nyc,nzc);
   grid->interpN2C(tmp, Ez);
 
@@ -3551,7 +3550,7 @@ double ***EMfields3D::getEzc(Grid3DCU *grid) {
   return arr;
 }
 /*! get Magnetic Field component X array cell without the ghost cells */
-double ***EMfields3D::getBxc() {
+arr3_double EMfields3D::getBxc() {
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
@@ -3559,7 +3558,7 @@ double ***EMfields3D::getBxc() {
   return arr;
 }
 /*! get Magnetic Field component Y array cell without the ghost cells */
-double ***EMfields3D::getByc() {
+arr3_double EMfields3D::getByc() {
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
@@ -3567,7 +3566,7 @@ double ***EMfields3D::getByc() {
   return arr;
 }
 /*! get Magnetic Field component Z array cell without the ghost cells */
-double ***EMfields3D::getBzc() {
+arr3_double EMfields3D::getBzc() {
   for (int i = 1; i < nxc-1; i++)
     for (int j = 1; j < nyc-1; j++)
       for (int k = 1; k < nzc-1; k++)
@@ -3575,7 +3574,7 @@ double ***EMfields3D::getBzc() {
   return arr;
 }
 /*! get species density component X array cell without the ghost cells */
-double ***EMfields3D::getRHOcs(Grid3DCU *grid, int is) {
+arr3_double EMfields3D::getRHOcs(Grid3DCU *grid, int is) {
   array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, rhons);
 
@@ -3587,7 +3586,7 @@ double ***EMfields3D::getRHOcs(Grid3DCU *grid, int is) {
 }
 
 /*! get Magnetic Field component X array species is cell without the ghost cells */
-double ***EMfields3D::getJxsc(Grid3DCU *grid, int is) {
+arr3_double EMfields3D::getJxsc(Grid3DCU *grid, int is) {
   array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, Jxs);
 
@@ -3599,7 +3598,7 @@ double ***EMfields3D::getJxsc(Grid3DCU *grid, int is) {
 }
 
 /*! get current component Y array species is cell without the ghost cells */
-double ***EMfields3D::getJysc(Grid3DCU *grid, int is) {
+arr3_double EMfields3D::getJysc(Grid3DCU *grid, int is) {
   array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, Jys);
 
@@ -3610,7 +3609,7 @@ double ***EMfields3D::getJysc(Grid3DCU *grid, int is) {
   return arr;
 }
 /*! get current component Z array species is cell without the ghost cells */
-double ***EMfields3D::getJzsc(Grid3DCU *grid, int is) {
+arr3_double EMfields3D::getJzsc(Grid3DCU *grid, int is) {
   array4_double tmp(ns,nxc,nyc,nzc);
   grid->interpN2C(tmp, is, Jzs);
 
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 8bdd954e..e4e6f624 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -203,12 +203,12 @@ class EMfields3D                // :public Field
 
     // field components without ghost cells
     //
-    void getExc(arr3_double arr, Grid3DCU *grid);
-    void getEyc(arr3_double arr, Grid3DCU *grid);
-    void getEzc(arr3_double arr, Grid3DCU *grid);
-    void getBxc(arr3_double arr);
-    void getByc(arr3_double arr);
-    void getBzc(arr3_double arr);
+    arr3_double getExc(Grid3DCU *grid);
+    arr3_double getEyc(Grid3DCU *grid);
+    arr3_double getEzc(Grid3DCU *grid);
+    arr3_double getBxc();
+    arr3_double getByc();
+    arr3_double getBzc();
 
     arr3_double getRHOc() { return rhoc; }
     arr3_double getRHOn() { return rhon; }
@@ -221,7 +221,7 @@ class EMfields3D                // :public Field
     double getRHOns(int X,int Y,int Z,int is)const{return rhons.get(is,X,Y,Z);}
     arr4_double getRHOns(){return rhons;}
     /* density on cells without ghost cells */
-    void getRHOcs(arr3_double arr, Grid3DCU *grid, int is);
+    arr3_double getRHOcs(Grid3DCU *grid, int is);
 
     double getBx_ext(int X, int Y, int Z) const{return Bx_ext.get(X,Y,Z);}
     double getBy_ext(int X, int Y, int Z) const{return By_ext.get(X,Y,Z);}
@@ -256,9 +256,9 @@ class EMfields3D                // :public Field
 
     // get current for species in all cells except ghost
     //
-    void getJxsc(arr3_double arr, Grid3DCU *grid, int is);
-    void getJysc(arr3_double arr, Grid3DCU *grid, int is);
-    void getJzsc(arr3_double arr, Grid3DCU *grid, int is);
+    arr3_double getJxsc(Grid3DCU *grid, int is);
+    arr3_double getJysc(Grid3DCU *grid, int is);
+    arr3_double getJzsc(Grid3DCU *grid, int is);
 
     /*! get the electric field energy */
     double getEenergy();
@@ -397,7 +397,7 @@ class EMfields3D                // :public Field
     array3_double vectY;
     array3_double vectZ;
     array3_double divC;
-    //array3_double arr;
+    array3_double arr;
     /* temporary arrays for summing moments */
     int sizeMomentsArray;
     Moments10 **moments10Array;

From d20621b90e1f471ecc02bfaf5e5a7423aecfb488 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 15 Jan 2014 14:40:15 +0100
Subject: [PATCH 082/118] trying to vectorize moments accumulation

---
 fields/EMfields3D.cpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index d3d21653..7283eb2f 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -606,6 +606,7 @@ void EMfields3D::sumMoments_vectorized(
     // prevent threads from writing to the same location
     for(int cxmod2=0; cxmod2<2; cxmod2++)
     for(int cymod2=0; cymod2<2; cymod2++)
+    // each mesh cell is handled by its own thread
     #pragma omp for collapse(2)
     for(int cx=cxmod2;cx<nxc;cx+=2)
     for(int cy=cymod2;cy<nyc;cy+=2)
@@ -634,6 +635,14 @@ void EMfields3D::sumMoments_vectorized(
       momentsArray[6] = moments11[iz]; // moments110 
       momentsArray[7] = moments11[cz]; // moments111 
 
+      // accumulator for moments per each of 8 threads
+      double momentsAcc[8][10][8];
+      for(int c=0; c<8; c++)
+      for(int m=0; m<10; m++)
+      for(int i=0; i<8; i++)
+      {
+        momentsAcc[c][m][i] = 0;
+      }
       const int numpcls_in_cell = pcls.get_numpcls_in_bucket(cx,cy,cz);
       const int bucket_offset = pcls.get_bucket_offset(cx,cy,cz);
       const int bucket_end = bucket_offset+numpcls_in_cell;
@@ -716,13 +725,24 @@ void EMfields3D::sumMoments_vectorized(
           for(int c=0; c<8; c++)
           for(int m=0; m<10; m++)
           {
-            momentsArray[c][m] += velmoments[m]*weights[c];
+            momentsAcc[c][m][i%8] += velmoments[m]*weights[c];
+            //momentsArray[c][m] += velmoments[m]*weights[c];
             // When simd above is uncommented,
             // the following statement prevents segmentation fault
             //assert_isnum(momentsArray[c][m]);
           }
         }
       }
+      // reduce the moments for this cell
+      for(int c=0; c<8; c++)
+      for(int m=0; m<10; m++)
+      for(int i=0; i<8; i++)
+      {
+        momentsArray[c][m] += momentsAcc[c][m][i];
+        // When simd above is uncommented,
+        // the following statement prevents segmentation fault
+        //assert_isnum(momentsArray[c][m]);
+      }
      }
     }
     #pragma omp master

From c7692ad7171be92e3a9c33a39900405d2b409711 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 15 Jan 2014 16:19:26 +0100
Subject: [PATCH 083/118] vectorized summing moments

---
 fields/EMfields3D.cpp | 276 +++++++++++++++++++++++++++---------------
 1 file changed, 181 insertions(+), 95 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 7283eb2f..54bdf0ff 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -562,6 +562,110 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
   }
 }
 
+inline void compute_moments(double momentsAcc[8][10][8],
+  int i,
+  int imod,
+  double const * const x,
+  double const * const y,
+  double const * const z,
+  double const * const u,
+  double const * const v,
+  double const * const w,
+  double const * const q,
+  double xstart,
+  double ystart,
+  double zstart,
+  double inv_dx,
+  double inv_dy,
+  double inv_dz,
+  int cx,
+  int cy,
+  int cz)
+{
+  // compute the quadratic moments of velocity
+  //
+  const double ui=u[i];
+  const double vi=v[i];
+  const double wi=w[i];
+  const double uui=ui*ui;
+  const double uvi=ui*vi;
+  const double uwi=ui*wi;
+  const double vvi=vi*vi;
+  const double vwi=vi*wi;
+  const double wwi=wi*wi;
+  double velmoments[10];
+  velmoments[0] = 1.;
+  velmoments[1] = ui;
+  velmoments[2] = vi;
+  velmoments[3] = wi;
+  velmoments[4] = uui;
+  velmoments[5] = uvi;
+  velmoments[6] = uwi;
+  velmoments[7] = vvi;
+  velmoments[8] = vwi;
+  velmoments[9] = wwi;
+
+  // compute the weights to distribute the moments
+  //
+  double weights[8];
+  const double abs_xpos = x[i];
+  const double abs_ypos = y[i];
+  const double abs_zpos = z[i];
+  const double rel_xpos = abs_xpos - xstart;
+  const double rel_ypos = abs_ypos - ystart;
+  const double rel_zpos = abs_zpos - zstart;
+  const double cxm1_pos = rel_xpos * inv_dx;
+  const double cym1_pos = rel_ypos * inv_dy;
+  const double czm1_pos = rel_zpos * inv_dz;
+  //if(true)
+  //{
+  //  const int cx_inf = int(floor(cxm1_pos));
+  //  const int cy_inf = int(floor(cym1_pos));
+  //  const int cz_inf = int(floor(czm1_pos));
+  //  assert_eq(cx-1,cx_inf);
+  //  assert_eq(cy-1,cy_inf);
+  //  assert_eq(cz-1,cz_inf);
+  //}
+  // fraction of the distance from the right of the cell
+  const double w1x = cx - cxm1_pos;
+  const double w1y = cy - cym1_pos;
+  const double w1z = cz - czm1_pos;
+  // fraction of distance from the left
+  const double w0x = 1-w1x;
+  const double w0y = 1-w1y;
+  const double w0z = 1-w1z;
+  // we are calculating a charge moment.
+  const double qi=q[i];
+  const double weight0 = qi*w0x;
+  const double weight1 = qi*w1x;
+  const double weight00 = weight0*w0y;
+  const double weight01 = weight0*w1y;
+  const double weight10 = weight1*w0y;
+  const double weight11 = weight1*w1y;
+  weights[0] = weight00*w0z; // weight000
+  weights[1] = weight00*w1z; // weight001
+  weights[2] = weight01*w0z; // weight010
+  weights[3] = weight01*w1z; // weight011
+  weights[4] = weight10*w0z; // weight100
+  weights[5] = weight10*w1z; // weight101
+  weights[6] = weight11*w0z; // weight110
+  weights[7] = weight11*w1z; // weight111
+
+  // add particle to moments
+  {
+    // which is the superior order for the following loop?
+    for(int c=0; c<8; c++)
+    for(int m=0; m<10; m++)
+    {
+      momentsAcc[c][m][imod] += velmoments[m]*weights[c];
+      //momentsArray[c][m] += velmoments[m]*weights[c];
+      // When simd above is uncommented,
+      // the following statement prevents segmentation fault
+      //assert_isnum(momentsArray[c][m]);
+    }
+  }
+}
+
 void EMfields3D::sumMoments_vectorized(
   const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct)
 {
@@ -637,111 +741,93 @@ void EMfields3D::sumMoments_vectorized(
 
       // accumulator for moments per each of 8 threads
       double momentsAcc[8][10][8];
-      for(int c=0; c<8; c++)
-      for(int m=0; m<10; m++)
-      for(int i=0; i<8; i++)
-      {
-        momentsAcc[c][m][i] = 0;
-      }
       const int numpcls_in_cell = pcls.get_numpcls_in_bucket(cx,cy,cz);
       const int bucket_offset = pcls.get_bucket_offset(cx,cy,cz);
       const int bucket_end = bucket_offset+numpcls_in_cell;
-      // Why does uncommenting here cause a segmentation fault below on xeon?
-      //#pragma simd
-      for(int i=bucket_offset; i<bucket_end; i++)
+
+      // calculate lower and upper bounds of cell index that
+      // are divisible by the width of the vector unit.
+      const int aligned_start = (bucket_offset+(8-1))/8*8;
+      const int aligned_end = bucket_end/8*8;
+      if(aligned_start >= aligned_end)
       {
-        // compute the quadratic moments of velocity
-        //
-        const double ui=u[i];
-        const double vi=v[i];
-        const double wi=w[i];
-        const double uui=ui*ui;
-        const double uvi=ui*vi;
-        const double uwi=ui*wi;
-        const double vvi=vi*vi;
-        const double vwi=vi*wi;
-        const double wwi=wi*wi;
-        double velmoments[10];
-        velmoments[0] = 1.;
-        velmoments[1] = ui;
-        velmoments[2] = vi;
-        velmoments[3] = wi;
-        velmoments[4] = uui;
-        velmoments[5] = uvi;
-        velmoments[6] = uwi;
-        velmoments[7] = vvi;
-        velmoments[8] = vwi;
-        velmoments[9] = wwi;
-
-        // compute the weights to distribute the moments
-        //
-        double weights[8];
-        const double abs_xpos = x[i];
-        const double abs_ypos = y[i];
-        const double abs_zpos = z[i];
-        const double rel_xpos = abs_xpos - xstart;
-        const double rel_ypos = abs_ypos - ystart;
-        const double rel_zpos = abs_zpos - zstart;
-        const double cxm1_pos = rel_xpos * inv_dx;
-        const double cym1_pos = rel_ypos * inv_dy;
-        const double czm1_pos = rel_zpos * inv_dz;
-        //if(true)
+        for(int c=0; c<8; c++)
+        for(int m=0; m<10; m++)
+        {
+          momentsAcc[c][m][0] = 0;
+        }
+        for(int i=bucket_offset; i<bucket_end; i++)
+        {
+          compute_moments(momentsAcc, i, 0,
+            x, y, z, u, v, w, q,
+            xstart, ystart, zstart,
+            inv_dx, inv_dy, inv_dz,
+            cx, cy, cz);
+        }
+        for(int c=0; c<8; c++)
+        for(int m=0; m<10; m++)
+        {
+          momentsArray[c][m] += momentsAcc[c][m][0];
+        }
+      }
+      // can vectorize for aligned section of particles
+      else
+      {
+        for(int c=0; c<8; c++)
+        for(int m=0; m<10; m++)
+        for(int i=0; i<8; i++)
+        {
+          momentsAcc[c][m][i] = 0;
+        }
+        //const int numSections = (aligned_end-aligned_start)/8;
+        assert_le(bucket_offset, aligned_start);
+        assert_le(aligned_start, aligned_end);
+        assert_le(aligned_end,bucket_end);
+
+        for(int i=bucket_offset; i<aligned_start; i++)
+        {
+          compute_moments(momentsAcc, i, 0,
+            x, y, z, u, v, w, q,
+            xstart, ystart, zstart,
+            inv_dx, inv_dy, inv_dz,
+            cx, cy, cz);
+        }
+        //for(int i=aligned_start; i<aligned_end; i++)
         //{
-        //  const int cx_inf = int(floor(cxm1_pos));
-        //  const int cy_inf = int(floor(cym1_pos));
-        //  const int cz_inf = int(floor(czm1_pos));
-        //  assert_eq(cx-1,cx_inf);
-        //  assert_eq(cy-1,cy_inf);
-        //  assert_eq(cz-1,cz_inf);
+        //  compute_moments(momentsAcc, i, i%8,
+        //    x, y, z, u, v, w, q,
+        //    xstart, ystart, zstart,
+        //    inv_dx, inv_dy, inv_dz,
+        //    cx, cy, cz);
         //}
-        // fraction of the distance from the right of the cell
-        const double w1x = cx - cxm1_pos;
-        const double w1y = cy - cym1_pos;
-        const double w1z = cz - czm1_pos;
-        // fraction of distance from the left
-        const double w0x = 1-w1x;
-        const double w0y = 1-w1y;
-        const double w0z = 1-w1z;
-        // we are calculating a charge moment.
-        const double qi=q[i];
-        const double weight0 = qi*w0x;
-        const double weight1 = qi*w1x;
-        const double weight00 = weight0*w0y;
-        const double weight01 = weight0*w1y;
-        const double weight10 = weight1*w0y;
-        const double weight11 = weight1*w1y;
-        weights[0] = weight00*w0z; // weight000
-        weights[1] = weight00*w1z; // weight001
-        weights[2] = weight01*w0z; // weight010
-        weights[3] = weight01*w1z; // weight011
-        weights[4] = weight10*w0z; // weight100
-        weights[5] = weight10*w1z; // weight101
-        weights[6] = weight11*w0z; // weight110
-        weights[7] = weight11*w1z; // weight111
-
-        // add particle to moments
+        for(int istart = aligned_start; istart < aligned_end; istart+=8)
         {
-          // which is the superior order for the following loop?
-          for(int c=0; c<8; c++)
-          for(int m=0; m<10; m++)
+          // this is intended to vectorize...
+          #pragma simd
+          for(int imod=0;imod<8;imod++)
           {
-            momentsAcc[c][m][i%8] += velmoments[m]*weights[c];
-            //momentsArray[c][m] += velmoments[m]*weights[c];
-            // When simd above is uncommented,
-            // the following statement prevents segmentation fault
-            //assert_isnum(momentsArray[c][m]);
+            compute_moments(momentsAcc, istart+imod, imod,
+              x, y, z, u, v, w, q,
+              xstart, ystart, zstart,
+              inv_dx, inv_dy, inv_dz,
+              cx, cy, cz);
           }
         }
-      }
-      // reduce the moments for this cell
-      for(int c=0; c<8; c++)
-      for(int m=0; m<10; m++)
-      for(int i=0; i<8; i++)
-      {
-        momentsArray[c][m] += momentsAcc[c][m][i];
-        // When simd above is uncommented,
-        // the following statement prevents segmentation fault
-        //assert_isnum(momentsArray[c][m]);
+        for(int i=aligned_end; i<bucket_end; i++)
+        {
+          compute_moments(momentsAcc, i, 0,
+            x, y, z, u, v, w, q,
+            xstart, ystart, zstart,
+            inv_dx, inv_dy, inv_dz,
+            cx, cy, cz);
+        }
+        // reduce the moments for this cell
+        for(int c=0; c<8; c++)
+        for(int m=0; m<10; m++)
+        for(int i=0; i<8; i++)
+        {
+          momentsArray[c][m] += momentsAcc[c][m][i];
+        }
       }
      }
     }

From 5b4298f99ecf1adda5beef6967953ba1f7f0d961 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 15 Jan 2014 14:40:15 +0100
Subject: [PATCH 084/118] trying to vectorize moments accumulation

---
 fields/EMfields3D.cpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index ed247f17..311eabd1 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -605,6 +605,7 @@ void EMfields3D::sumMoments_vectorized(
     // prevent threads from writing to the same location
     for(int cxmod2=0; cxmod2<2; cxmod2++)
     for(int cymod2=0; cymod2<2; cymod2++)
+    // each mesh cell is handled by its own thread
     #pragma omp for collapse(2)
     for(int cx=cxmod2;cx<nxc;cx+=2)
     for(int cy=cymod2;cy<nyc;cy+=2)
@@ -633,6 +634,14 @@ void EMfields3D::sumMoments_vectorized(
       momentsArray[6] = moments11[iz]; // moments110 
       momentsArray[7] = moments11[cz]; // moments111 
 
+      // accumulator for moments per each of 8 threads
+      double momentsAcc[8][10][8];
+      for(int c=0; c<8; c++)
+      for(int m=0; m<10; m++)
+      for(int i=0; i<8; i++)
+      {
+        momentsAcc[c][m][i] = 0;
+      }
       const int numpcls_in_cell = pcls.get_numpcls_in_bucket(cx,cy,cz);
       const int bucket_offset = pcls.get_bucket_offset(cx,cy,cz);
       const int bucket_end = bucket_offset+numpcls_in_cell;
@@ -715,13 +724,24 @@ void EMfields3D::sumMoments_vectorized(
           for(int c=0; c<8; c++)
           for(int m=0; m<10; m++)
           {
-            momentsArray[c][m] += velmoments[m]*weights[c];
+            momentsAcc[c][m][i%8] += velmoments[m]*weights[c];
+            //momentsArray[c][m] += velmoments[m]*weights[c];
             // When simd above is uncommented,
             // the following statement prevents segmentation fault
             //assert_isnum(momentsArray[c][m]);
           }
         }
       }
+      // reduce the moments for this cell
+      for(int c=0; c<8; c++)
+      for(int m=0; m<10; m++)
+      for(int i=0; i<8; i++)
+      {
+        momentsArray[c][m] += momentsAcc[c][m][i];
+        // When simd above is uncommented,
+        // the following statement prevents segmentation fault
+        //assert_isnum(momentsArray[c][m]);
+      }
      }
     }
     #pragma omp master

From 0a4cb3b3d39bdd6064a2be2958381df6783dd812 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 15 Jan 2014 16:19:26 +0100
Subject: [PATCH 085/118] vectorized summing moments

---
 fields/EMfields3D.cpp | 276 +++++++++++++++++++++++++++---------------
 1 file changed, 181 insertions(+), 95 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 311eabd1..244518fb 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -561,6 +561,110 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
   }
 }
 
+inline void compute_moments(double momentsAcc[8][10][8],
+  int i,
+  int imod,
+  double const * const x,
+  double const * const y,
+  double const * const z,
+  double const * const u,
+  double const * const v,
+  double const * const w,
+  double const * const q,
+  double xstart,
+  double ystart,
+  double zstart,
+  double inv_dx,
+  double inv_dy,
+  double inv_dz,
+  int cx,
+  int cy,
+  int cz)
+{
+  // compute the quadratic moments of velocity
+  //
+  const double ui=u[i];
+  const double vi=v[i];
+  const double wi=w[i];
+  const double uui=ui*ui;
+  const double uvi=ui*vi;
+  const double uwi=ui*wi;
+  const double vvi=vi*vi;
+  const double vwi=vi*wi;
+  const double wwi=wi*wi;
+  double velmoments[10];
+  velmoments[0] = 1.;
+  velmoments[1] = ui;
+  velmoments[2] = vi;
+  velmoments[3] = wi;
+  velmoments[4] = uui;
+  velmoments[5] = uvi;
+  velmoments[6] = uwi;
+  velmoments[7] = vvi;
+  velmoments[8] = vwi;
+  velmoments[9] = wwi;
+
+  // compute the weights to distribute the moments
+  //
+  double weights[8];
+  const double abs_xpos = x[i];
+  const double abs_ypos = y[i];
+  const double abs_zpos = z[i];
+  const double rel_xpos = abs_xpos - xstart;
+  const double rel_ypos = abs_ypos - ystart;
+  const double rel_zpos = abs_zpos - zstart;
+  const double cxm1_pos = rel_xpos * inv_dx;
+  const double cym1_pos = rel_ypos * inv_dy;
+  const double czm1_pos = rel_zpos * inv_dz;
+  //if(true)
+  //{
+  //  const int cx_inf = int(floor(cxm1_pos));
+  //  const int cy_inf = int(floor(cym1_pos));
+  //  const int cz_inf = int(floor(czm1_pos));
+  //  assert_eq(cx-1,cx_inf);
+  //  assert_eq(cy-1,cy_inf);
+  //  assert_eq(cz-1,cz_inf);
+  //}
+  // fraction of the distance from the right of the cell
+  const double w1x = cx - cxm1_pos;
+  const double w1y = cy - cym1_pos;
+  const double w1z = cz - czm1_pos;
+  // fraction of distance from the left
+  const double w0x = 1-w1x;
+  const double w0y = 1-w1y;
+  const double w0z = 1-w1z;
+  // we are calculating a charge moment.
+  const double qi=q[i];
+  const double weight0 = qi*w0x;
+  const double weight1 = qi*w1x;
+  const double weight00 = weight0*w0y;
+  const double weight01 = weight0*w1y;
+  const double weight10 = weight1*w0y;
+  const double weight11 = weight1*w1y;
+  weights[0] = weight00*w0z; // weight000
+  weights[1] = weight00*w1z; // weight001
+  weights[2] = weight01*w0z; // weight010
+  weights[3] = weight01*w1z; // weight011
+  weights[4] = weight10*w0z; // weight100
+  weights[5] = weight10*w1z; // weight101
+  weights[6] = weight11*w0z; // weight110
+  weights[7] = weight11*w1z; // weight111
+
+  // add particle to moments
+  {
+    // which is the superior order for the following loop?
+    for(int c=0; c<8; c++)
+    for(int m=0; m<10; m++)
+    {
+      momentsAcc[c][m][imod] += velmoments[m]*weights[c];
+      //momentsArray[c][m] += velmoments[m]*weights[c];
+      // When simd above is uncommented,
+      // the following statement prevents segmentation fault
+      //assert_isnum(momentsArray[c][m]);
+    }
+  }
+}
+
 void EMfields3D::sumMoments_vectorized(
   const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct)
 {
@@ -636,111 +740,93 @@ void EMfields3D::sumMoments_vectorized(
 
       // accumulator for moments per each of 8 threads
       double momentsAcc[8][10][8];
-      for(int c=0; c<8; c++)
-      for(int m=0; m<10; m++)
-      for(int i=0; i<8; i++)
-      {
-        momentsAcc[c][m][i] = 0;
-      }
       const int numpcls_in_cell = pcls.get_numpcls_in_bucket(cx,cy,cz);
       const int bucket_offset = pcls.get_bucket_offset(cx,cy,cz);
       const int bucket_end = bucket_offset+numpcls_in_cell;
-      // Why does uncommenting here cause a segmentation fault below on xeon?
-      //#pragma simd
-      for(int i=bucket_offset; i<bucket_end; i++)
+
+      // calculate lower and upper bounds of cell index that
+      // are divisible by the width of the vector unit.
+      const int aligned_start = (bucket_offset+(8-1))/8*8;
+      const int aligned_end = bucket_end/8*8;
+      if(aligned_start >= aligned_end)
       {
-        // compute the quadratic moments of velocity
-        //
-        const double ui=u[i];
-        const double vi=v[i];
-        const double wi=w[i];
-        const double uui=ui*ui;
-        const double uvi=ui*vi;
-        const double uwi=ui*wi;
-        const double vvi=vi*vi;
-        const double vwi=vi*wi;
-        const double wwi=wi*wi;
-        double velmoments[10];
-        velmoments[0] = 1.;
-        velmoments[1] = ui;
-        velmoments[2] = vi;
-        velmoments[3] = wi;
-        velmoments[4] = uui;
-        velmoments[5] = uvi;
-        velmoments[6] = uwi;
-        velmoments[7] = vvi;
-        velmoments[8] = vwi;
-        velmoments[9] = wwi;
-
-        // compute the weights to distribute the moments
-        //
-        double weights[8];
-        const double abs_xpos = x[i];
-        const double abs_ypos = y[i];
-        const double abs_zpos = z[i];
-        const double rel_xpos = abs_xpos - xstart;
-        const double rel_ypos = abs_ypos - ystart;
-        const double rel_zpos = abs_zpos - zstart;
-        const double cxm1_pos = rel_xpos * inv_dx;
-        const double cym1_pos = rel_ypos * inv_dy;
-        const double czm1_pos = rel_zpos * inv_dz;
-        //if(true)
+        for(int c=0; c<8; c++)
+        for(int m=0; m<10; m++)
+        {
+          momentsAcc[c][m][0] = 0;
+        }
+        for(int i=bucket_offset; i<bucket_end; i++)
+        {
+          compute_moments(momentsAcc, i, 0,
+            x, y, z, u, v, w, q,
+            xstart, ystart, zstart,
+            inv_dx, inv_dy, inv_dz,
+            cx, cy, cz);
+        }
+        for(int c=0; c<8; c++)
+        for(int m=0; m<10; m++)
+        {
+          momentsArray[c][m] += momentsAcc[c][m][0];
+        }
+      }
+      // can vectorize for aligned section of particles
+      else
+      {
+        for(int c=0; c<8; c++)
+        for(int m=0; m<10; m++)
+        for(int i=0; i<8; i++)
+        {
+          momentsAcc[c][m][i] = 0;
+        }
+        //const int numSections = (aligned_end-aligned_start)/8;
+        assert_le(bucket_offset, aligned_start);
+        assert_le(aligned_start, aligned_end);
+        assert_le(aligned_end,bucket_end);
+
+        for(int i=bucket_offset; i<aligned_start; i++)
+        {
+          compute_moments(momentsAcc, i, 0,
+            x, y, z, u, v, w, q,
+            xstart, ystart, zstart,
+            inv_dx, inv_dy, inv_dz,
+            cx, cy, cz);
+        }
+        //for(int i=aligned_start; i<aligned_end; i++)
         //{
-        //  const int cx_inf = int(floor(cxm1_pos));
-        //  const int cy_inf = int(floor(cym1_pos));
-        //  const int cz_inf = int(floor(czm1_pos));
-        //  assert_eq(cx-1,cx_inf);
-        //  assert_eq(cy-1,cy_inf);
-        //  assert_eq(cz-1,cz_inf);
+        //  compute_moments(momentsAcc, i, i%8,
+        //    x, y, z, u, v, w, q,
+        //    xstart, ystart, zstart,
+        //    inv_dx, inv_dy, inv_dz,
+        //    cx, cy, cz);
         //}
-        // fraction of the distance from the right of the cell
-        const double w1x = cx - cxm1_pos;
-        const double w1y = cy - cym1_pos;
-        const double w1z = cz - czm1_pos;
-        // fraction of distance from the left
-        const double w0x = 1-w1x;
-        const double w0y = 1-w1y;
-        const double w0z = 1-w1z;
-        // we are calculating a charge moment.
-        const double qi=q[i];
-        const double weight0 = qi*w0x;
-        const double weight1 = qi*w1x;
-        const double weight00 = weight0*w0y;
-        const double weight01 = weight0*w1y;
-        const double weight10 = weight1*w0y;
-        const double weight11 = weight1*w1y;
-        weights[0] = weight00*w0z; // weight000
-        weights[1] = weight00*w1z; // weight001
-        weights[2] = weight01*w0z; // weight010
-        weights[3] = weight01*w1z; // weight011
-        weights[4] = weight10*w0z; // weight100
-        weights[5] = weight10*w1z; // weight101
-        weights[6] = weight11*w0z; // weight110
-        weights[7] = weight11*w1z; // weight111
-
-        // add particle to moments
+        for(int istart = aligned_start; istart < aligned_end; istart+=8)
         {
-          // which is the superior order for the following loop?
-          for(int c=0; c<8; c++)
-          for(int m=0; m<10; m++)
+          // this is intended to vectorize...
+          #pragma simd
+          for(int imod=0;imod<8;imod++)
           {
-            momentsAcc[c][m][i%8] += velmoments[m]*weights[c];
-            //momentsArray[c][m] += velmoments[m]*weights[c];
-            // When simd above is uncommented,
-            // the following statement prevents segmentation fault
-            //assert_isnum(momentsArray[c][m]);
+            compute_moments(momentsAcc, istart+imod, imod,
+              x, y, z, u, v, w, q,
+              xstart, ystart, zstart,
+              inv_dx, inv_dy, inv_dz,
+              cx, cy, cz);
           }
         }
-      }
-      // reduce the moments for this cell
-      for(int c=0; c<8; c++)
-      for(int m=0; m<10; m++)
-      for(int i=0; i<8; i++)
-      {
-        momentsArray[c][m] += momentsAcc[c][m][i];
-        // When simd above is uncommented,
-        // the following statement prevents segmentation fault
-        //assert_isnum(momentsArray[c][m]);
+        for(int i=aligned_end; i<bucket_end; i++)
+        {
+          compute_moments(momentsAcc, i, 0,
+            x, y, z, u, v, w, q,
+            xstart, ystart, zstart,
+            inv_dx, inv_dy, inv_dz,
+            cx, cy, cz);
+        }
+        // reduce the moments for this cell
+        for(int c=0; c<8; c++)
+        for(int m=0; m<10; m++)
+        for(int i=0; i<8; i++)
+        {
+          momentsArray[c][m] += momentsAcc[c][m][i];
+        }
       }
      }
     }

From 937a8afe6287f31fd26d6ff28fe6aa02130c3890 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 16 Jan 2014 23:03:07 +0100
Subject: [PATCH 086/118] improved thread reduction of sumMoments() and
 simplified sumMoments_vectorized()

---
 fields/EMfields3D.cpp     | 323 ++++++++++++++++++++++++++------------
 include/EMfields3D.h      |   1 +
 particles/Particles3D.cpp |  48 +++---
 3 files changed, 251 insertions(+), 121 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 244518fb..bae8cc36 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -516,39 +516,63 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
     // reduction
     if(!thread_num) timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION);
 
-    // reduce arrays
+    // reduce moments in parallel
+    //
+    for(int thread_num=0;thread_num<get_sizeMomentsArray();thread_num++)
     {
-      #pragma omp critical (reduceMoment0)
-      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { rhons[is][i][j][k] += invVOL*moments[i][j][k][0]; }}
-      #pragma omp critical (reduceMoment1)
-      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1]; }}
-      #pragma omp critical (reduceMoment2)
-      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { Jys  [is][i][j][k] += invVOL*moments[i][j][k][2]; }}
-      #pragma omp critical (reduceMoment3)
-      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3]; }}
-      #pragma omp critical (reduceMoment4)
-      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4]; }}
-      #pragma omp critical (reduceMoment5)
-      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5]; }}
-      #pragma omp critical (reduceMoment6)
-      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6]; }}
-      #pragma omp critical (reduceMoment7)
-      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7]; }}
-      #pragma omp critical (reduceMoment8)
-      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8]; }}
-      #pragma omp critical (reduceMoment9)
-      for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
-        { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }}
+      arr4_double moments = fetch_moments10Array(thread_num).fetch_arr();
+      #pragma omp for collapse(2)
+      for(int i=0;i<nxn;i++)
+      for(int j=0;j<nyn;j++)
+      for(int k=0;k<nzn;k++)
+      {
+        rhons[is][i][j][k] += invVOL*moments[i][j][k][0];
+        Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1];
+        Jys  [is][i][j][k] += invVOL*moments[i][j][k][2];
+        Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3];
+        pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4];
+        pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5];
+        pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6];
+        pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7];
+        pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8];
+        pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9];
+      }
     }
+    //
+    // This was the old way of reducing;
+    // did not scale well to large number of threads
+    //{
+    //  #pragma omp critical (reduceMoment0)
+    //  for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+    //    { rhons[is][i][j][k] += invVOL*moments[i][j][k][0]; }}
+    //  #pragma omp critical (reduceMoment1)
+    //  for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+    //    { Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1]; }}
+    //  #pragma omp critical (reduceMoment2)
+    //  for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+    //    { Jys  [is][i][j][k] += invVOL*moments[i][j][k][2]; }}
+    //  #pragma omp critical (reduceMoment3)
+    //  for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+    //    { Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3]; }}
+    //  #pragma omp critical (reduceMoment4)
+    //  for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+    //    { pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4]; }}
+    //  #pragma omp critical (reduceMoment5)
+    //  for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+    //    { pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5]; }}
+    //  #pragma omp critical (reduceMoment6)
+    //  for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+    //    { pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6]; }}
+    //  #pragma omp critical (reduceMoment7)
+    //  for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+    //    { pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7]; }}
+    //  #pragma omp critical (reduceMoment8)
+    //  for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+    //    { pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8]; }}
+    //  #pragma omp critical (reduceMoment9)
+    //  for(int i=0;i<nxn;i++){for(int j=0;j<nyn;j++) for(int k=0;k<nzn;k++)
+    //    { pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9]; }}
+    //}
     if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_REDUCTION);
     // uncomment this and remove the loop below
     // when we change to use asynchronous communication.
@@ -561,9 +585,8 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
   }
 }
 
-inline void compute_moments(double momentsAcc[8][10][8],
+inline void compute_moments(double velmoments[10], double weights[8],
   int i,
-  int imod,
   double const * const x,
   double const * const y,
   double const * const z,
@@ -581,6 +604,13 @@ inline void compute_moments(double momentsAcc[8][10][8],
   int cy,
   int cz)
 {
+  ALIGNED(x);
+  ALIGNED(y);
+  ALIGNED(z);
+  ALIGNED(u);
+  ALIGNED(v);
+  ALIGNED(w);
+  ALIGNED(q);
   // compute the quadratic moments of velocity
   //
   const double ui=u[i];
@@ -592,7 +622,7 @@ inline void compute_moments(double momentsAcc[8][10][8],
   const double vvi=vi*vi;
   const double vwi=vi*wi;
   const double wwi=wi*wi;
-  double velmoments[10];
+  //double velmoments[10];
   velmoments[0] = 1.;
   velmoments[1] = ui;
   velmoments[2] = vi;
@@ -606,7 +636,7 @@ inline void compute_moments(double momentsAcc[8][10][8],
 
   // compute the weights to distribute the moments
   //
-  double weights[8];
+  //double weights[8];
   const double abs_xpos = x[i];
   const double abs_ypos = y[i];
   const double abs_zpos = z[i];
@@ -649,18 +679,152 @@ inline void compute_moments(double momentsAcc[8][10][8],
   weights[5] = weight10*w1z; // weight101
   weights[6] = weight11*w0z; // weight110
   weights[7] = weight11*w1z; // weight111
+}
+
+inline void add_moments_for_pcl(double momentsAcc[8][10],
+  int i,
+  double const * const x,
+  double const * const y,
+  double const * const z,
+  double const * const u,
+  double const * const v,
+  double const * const w,
+  double const * const q,
+  double xstart,
+  double ystart,
+  double zstart,
+  double inv_dx,
+  double inv_dy,
+  double inv_dz,
+  int cx,
+  int cy,
+  int cz)
+{
+  double velmoments[10];
+  double weights[8];
+  compute_moments(velmoments,weights,
+    i, x, y, z, u, v, w, q,
+    xstart, ystart, zstart,
+    inv_dx, inv_dy, inv_dz,
+    cx, cy, cz);
 
-  // add particle to moments
+  // add moments for this particle
   {
     // which is the superior order for the following loop?
     for(int c=0; c<8; c++)
     for(int m=0; m<10; m++)
     {
-      momentsAcc[c][m][imod] += velmoments[m]*weights[c];
-      //momentsArray[c][m] += velmoments[m]*weights[c];
-      // When simd above is uncommented,
-      // the following statement prevents segmentation fault
-      //assert_isnum(momentsArray[c][m]);
+      momentsAcc[c][m] += velmoments[m]*weights[c];
+    }
+  }
+}
+
+
+// vectorized version of previous method
+// 
+inline void add_moments_for_pcl_vec(double momentsAccVec[8][10][8],
+  double velmoments[10][8], double weights[8][8],
+  int i,
+  int imod,
+  double const * const x,
+  double const * const y,
+  double const * const z,
+  double const * const u,
+  double const * const v,
+  double const * const w,
+  double const * const q,
+  double xstart,
+  double ystart,
+  double zstart,
+  double inv_dx,
+  double inv_dy,
+  double inv_dz,
+  int cx,
+  int cy,
+  int cz)
+{
+  ALIGNED(x);
+  ALIGNED(y);
+  ALIGNED(z);
+  ALIGNED(u);
+  ALIGNED(v);
+  ALIGNED(w);
+  ALIGNED(q);
+  // compute the quadratic moments of velocity
+  //
+  const double ui=u[i];
+  const double vi=v[i];
+  const double wi=w[i];
+  const double uui=ui*ui;
+  const double uvi=ui*vi;
+  const double uwi=ui*wi;
+  const double vvi=vi*vi;
+  const double vwi=vi*wi;
+  const double wwi=wi*wi;
+  //double velmoments[10];
+  velmoments[0][imod] = 1.;
+  velmoments[1][imod] = ui;
+  velmoments[2][imod] = vi;
+  velmoments[3][imod] = wi;
+  velmoments[4][imod] = uui;
+  velmoments[5][imod] = uvi;
+  velmoments[6][imod] = uwi;
+  velmoments[7][imod] = vvi;
+  velmoments[8][imod] = vwi;
+  velmoments[9][imod] = wwi;
+
+  // compute the weights to distribute the moments
+  //
+  //double weights[8];
+  const double abs_xpos = x[i];
+  const double abs_ypos = y[i];
+  const double abs_zpos = z[i];
+  const double rel_xpos = abs_xpos - xstart;
+  const double rel_ypos = abs_ypos - ystart;
+  const double rel_zpos = abs_zpos - zstart;
+  const double cxm1_pos = rel_xpos * inv_dx;
+  const double cym1_pos = rel_ypos * inv_dy;
+  const double czm1_pos = rel_zpos * inv_dz;
+  //if(true)
+  //{
+  //  const int cx_inf = int(floor(cxm1_pos));
+  //  const int cy_inf = int(floor(cym1_pos));
+  //  const int cz_inf = int(floor(czm1_pos));
+  //  assert_eq(cx-1,cx_inf);
+  //  assert_eq(cy-1,cy_inf);
+  //  assert_eq(cz-1,cz_inf);
+  //}
+  // fraction of the distance from the right of the cell
+  const double w1x = cx - cxm1_pos;
+  const double w1y = cy - cym1_pos;
+  const double w1z = cz - czm1_pos;
+  // fraction of distance from the left
+  const double w0x = 1-w1x;
+  const double w0y = 1-w1y;
+  const double w0z = 1-w1z;
+  // we are calculating a charge moment.
+  const double qi=q[i];
+  const double weight0 = qi*w0x;
+  const double weight1 = qi*w1x;
+  const double weight00 = weight0*w0y;
+  const double weight01 = weight0*w1y;
+  const double weight10 = weight1*w0y;
+  const double weight11 = weight1*w1y;
+  weights[0][imod] = weight00*w0z; // weight000
+  weights[1][imod] = weight00*w1z; // weight001
+  weights[2][imod] = weight01*w0z; // weight010
+  weights[3][imod] = weight01*w1z; // weight011
+  weights[4][imod] = weight10*w0z; // weight100
+  weights[5][imod] = weight10*w1z; // weight101
+  weights[6][imod] = weight11*w0z; // weight110
+  weights[7][imod] = weight11*w1z; // weight111
+
+  // add moments for this particle
+  {
+    for(int c=0; c<8; c++)
+    for(int m=0; m<10; m++)
+    {
+      momentsAccVec[c][m][imod] += velmoments[m][imod]*weights[c][imod];
     }
   }
 }
@@ -738,26 +902,19 @@ void EMfields3D::sumMoments_vectorized(
       momentsArray[6] = moments11[iz]; // moments110 
       momentsArray[7] = moments11[cz]; // moments111 
 
-      // accumulator for moments per each of 8 threads
-      double momentsAcc[8][10][8];
       const int numpcls_in_cell = pcls.get_numpcls_in_bucket(cx,cy,cz);
       const int bucket_offset = pcls.get_bucket_offset(cx,cy,cz);
       const int bucket_end = bucket_offset+numpcls_in_cell;
 
-      // calculate lower and upper bounds of cell index that
-      // are divisible by the width of the vector unit.
-      const int aligned_start = (bucket_offset+(8-1))/8*8;
-      const int aligned_end = bucket_end/8*8;
-      if(aligned_start >= aligned_end)
+      bool vectorized=false;
+      if(!vectorized)
       {
-        for(int c=0; c<8; c++)
-        for(int m=0; m<10; m++)
-        {
-          momentsAcc[c][m][0] = 0;
-        }
+        // accumulators for moments per each of 8 threads
+        double momentsAcc[8][10];
+        memset(momentsAcc,0,sizeof(double)*8*10);
         for(int i=bucket_offset; i<bucket_end; i++)
         {
-          compute_moments(momentsAcc, i, 0,
+          add_moments_for_pcl(momentsAcc, i,
             x, y, z, u, v, w, q,
             xstart, ystart, zstart,
             inv_dx, inv_dy, inv_dz,
@@ -766,66 +923,30 @@ void EMfields3D::sumMoments_vectorized(
         for(int c=0; c<8; c++)
         for(int m=0; m<10; m++)
         {
-          momentsArray[c][m] += momentsAcc[c][m][0];
+          momentsArray[c][m] += momentsAcc[c][m];
         }
       }
-      // can vectorize for aligned section of particles
-      else
+      if(vectorized)
       {
-        for(int c=0; c<8; c++)
-        for(int m=0; m<10; m++)
-        for(int i=0; i<8; i++)
-        {
-          momentsAcc[c][m][i] = 0;
-        }
-        //const int numSections = (aligned_end-aligned_start)/8;
-        assert_le(bucket_offset, aligned_start);
-        assert_le(aligned_start, aligned_end);
-        assert_le(aligned_end,bucket_end);
-
-        for(int i=bucket_offset; i<aligned_start; i++)
-        {
-          compute_moments(momentsAcc, i, 0,
-            x, y, z, u, v, w, q,
-            xstart, ystart, zstart,
-            inv_dx, inv_dy, inv_dz,
-            cx, cy, cz);
-        }
-        //for(int i=aligned_start; i<aligned_end; i++)
-        //{
-        //  compute_moments(momentsAcc, i, i%8,
-        //    x, y, z, u, v, w, q,
-        //    xstart, ystart, zstart,
-        //    inv_dx, inv_dy, inv_dz,
-        //    cx, cy, cz);
-        //}
-        for(int istart = aligned_start; istart < aligned_end; istart+=8)
-        {
-          // this is intended to vectorize...
-          #pragma simd
-          for(int imod=0;imod<8;imod++)
-          {
-            compute_moments(momentsAcc, istart+imod, imod,
-              x, y, z, u, v, w, q,
-              xstart, ystart, zstart,
-              inv_dx, inv_dy, inv_dz,
-              cx, cy, cz);
-          }
-        }
-        for(int i=aligned_end; i<bucket_end; i++)
+        double velmoments[10][8];
+        double weights[8][8];
+        double momentsAccVec[8][10][8];
+        memset(momentsAccVec,0,sizeof(double)*8*10*8);
+        #pragma simd
+        for(int i=bucket_offset; i<bucket_end; i++)
         {
-          compute_moments(momentsAcc, i, 0,
+          add_moments_for_pcl_vec(momentsAccVec, velmoments, weights,
+            i, i%8,
             x, y, z, u, v, w, q,
             xstart, ystart, zstart,
             inv_dx, inv_dy, inv_dz,
             cx, cy, cz);
         }
-        // reduce the moments for this cell
         for(int c=0; c<8; c++)
         for(int m=0; m<10; m++)
         for(int i=0; i<8; i++)
         {
-          momentsArray[c][m] += momentsAcc[c][m][i];
+          momentsArray[c][m] += momentsAccVec[c][m][i];
         }
       }
      }
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 5d2f4780..8a93172f 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -271,6 +271,7 @@ class EMfields3D                // :public Field
       assert_lt(i,sizeMomentsArray);
       return *(moments10Array[i]);
     }
+    int get_sizeMomentsArray() { return sizeMomentsArray; }
 
     /*! print electromagnetic fields info */
     void print(void) const;
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index fde7536f..3cc55c8e 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -529,8 +529,14 @@ void Particles3D::mover_PC_vectorized(
       const int numpcls_in_cell = get_numpcls_in_bucket(cx,cy,cz);
       const int bucket_offset = get_bucket_offset(cx,cy,cz);
       const int bucket_end = bucket_offset+numpcls_in_cell;
+      ALIGNED(x);
+      ALIGNED(y);
+      ALIGNED(z);
+      ALIGNED(u);
+      ALIGNED(v);
+      ALIGNED(w);
       // this should vectorize, but could be faster if particle
-      // data were aligned.
+      // data for each mesh cell were aligned.
       #pragma simd
       //for(int pidx=bucket_offset_1d[cell]; pidx<numpcls_in_cell; pidx++)
       for(int pidx=bucket_offset; pidx<bucket_end; pidx++)
@@ -538,25 +544,25 @@ void Particles3D::mover_PC_vectorized(
         // serial case: check that pidx is correct
         //assert_eq(pidx,serial_pidx++);
         // confirm that particle is in correct cell
-        if(true)
-        {
-          int cx_,cy_,cz_;
-          get_safe_cell_for_pos(cx_,cy_,cz_,xavg[pidx],yavg[pidx],zavg[pidx]);
-          //if((cx_!=cx)
-          // ||(cy_!=cy)
-          // ||(cz_!=cz))
-          //{
-          //  dprintf("\n\t cx =%d, cy =%d, cz =%d"
-          //          "\n\t cx_=%d, cy_=%d, cz_=%d"
-          //          "\n\t x=%g, y=%g, z_=%g",
-          //          cx,cy,cz,
-          //          cx_,cy_,cz_,
-          //          xavg[pidx], yavg[pidx], zavg[pidx]);
-          //}
-          assert_eq(cx_,cx);
-          assert_eq(cy_,cy);
-          assert_eq(cz_,cz);
-        }
+        //if(true)
+        //{
+        //  int cx_,cy_,cz_;
+        //  get_safe_cell_for_pos(cx_,cy_,cz_,xavg[pidx],yavg[pidx],zavg[pidx]);
+        //  //if((cx_!=cx)
+        //  // ||(cy_!=cy)
+        //  // ||(cz_!=cz))
+        //  //{
+        //  //  dprintf("\n\t cx =%d, cy =%d, cz =%d"
+        //  //          "\n\t cx_=%d, cy_=%d, cz_=%d"
+        //  //          "\n\t x=%g, y=%g, z_=%g",
+        //  //          cx,cy,cz,
+        //  //          cx_,cy_,cz_,
+        //  //          xavg[pidx], yavg[pidx], zavg[pidx]);
+        //  //}
+        //  assert_eq(cx_,cx);
+        //  assert_eq(cy_,cy);
+        //  assert_eq(cz_,cz);
+        //}
 
         // copy the particle
         const pfloat xorig = x[pidx];
@@ -611,6 +617,8 @@ void Particles3D::mover_PC_vectorized(
         pfloat Bxl = 0.0;
         pfloat Byl = 0.0;
         pfloat Bzl = 0.0;
+
+        // would expanding this out help to vectorize?
         for(int c=0; c<8; c++)
         {
           Bxl += weights[c] * field_components[c][0];

From e1dba2f4abd5839ee618b448dc31ed7bb3d94843 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 22 Jan 2014 15:51:38 +0100
Subject: [PATCH 087/118] implemented mover_PC_AoS (array of structs particles)

---
 fields/EMfields3D.cpp         |   2 +
 iPic3D.cpp                    |   1 +
 include/Alloc.h               |   6 +
 include/Grid3DCU.h            |   6 +-
 include/PSKOutput.h           |   2 +-
 include/Particle.h            | 115 +++++++++++++
 include/Particles3D.h         |   3 +
 include/Particles3Dcomm.h     |  20 ++-
 include/arraysfwd.h           |   3 +
 include/iPic3D.h              |   4 +-
 include/ipicmath.h            |  47 ++++++
 main/Parameters.cpp           |   2 +-
 main/iPic3Dlib.cpp            |  44 ++++-
 particles/Particles3D.cpp     | 310 +++++++++++++++++++++++++++++++++-
 particles/Particles3Dcomm.cpp |  69 ++++++++
 scripts/ipic.py               |   7 +
 16 files changed, 625 insertions(+), 16 deletions(-)
 create mode 100644 include/Particle.h
 create mode 100644 include/ipicmath.h

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index bae8cc36..c5af87f0 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -395,6 +395,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
   for (int i = 0; i < ns; i++)
   {
     const Particles3Dcomm& pcls = part[i];
+    assert_eq(pcls.get_particleType(), ParticleType::AoS);
     const int is = pcls.get_ns();
     assert_eq(i,is);
 
@@ -846,6 +847,7 @@ void EMfields3D::sumMoments_vectorized(
   for (int species_idx = 0; species_idx < ns; species_idx++)
   {
     const Particles3Dcomm& pcls = part[species_idx];
+    assert_eq(pcls.get_particleType(), ParticleType::SoA);
     const int is = pcls.get_ns();
     assert_eq(species_idx,is);
 
diff --git a/iPic3D.cpp b/iPic3D.cpp
index 4e768a71..91129c7b 100644
--- a/iPic3D.cpp
+++ b/iPic3D.cpp
@@ -34,6 +34,7 @@ int main(int argc, char **argv) {
       i = KCode.LastCycle() + 1;
     }
 
+    KCode.convertParticlesToSoA();
     KCode.WriteOutput(i);
     KCode.WriteConserved(i);
     KCode.WriteRestart(i);
diff --git a/include/Alloc.h b/include/Alloc.h
index 63100dc3..cdb56e55 100644
--- a/include/Alloc.h
+++ b/include/Alloc.h
@@ -74,6 +74,12 @@
     #define AlignedFree(S) (delete[] S)
     #define AlignedAlloc(T, NUM) (new T[NUM]) 
 #endif
+inline bool is_aligned(void *p, int N)
+{
+    return (unsigned long)p % N == 0;
+}
+#define assert_aligned(X, N) assert(is_aligned(X, N));
+
 
 // Compile with -DCHECK_BOUNDS to turn on bounds checking.
 //#define CHECK_BOUNDS
diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index 03b56c1d..72dfc246 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -170,9 +170,9 @@ class Grid3DCU                  // :public Grid
   // coordinate accessors
   //
   // calculated equivalents (preferred for accelerator?):
-  //const double &calcXN(int X) { return xStart+(X-1)*dx;}
-  //const double &calcYN(int Y) { return yStart+(Y-1)*dy;}
-  //const double &calcZN(int Z) { return zStart+(Z-1)*dz;}
+  const double calcXN(int X) { return xStart+(X-1)*dx;}
+  const double calcYN(int Y) { return yStart+(Y-1)*dy;}
+  const double calcZN(int Z) { return zStart+(Z-1)*dz;}
   const pfloat &get_pfloat_XN(int X) { return pfloat_node_xcoord[X];}
   const pfloat &get_pfloat_YN(int Y) { return pfloat_node_ycoord[Y];}
   const pfloat &get_pfloat_ZN(int Z) { return pfloat_node_zcoord[Z];}
diff --git a/include/PSKOutput.h b/include/PSKOutput.h
index 624fad4c..89ee1d30 100644
--- a/include/PSKOutput.h
+++ b/include/PSKOutput.h
@@ -15,7 +15,7 @@ developers: D. Burgess, June/July 2006
 
 #include "errors.h"
 #include "PSKException.h"
-#include "Particles.h"
+#include "Particles3Dcomm.h"
 #include "Field.h"
 #include "Grid.h"
 #include "Collective.h"
diff --git a/include/Particle.h b/include/Particle.h
new file mode 100644
index 00000000..a17c5368
--- /dev/null
+++ b/include/Particle.h
@@ -0,0 +1,115 @@
+#ifndef _Particle_
+#define _Particle_
+
+// Depends on width of vector unit;
+// need to be known at compile time.
+//
+#define AoS_PCLS_AT_A_TIME 2
+
+namespace ParticleType
+{
+  enum Type
+  {
+    AoS = 0,
+    SoA
+  };
+}
+
+// intended to occupy 64 bytes
+//
+// particle for a specific species
+class SpeciesParticle
+{
+  long long ID;
+  double x[3];
+  double u[3];
+  double q;
+ public:
+  // accessors
+  long long get_ID()const{ return ID; }
+  double get_x(int i)const{ return x[i]; }
+  double get_u(int i)const{ return u[i]; }
+  double get_q()const{ return q; }
+  void set_ID(long long in){ ID=in; }
+  void set_x(int i, double in) { x[i] = in; }
+  void set_u(int i, double in) { u[i] = in; }
+  void set_q(double in) { q = in; }
+  // alternative accessors
+  double get_x()const{ return x[0]; }
+  double get_y()const{ return x[1]; }
+  double get_z()const{ return x[2]; }
+  double get_u()const{ return u[0]; }
+  double get_v()const{ return u[1]; }
+  double get_w()const{ return u[2]; }
+  void set_x(double in){ x[0]=in; }
+  void set_y(double in){ x[1]=in; }
+  void set_z(double in){ x[2]=in; }
+  void set_u(double in){ u[0]=in; }
+  void set_v(double in){ u[1]=in; }
+  void set_w(double in){ u[2]=in; }
+  void set(long long _ID,
+    double _x, double _y, double _z,
+    double _u, double _v, double _w,
+    double _q)
+  {
+    ID = _ID;
+    x[0] = _x; x[1] = _y; x[2] = _z;
+    u[0] = _u; u[1] = _v; u[2] = _w;
+    q = _q;
+  }
+};
+
+// intended to occupy 64 bytes
+//
+// to be used when sorting with every particle advance
+struct CellParticle
+{
+  long long ID; // 8 bytes
+  int cx[3]; // mesh cell
+  float fx[3]; // mesh cell position (fraction)
+  float u[3];
+  float fxavg[3]; // for implicit push
+  float q; // float m would be better for stitching to MHD for dusty plasma
+  float qom; // for dusty plasma
+ public:
+  // accessors
+  //
+  // read access
+  long long get_ID()const{ return ID; }
+  float get_fx()const{ return fx[0]; }
+  float get_fy()const{ return fx[1]; }
+  float get_fz()const{ return fx[2]; }
+  float get_u()const{ return u[0]; }
+  float get_v()const{ return u[1]; }
+  float get_w()const{ return u[2]; }
+  float get_q()const{ return q; }
+  void set_ID(long long in){ ID=in; }
+  // write access
+  void set_u(float in){ u[0]=in; }
+  void set_v(float in){ u[1]=in; }
+  void set_w(float in){ u[2]=in; }
+
+  void init(const SpeciesParticle& pcl,
+    double cxstart[3], // starting position of cell coordinates
+    float dx_inv[3],
+    float _qom)
+  {
+    ID = pcl.get_ID();
+    // Express each coordinate in mesh units relative to cxstart, then
+    // split it into the integer cell index cx[i] and the fractional
+    // offset fx[i] = xpos - floor(xpos) within that cell.
+    // fxavg is seeded with the same fraction.
+    for(int i=0;i<3;i++)
+    {
+      const float xpos = (pcl.get_x(i)-cxstart[i])*dx_inv[i];
+      const float cxpos = floor(xpos);
+      cx[i] = int(cxpos);
+      fxavg[i] = fx[i] = xpos - cxpos;
+      u[i] = pcl.get_u(i);
+    }
+    q = pcl.get_q();
+    qom = _qom;
+  }
+};
+
+#endif
diff --git a/include/Particles3D.h b/include/Particles3D.h
index ccd210b4..05a8701a 100644
--- a/include/Particles3D.h
+++ b/include/Particles3D.h
@@ -58,6 +58,9 @@ class Particles3D:public Particles3Dcomm {
     void mover_explicit(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** mover with a Predictor-Corrector Scheme */
     void mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf);
+    /** array-of-structs version of mover_PC */
+    void mover_PC_AoS2(Grid * grid, VirtualTopology3D * vct, Field * EMf);
+    void mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** vectorized version of mover_PC **/
     void mover_PC_vectorized(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** communicate particle after moving them */
diff --git a/include/Particles3Dcomm.h b/include/Particles3Dcomm.h
index 5d40cca8..4b6fb098 100644
--- a/include/Particles3Dcomm.h
+++ b/include/Particles3Dcomm.h
@@ -7,7 +7,11 @@ developers: Stefano Markidis, Giovanni Lapenta
 #ifndef Part3DCOMM_H
 #define Part3DCOMM_H
 
-#include "Particles.h"
+#include "CollectiveIO.h"
+#include "VirtualTopology3D.h"
+#include "Grid.h"
+#include "Field.h"
+#include "Particle.h"
 /**
  * 
  * class for particles of the same species with communications methods
@@ -59,6 +63,14 @@ class Particles3Dcomm // :public Particles
   /** calculate the weights given the position of particles */
   // void calculateWeights(double*** weight, double xp, double yp, double zp,int ix, int iy, int iz, Grid* grid);
 
+ private:
+  void copyParticlesToAoS();
+  void copyParticlesToSoA();
+
+ public:
+  void convertParticlesToAoS();
+  void convertParticlesToSoA();
+
   /*! sort particles for vectorized push (needs to be parallelized) */
   void sort_particles_serial(Grid * grid, VirtualTopology3D * vct);
   /*! sort particles with respect to provided position data */
@@ -108,6 +120,7 @@ class Particles3Dcomm // :public Particles
 
   // inline get accessors
   //
+  ParticleType::Type get_particleType()const { return particleType; }
   double *getXall()  const { return (x); }
   double *getYall()  const { return (y); }
   double *getZall()  const { return (z); }
@@ -185,8 +198,13 @@ class Particles3Dcomm // :public Particles
   /** w0 Drift velocity - Direction Z */
   double w0;
 
+  ParticleType::Type particleType;
   // particles data
   //
+  // AoS representation
+  SpeciesParticle *pcls;
+  // SoA representation
+  //
   /** Positions array - X component */
   double *x;
   /** Positions array - Y component */
diff --git a/include/arraysfwd.h b/include/arraysfwd.h
index 706c057d..86dbd5d8 100644
--- a/include/arraysfwd.h
+++ b/include/arraysfwd.h
@@ -38,6 +38,9 @@ typedef iPic3D::array_ref2<int> arr2_int;
 typedef iPic3D::array_ref3<int> arr3_int;
 typedef iPic3D::array_ref4<int> arr4_int;
 //
+typedef iPic3D::const_array_ref3<void*> const_arr3_ptr;
+typedef iPic3D::array_ref3<void*> arr3_ptr;
+//
 typedef iPic3D::const_array_ref3<double> const_arr3_double;
 typedef iPic3D::const_array_ref4<double> const_arr4_double;
 typedef iPic3D::const_array_ref4<pfloat> const_arr4_pfloat;
diff --git a/include/iPic3D.h b/include/iPic3D.h
index 93db7d11..6d7d1063 100644
--- a/include/iPic3D.h
+++ b/include/iPic3D.h
@@ -47,6 +47,9 @@ namespace iPic3D {
     inline int LastCycle();
     inline int get_myrank();
 
+    void convertParticlesToSoA();
+    void convertParticlesToAoS();
+
   private:
     static MPIdata * mpi;
     Collective    *col;
@@ -96,7 +99,6 @@ namespace iPic3D {
   inline int c_Solver::get_myrank() {
     return (myrank);
   }
-
 }
 
 #endif
diff --git a/include/ipicmath.h b/include/ipicmath.h
new file mode 100644
index 00000000..edf0ca5e
--- /dev/null
+++ b/include/ipicmath.h
@@ -0,0 +1,47 @@
+#ifndef _ipicmath_h_
+#define _ipicmath_h_
+
+// valid if roundup power is representable.
+inline int
+pow2roundup (int x)
+{
+    assert(x>=0);
+    //if (x < 0)
+    //    return 0;
+    --x;
+    x |= x >> 1;
+    x |= x >> 2;
+    x |= x >> 4;
+    x |= x >> 8;
+    x |= x >> 16;
+    return x+1;
+}
+
+// does not work if highest non-sign bit is set
+inline int
+pow2rounddown (int x)
+{
+    assert(x>=0);
+    //if (x < 0)
+    //    return 0;
+
+    // set all bits below highest bit
+    x |= x >> 1;
+    x |= x >> 2;
+    x |= x >> 4;
+    x |= x >> 8;
+    x |= x >> 16;
+    // set the bit higher than the highest bit
+    x++;
+    // shift it down and return it
+    return (x >> 1);
+}
+
+// round n up to next multiple of m
+inline int roundup_to_multiple(int n, int m)
+{
+  //return ((n-1)/m+1)*m;
+  return (n+m-1)/m*m;
+}
+
+#endif
diff --git a/main/Parameters.cpp b/main/Parameters.cpp
index 77abb87e..9a51e13f 100644
--- a/main/Parameters.cpp
+++ b/main/Parameters.cpp
@@ -12,7 +12,7 @@ void Parameters::init_parameters()
 //bool Parameters::get_SORTING_PARTICLES() { return SORTING_PARTICLES; }
 bool Parameters::get_SORTING_PARTICLES() { return true; }
 bool Parameters::get_VECTORIZE_MOMENTS() { return true; }
-bool Parameters::get_VECTORIZE_MOVER() { return true; }
+bool Parameters::get_VECTORIZE_MOVER() { return false; }
 // this must also return true if we communicate particles per iteration
 //bool Parameters::get_USING_XAVG() { return get_VECTORIZE_MOVER(); }
 bool Parameters::get_USING_XAVG() { return get_SORTING_PARTICLES(); }
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 4d8c532d..79dce395 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -156,7 +156,7 @@ int c_Solver::Init(int argc, char **argv) {
     my_file.close();
   }
   cqsat = SaveDirName + "/VirtualSatelliteTraces" + num_proc.str() + ".txt";
-  // if(myrank==0){
+  // if(myrank==0)
   ofstream my_file(cqsat.c_str(), fstream::binary);
   nsat = 3;
   for (int isat = 0; isat < nsat; isat++) {
@@ -178,6 +178,8 @@ int c_Solver::Init(int argc, char **argv) {
 
 void c_Solver::CalculateMoments() {
 
+  convertParticlesToSoA();
+
   timeTasks_set_main_task(TimeTasks::MOMENTS);
 
   EMf->updateInfoFields(grid,vct,col);
@@ -199,11 +201,13 @@ void c_Solver::CalculateMoments() {
   {
     // since particles are sorted,
     // we can vectorize interpolation of particles to grid
+    convertParticlesToSoA();
     EMf->sumMoments_vectorized(part, grid, vct);
   }
   else
   {
     EMf->setZeroPrimaryMoments();
+    convertParticlesToSoA();
     EMf->sumMoments(part, grid, vct);
   }
   //for (int i = 0; i < ns; i++)
@@ -259,12 +263,18 @@ bool c_Solver::ParticlesMover() {
       // should merely pass EMf->get_fieldForPcls() rather than EMf.
       // use the Predictor Corrector scheme to move particles
       if(Parameters::get_VECTORIZE_MOVER())
+      {
         part[i].mover_PC_vectorized(grid, vct, EMf);
+      }
       else
-        part[i].mover_PC(grid, vct, EMf);
+      {
+        //part[i].mover_PC(grid, vct, EMf);
+        //part[i].mover_PC_AoS2(grid, vct, EMf);
+        part[i].mover_PC_AoS(grid, vct, EMf);
+      }
     }
     }
-    for (int i = 0; i < ns; i++)  // move each species
+    for (int i = 0; i < ns; i++)  // communicate each species
     {
       mem_avail = part[i].communicate_particles(vct);
     }
@@ -350,22 +360,29 @@ void c_Solver::WriteConserved(int cycle) {
 }
 
 void c_Solver::WriteOutput(int cycle) {
-  // OUTPUT to large file, called proc**
+
+  bool write_fields = (cycle % (col->getFieldOutputCycle()) == 0 || cycle == first_cycle);
+
+  bool write_particles = (cycle % (col->getParticlesOutputCycle()) == 0
+                         && col->getParticlesOutputCycle() != 1);
+
+  if(write_particles){ convertParticlesToSoA(); }
 
   if (col->getWriteMethod() == "Parallel") {
-    if (cycle % (col->getFieldOutputCycle()) == 0 || cycle == first_cycle) {
+    if (write_fields) {
       WriteOutputParallel(grid, EMf, col, vct, cycle);
     }
   }
   else
   {
-    if (cycle % (col->getFieldOutputCycle()) == 0 || cycle == first_cycle) {
+    // OUTPUT to large file, called proc**
+    if (write_fields) {
       hdf5_agent.open_append(SaveDirName + "/proc" + num_proc.str() + ".hdf");
       output_mgr.output("Eall + Ball + rhos + Jsall + pressure", cycle);
       // Pressure tensor is available
       hdf5_agent.close();
     }
-    if (cycle % (col->getParticlesOutputCycle()) == 0 && col->getParticlesOutputCycle() != 1) {
+    if (write_particles) {
       hdf5_agent.open_append(SaveDirName + "/proc" + num_proc.str() + ".hdf");
       output_mgr.output("position + velocity + q ", cycle, 1);
       hdf5_agent.close();
@@ -410,3 +427,16 @@ void c_Solver::Finalize() {
   mpi->finalize_mpi();
 }
 
+// convert particles to struct of arrays (assumed by I/O)
+void c_Solver::convertParticlesToSoA()
+{
+  for (int i = 0; i < ns; i++)
+    part[i].convertParticlesToSoA();
+}
+
+// convert particles to array of structs (used in computing)
+void c_Solver::convertParticlesToAoS()
+{
+  for (int i = 0; i < ns; i++)
+    part[i].convertParticlesToAoS();
+}
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 3cc55c8e..1d1f567b 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -312,6 +312,7 @@ void Particles3D::mover_explicit(Grid * grid, VirtualTopology3D * vct, Field * E
 }
 /** mover with a Predictor-Corrector scheme */
 void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
+  convertParticlesToSoA();
   #pragma omp master
   if (vct->getCartesian_rank() == 0) {
     cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
@@ -456,10 +457,314 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
   { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
 }
 
+void Particles3D::mover_PC_AoS2(Grid * grid, VirtualTopology3D * vct, Field * EMf)
+{
+  convertParticlesToAoS();
+  #pragma omp master
+  if (vct->getCartesian_rank() == 0) {
+    cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
+  }
+  const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
+
+  #pragma omp master
+  { timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING); }
+  const pfloat dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
+  #pragma omp for schedule(static)
+  for (int pidx = 0; pidx < nop; pidx++) {
+    // reference the particle in place (no copy is made)
+    SpeciesParticle& pcl = pcls[pidx];
+    const pfloat xorig = pcl.get_x();
+    const pfloat yorig = pcl.get_y();
+    const pfloat zorig = pcl.get_z();
+    const pfloat uorig = pcl.get_u();
+    const pfloat vorig = pcl.get_v();
+    const pfloat worig = pcl.get_w();
+    pfloat xavg = xorig;
+    pfloat yavg = yorig;
+    pfloat zavg = zorig;
+    pfloat uavg;
+    pfloat vavg;
+    pfloat wavg;
+    // calculate the average velocity iteratively
+    for (int innter = 0; innter < NiterMover; innter++) {
+      // interpolation G-->P
+      const pfloat ixd = floor((xavg - xstart) * inv_dx);
+      const pfloat iyd = floor((yavg - ystart) * inv_dy);
+      const pfloat izd = floor((zavg - zstart) * inv_dz);
+      // index of interface to right of cell
+      int ix = 2 + int(ixd);
+      int iy = 2 + int(iyd);
+      int iz = 2 + int(izd);
+
+      // use field data of closest cell in domain
+      //
+      if (ix < 1) ix = 1;
+      if (iy < 1) iy = 1;
+      if (iz < 1) iz = 1;
+      if (ix > nxc) ix = nxc;
+      if (iy > nyc) iy = nyc;
+      if (iz > nzc) iz = nzc;
+      // index of cell of particle;
+      //const int cx = ix - 1;
+      //const int cy = iy - 1;
+      //const int cz = iz - 1;
+
+      const pfloat xi0   = xavg - grid->get_pfloat_XN(ix-1);
+      const pfloat eta0  = yavg - grid->get_pfloat_YN(iy-1);
+      const pfloat zeta0 = zavg - grid->get_pfloat_ZN(iz-1);
+      const pfloat xi1   = grid->get_pfloat_XN(ix) - xavg;
+      const pfloat eta1  = grid->get_pfloat_YN(iy) - yavg;
+      const pfloat zeta1 = grid->get_pfloat_ZN(iz) - zavg;
+
+      pfloat Exl = 0.0;
+      pfloat Eyl = 0.0;
+      pfloat Ezl = 0.0;
+      pfloat Bxl = 0.0;
+      pfloat Byl = 0.0;
+      pfloat Bzl = 0.0;
+
+      pfloat weights[8];
+      const pfloat weight0 = invVOL*xi0;
+      const pfloat weight1 = invVOL*xi1;
+      const pfloat weight00 = weight0*eta0;
+      const pfloat weight01 = weight0*eta1;
+      const pfloat weight10 = weight1*eta0;
+      const pfloat weight11 = weight1*eta1;
+      weights[0] = weight00*zeta0; // weight000
+      weights[1] = weight00*zeta1; // weight001
+      weights[2] = weight01*zeta0; // weight010
+      weights[3] = weight01*zeta1; // weight011
+      weights[4] = weight10*zeta0; // weight100
+      weights[5] = weight10*zeta1; // weight101
+      weights[6] = weight11*zeta0; // weight110
+      weights[7] = weight11*zeta1; // weight111
+      //weights[0] = xi0 * eta0 * zeta0 * qi * invVOL; // weight000
+      //weights[1] = xi0 * eta0 * zeta1 * qi * invVOL; // weight001
+      //weights[2] = xi0 * eta1 * zeta0 * qi * invVOL; // weight010
+      //weights[3] = xi0 * eta1 * zeta1 * qi * invVOL; // weight011
+      //weights[4] = xi1 * eta0 * zeta0 * qi * invVOL; // weight100
+      //weights[5] = xi1 * eta0 * zeta1 * qi * invVOL; // weight101
+      //weights[6] = xi1 * eta1 * zeta0 * qi * invVOL; // weight110
+      //weights[7] = xi1 * eta1 * zeta1 * qi * invVOL; // weight111
+
+      // creating these aliases seems to accelerate this method by about 30%
+      // on the Xeon host processor, suggesting a deficiency in the optimizer.
+      //
+      arr1_pfloat_get field_components[8];
+      field_components[0] = fieldForPcls[ix  ][iy  ][iz  ]; // field000
+      field_components[1] = fieldForPcls[ix  ][iy  ][iz-1]; // field001
+      field_components[2] = fieldForPcls[ix  ][iy-1][iz  ]; // field010
+      field_components[3] = fieldForPcls[ix  ][iy-1][iz-1]; // field011
+      field_components[4] = fieldForPcls[ix-1][iy  ][iz  ]; // field100
+      field_components[5] = fieldForPcls[ix-1][iy  ][iz-1]; // field101
+      field_components[6] = fieldForPcls[ix-1][iy-1][iz  ]; // field110
+      field_components[7] = fieldForPcls[ix-1][iy-1][iz-1]; // field111
+
+      for(int c=0; c<8; c++)
+      {
+        Bxl += weights[c] * field_components[c][0];
+        Byl += weights[c] * field_components[c][1];
+        Bzl += weights[c] * field_components[c][2];
+        Exl += weights[c] * field_components[c][3];
+        Eyl += weights[c] * field_components[c][4];
+        Ezl += weights[c] * field_components[c][5];
+      }
+      const double Omx = qdto2mc*Bxl;
+      const double Omy = qdto2mc*Byl;
+      const double Omz = qdto2mc*Bzl;
+
+      // end interpolation
+      const pfloat omsq = (Omx * Omx + Omy * Omy + Omz * Omz);
+      const pfloat denom = 1.0 / (1.0 + omsq);
+      // solve the position equation
+      const pfloat ut = uorig + qdto2mc * Exl;
+      const pfloat vt = vorig + qdto2mc * Eyl;
+      const pfloat wt = worig + qdto2mc * Ezl;
+      //const pfloat udotb = ut * Bxl + vt * Byl + wt * Bzl;
+      const pfloat udotOm = ut * Omx + vt * Omy + wt * Omz;
+      // solve the velocity equation 
+      uavg = (ut + (vt * Omz - wt * Omy + udotOm * Omx)) * denom;
+      vavg = (vt + (wt * Omx - ut * Omz + udotOm * Omy)) * denom;
+      wavg = (wt + (ut * Omy - vt * Omx + udotOm * Omz)) * denom;
+      // update average position
+      xavg = xorig + uavg * dto2;
+      yavg = yorig + vavg * dto2;
+      zavg = zorig + wavg * dto2;
+    }                           // end of iteration
+    // update the final position and velocity
+    pcl.set_x(xorig + uavg * dt);
+    pcl.set_y(yorig + vavg * dt);
+    pcl.set_z(zorig + wavg * dt);
+    pcl.set_u(2.0 * uavg - uorig);
+    pcl.set_v(2.0 * vavg - vorig);
+    pcl.set_w(2.0 * wavg - worig);
+  }
+  #pragma omp master
+  { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
+}
+void Particles3D::mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf)
+{
+  convertParticlesToAoS();
+  #pragma omp master
+  if (vct->getCartesian_rank() == 0) {
+    cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
+  }
+  const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
+
+  #pragma omp master
+  { timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING); }
+  const double dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
+  #pragma omp for schedule(static)
+  // why does single precision make no difference in execution speed?
+  //#pragma simd vectorlength(VECTOR_WIDTH)
+  for (int pidx = 0; pidx < nop; pidx++) {
+    // reference the particle in place (no copy is made)
+    SpeciesParticle& pcl = pcls[pidx];
+    const double xorig = pcl.get_x();
+    const double yorig = pcl.get_y();
+    const double zorig = pcl.get_z();
+    const double uorig = pcl.get_u();
+    const double vorig = pcl.get_v();
+    const double worig = pcl.get_w();
+    double xavg = xorig;
+    double yavg = yorig;
+    double zavg = zorig;
+    double uavg;
+    double vavg;
+    double wavg;
+    // calculate the average velocity iteratively
+    for (int innter = 0; innter < NiterMover; innter++) {
+
+      // compute weights for field components
+      //
+      double weights[8];
+      // xstart marks start of domain excluding ghosts
+      const double rel_xpos = xavg - xstart;
+      const double rel_ypos = yavg - ystart;
+      const double rel_zpos = zavg - zstart;
+      // cell position minus 1 (due to ghost cells)
+      const double cxm1_pos = rel_xpos * inv_dx;
+      const double cym1_pos = rel_ypos * inv_dy;
+      const double czm1_pos = rel_zpos * inv_dz;
+      //
+      int cx = 1 + int(floor(cxm1_pos));
+      int cy = 1 + int(floor(cym1_pos));
+      int cz = 1 + int(floor(czm1_pos));
+
+      // if the cell is outside the domain, then treat it as
+      // in the nearest ghost cell.
+      //
+      if (cx < 0) cx = 0;
+      if (cy < 0) cy = 0;
+      if (cz < 0) cz = 0;
+      // number of cells in x direction including ghosts is nxc
+      if (cx >= nxc) cx = nxc-1;
+      if (cy >= nyc) cy = nyc-1;
+      if (cz >= nzc) cz = nzc-1;
+
+      // index of interface to right of cell
+      const int ix = cx + 1;
+      const int iy = cy + 1;
+      const int iz = cz + 1;
+
+      // fraction of the distance from the right of the cell
+      const double w1x = cx - cxm1_pos;
+      const double w1y = cy - cym1_pos;
+      const double w1z = cz - czm1_pos;
+      // fraction of distance from the left
+      const double w0x = 1-w1x;
+      const double w0y = 1-w1y;
+      const double w0z = 1-w1z;
+      //const double weight00 = w0x*w0y;
+      //const double weight01 = w0x*w1y;
+      //const double weight10 = w1x*w0y;
+      //const double weight11 = w1x*w1y;
+      //weights[0] = weight00*w0z; // weight000
+      //weights[1] = weight00*w1z; // weight001
+      //weights[2] = weight01*w0z; // weight010
+      //weights[3] = weight01*w1z; // weight011
+      //weights[4] = weight10*w0z; // weight100
+      //weights[5] = weight10*w1z; // weight101
+      //weights[6] = weight11*w0z; // weight110
+      //weights[7] = weight11*w1z; // weight111
+      //
+      weights[0] = w0x*w0y*w0z; // weight000
+      weights[1] = w0x*w0y*w1z; // weight001
+      weights[2] = w0x*w1y*w0z; // weight010
+      weights[3] = w0x*w1y*w1z; // weight011
+      weights[4] = w1x*w0y*w0z; // weight100
+      weights[5] = w1x*w0y*w1z; // weight101
+      weights[6] = w1x*w1y*w0z; // weight110
+      weights[7] = w1x*w1y*w1z; // weight111
+
+      pfloat Exl = 0.0;
+      pfloat Eyl = 0.0;
+      pfloat Ezl = 0.0;
+      pfloat Bxl = 0.0;
+      pfloat Byl = 0.0;
+      pfloat Bzl = 0.0;
+
+      // creating these aliases seems to accelerate this method by about 30%
+      // on the Xeon host processor, suggesting a deficiency in the optimizer.
+      //
+      arr1_pfloat_get field_components[8];
+      field_components[0] = fieldForPcls[ix][iy][iz]; // field000
+      field_components[1] = fieldForPcls[ix][iy][cz]; // field001
+      field_components[2] = fieldForPcls[ix][cy][iz]; // field010
+      field_components[3] = fieldForPcls[ix][cy][cz]; // field011
+      field_components[4] = fieldForPcls[cx][iy][iz]; // field100
+      field_components[5] = fieldForPcls[cx][iy][cz]; // field101
+      field_components[6] = fieldForPcls[cx][cy][iz]; // field110
+      field_components[7] = fieldForPcls[cx][cy][cz]; // field111
+
+      for(int c=0; c<8; c++)
+      {
+        Bxl += weights[c] * field_components[c][0];
+        Byl += weights[c] * field_components[c][1];
+        Bzl += weights[c] * field_components[c][2];
+        Exl += weights[c] * field_components[c][3];
+        Eyl += weights[c] * field_components[c][4];
+        Ezl += weights[c] * field_components[c][5];
+      }
+      const double Omx = qdto2mc*Bxl;
+      const double Omy = qdto2mc*Byl;
+      const double Omz = qdto2mc*Bzl;
+
+      // end interpolation
+      const pfloat omsq = (Omx * Omx + Omy * Omy + Omz * Omz);
+      const pfloat denom = 1.0 / (1.0 + omsq);
+      // solve the position equation
+      const pfloat ut = uorig + qdto2mc * Exl;
+      const pfloat vt = vorig + qdto2mc * Eyl;
+      const pfloat wt = worig + qdto2mc * Ezl;
+      //const pfloat udotb = ut * Bxl + vt * Byl + wt * Bzl;
+      const pfloat udotOm = ut * Omx + vt * Omy + wt * Omz;
+      // solve the velocity equation 
+      uavg = (ut + (vt * Omz - wt * Omy + udotOm * Omx)) * denom;
+      vavg = (vt + (wt * Omx - ut * Omz + udotOm * Omy)) * denom;
+      wavg = (wt + (ut * Omy - vt * Omx + udotOm * Omz)) * denom;
+      // update average position
+      xavg = xorig + uavg * dto2;
+      yavg = yorig + vavg * dto2;
+      zavg = zorig + wavg * dto2;
+    }                           // end of iteration
+    // update the final position and velocity
+    pcl.set_x(xorig + uavg * dt);
+    pcl.set_y(yorig + vavg * dt);
+    pcl.set_z(zorig + wavg * dt);
+    pcl.set_u(2.0 * uavg - uorig);
+    pcl.set_v(2.0 * vavg - vorig);
+    pcl.set_w(2.0 * wavg - worig);
+  }                             // END OF ALL THE PARTICLES
+  #pragma omp master
+  { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
+}
+
 /** mover with a Predictor-Corrector scheme */
 void Particles3D::mover_PC_vectorized(
   Grid * grid, VirtualTopology3D * vct, Field * EMf)
 {
+  convertParticlesToSoA();
   assert_eq(nxc,nxn-1);
   assert_eq(nyc,nyn-1);
   assert_eq(nzc,nzn-1);
@@ -535,8 +840,8 @@ void Particles3D::mover_PC_vectorized(
       ALIGNED(u);
       ALIGNED(v);
       ALIGNED(w);
-      // this should vectorize, but could be faster if particle
-      // data for each mesh cell were aligned.
+      // This pragma helps on Xeon but hurts on Xeon Phi.
+      // On the Phi we could accelerate by processing two particles at a time.
       #pragma simd
       //for(int pidx=bucket_offset_1d[cell]; pidx<numpcls_in_cell; pidx++)
       for(int pidx=bucket_offset; pidx<bucket_end; pidx++)
@@ -672,6 +977,7 @@ void Particles3D::mover_PC_vectorized(
 int Particles3D::communicate_particles(VirtualTopology3D * vct)
 {
   timeTasks_set_communicating(); // communicating until end of scope
+  convertParticlesToSoA(); // hack
   const int avail = communicate(vct);
   if (avail < 0)
     return (-1);
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 0f267a59..446e305b 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -23,6 +23,7 @@ developers: Stefano Markidis, Giovanni Lapenta.
 #include "Field.h"
 #include "MPIdata.h"
 #include "ompdefs.h"
+#include "ipicmath.h"
 
 #include "Particles3Dcomm.h"
 #include "Parameters.h"
@@ -55,6 +56,7 @@ Particles3Dcomm::Particles3Dcomm(){
 }
 /** deallocate particles */
 Particles3Dcomm::~Particles3Dcomm() {
+  delete[]pcls;
   delete[]x;
   delete[]y;
   delete[]z;
@@ -100,6 +102,8 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   nop = col->getNp(species) / (vct->getNprocs());
   np_tot = col->getNp(species);
   npmax = col->getNpMax(species) / (vct->getNprocs());
+  // ensure that npmax is a multiple of AoS_PCLS_AT_A_TIME
+  npmax = roundup_to_multiple(npmax,AoS_PCLS_AT_A_TIME);
   qom = col->getQOM(species);
   uth = col->getUth(species);
   vth = col->getVth(species);
@@ -178,6 +182,21 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   // //////////////////////////////////////////////////////////////
   // ////////////// ALLOCATE ARRAYS /////////////////////////
   // //////////////////////////////////////////////////////////////
+  //
+  // AoS particle representation
+  //
+  // intel new allocates with 64-byte alignment
+  // since particles are 64 bytes wide, every particle
+  // is aligned.
+  pcls = new SpeciesParticle[npmax];
+  particleType = ParticleType::SoA;
+  #ifdef __INTEL_COMPILER
+    assert_eq(sizeof(Particle),64);
+    ALIGNED(pcls);
+  #endif
+  //
+  // SoA particle representation
+  //
   // positions
   x = new double[npmax];
   y = new double[npmax];
@@ -1214,4 +1233,54 @@ void Particles3Dcomm::sort_particles_serial(
 //}
 //#endif
 
+void Particles3Dcomm::copyParticlesToSoA()
+{
+  dprintf("copying to struct of arrays");
+  #pragma omp for
+  for(int pidx=0; pidx<nop; pidx++)
+  {
+    const SpeciesParticle& pcl = pcls[pidx];
+    if(ParticleID) ParticleID[pidx] = pcl.get_ID();
+    x[pidx] = pcl.get_x(0);
+    y[pidx] = pcl.get_x(1);
+    z[pidx] = pcl.get_x(2);
+    u[pidx] = pcl.get_u(0);
+    v[pidx] = pcl.get_u(1);
+    w[pidx] = pcl.get_u(2);
+    q[pidx] = pcl.get_q();
+  }
+}
+
+void Particles3Dcomm::copyParticlesToAoS()
+{
+  #pragma omp for
+  dprintf("copying to array of structs");
+  for(int pidx=0; pidx<nop; pidx++)
+  {
+    pcls[pidx].set( ParticleID ? ParticleID[pidx] : 0,
+      x[pidx],y[pidx],z[pidx],
+      u[pidx],v[pidx],w[pidx],
+      q[pidx]);
+  }
+}
+
+void Particles3Dcomm::convertParticlesToAoS()
+{
+  if(particleType!=ParticleType::AoS)
+  {
+    assert_eq(particleType,ParticleType::SoA);
+    copyParticlesToAoS();
+    particleType = ParticleType::AoS;
+  }
+}
+
+void Particles3Dcomm::convertParticlesToSoA()
+{
+  if(particleType != ParticleType::SoA)
+  {
+    assert_eq(particleType,ParticleType::AoS);
+    copyParticlesToSoA();
+    particleType = ParticleType::SoA;
+  }
+}
 
diff --git a/scripts/ipic.py b/scripts/ipic.py
index 1c9d90ad..64116c84 100755
--- a/scripts/ipic.py
+++ b/scripts/ipic.py
@@ -13,6 +13,11 @@
 # http://pymotw.com/2/subprocess/
 # http://stackoverflow.com/questions/3777301/how-to-call-a-shell-script-from-python-code
 
+def ipic_findcpph(args):
+    # create tags file using ctags
+    command = '''find . -name '*.cpp' -or -name '*.h' | grep -v unused | grep -v postprocessing_tools'''
+    os.system(command)
+
 def ipic_ctags(args):
     # create tags file using ctags
     create_tags_command = \
@@ -238,6 +243,8 @@ def main():
     elif command == "ctags":
         ipic_ctags(args)
         #print "ctags not yet implemented"
+    elif command == "findcpph":
+        ipic_findcpph(args)
     else:
         print progname, command, "not supported"
         sys.exit(-1)

From b9f29583b5b21b1d8b32238b243f3d3cb7d2a98e Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Tue, 21 Jan 2014 15:23:25 +0100
Subject: [PATCH 088/118] added compile options -g -xHost for Xeon

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f70f8451..28d0690d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,7 @@ if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "k1om") ## Xeon Phi
    #set(CMAKE_CXX_FLAGS "-openmp -g -mmic") # set flags for Xeon Phi, totalview
 elseif (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") ## Xeon
    set(CMAKE_CXX_COMPILER "icpc")  
-   set(CMAKE_CXX_FLAGS "-O3 -openmp -fno-exceptions -vec-report")
+   set(CMAKE_CXX_FLAGS "-O3 -openmp -g -xHost -fno-exceptions -vec-report")
 else()
    set(CMAKE_CXX_FLAGS "-O3")
 endif()

From b947bb4c39cef810d15c019b9c60f88a2bd1938b Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 23 Jan 2014 00:42:51 +0100
Subject: [PATCH 089/118] fixed compiler bugs on MIC introduced in e1dba2f4abd

---
 include/Grid3DCU.h            | 24 ++++++++++++------------
 include/ipicmath.h            |  1 +
 particles/Particles3Dcomm.cpp |  5 +++--
 utility/debug.cpp             |  2 +-
 4 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index 72dfc246..c0a60e7e 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -170,18 +170,18 @@ class Grid3DCU                  // :public Grid
   // coordinate accessors
   //
   // calculated equivalents (preferred for accelerator?):
-  const double calcXN(int X) { return xStart+(X-1)*dx;}
-  const double calcYN(int Y) { return yStart+(Y-1)*dy;}
-  const double calcZN(int Z) { return zStart+(Z-1)*dz;}
-  const pfloat &get_pfloat_XN(int X) { return pfloat_node_xcoord[X];}
-  const pfloat &get_pfloat_YN(int Y) { return pfloat_node_ycoord[Y];}
-  const pfloat &get_pfloat_ZN(int Z) { return pfloat_node_zcoord[Z];}
-  const double &getXN(int X) { return node_xcoord[X];}
-  const double &getYN(int Y) { return node_ycoord[Y];}
-  const double &getZN(int Z) { return node_zcoord[Z];}
-  const double &getXC(int X) { return center_xcoord[X];}
-  const double &getYC(int Y) { return center_ycoord[Y];}
-  const double &getZC(int Z) { return center_zcoord[Z];}
+  double calcXN(int X)const{ return xStart+(X-1)*dx;}
+  double calcYN(int Y)const{ return yStart+(Y-1)*dy;}
+  double calcZN(int Z)const{ return zStart+(Z-1)*dz;}
+  const pfloat &get_pfloat_XN(int X)const{ return pfloat_node_xcoord[X];}
+  const pfloat &get_pfloat_YN(int Y)const{ return pfloat_node_ycoord[Y];}
+  const pfloat &get_pfloat_ZN(int Z)const{ return pfloat_node_zcoord[Z];}
+  const double &getXN(int X)const{ return node_xcoord[X];}
+  const double &getYN(int Y)const{ return node_ycoord[Y];}
+  const double &getZN(int Z)const{ return node_zcoord[Z];}
+  const double &getXC(int X)const{ return center_xcoord[X];}
+  const double &getYC(int Y)const{ return center_ycoord[Y];}
+  const double &getZC(int Z)const{ return center_zcoord[Z];}
   //
   // The following could be eliminated in favor of the previous
   // unless we truly anticipate generalizing to a deformed
diff --git a/include/ipicmath.h b/include/ipicmath.h
index edf0ca5e..a042ef92 100644
--- a/include/ipicmath.h
+++ b/include/ipicmath.h
@@ -1,5 +1,6 @@
 #ifndef _ipicmath_h_
 #define _ipicmath_h_
+#include "assert.h"
 
 // valid if roundup power is representable.
 inline int
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 446e305b..12adb178 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -25,6 +25,7 @@ developers: Stefano Markidis, Giovanni Lapenta.
 #include "ompdefs.h"
 #include "ipicmath.h"
 
+#include "Particle.h"
 #include "Particles3Dcomm.h"
 #include "Parameters.h"
 
@@ -191,7 +192,7 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   pcls = new SpeciesParticle[npmax];
   particleType = ParticleType::SoA;
   #ifdef __INTEL_COMPILER
-    assert_eq(sizeof(Particle),64);
+    assert_eq(sizeof(SpeciesParticle),64);
     ALIGNED(pcls);
   #endif
   //
@@ -1253,8 +1254,8 @@ void Particles3Dcomm::copyParticlesToSoA()
 
 void Particles3Dcomm::copyParticlesToAoS()
 {
-  #pragma omp for
   dprintf("copying to array of structs");
+  #pragma omp for
   for(int pidx=0; pidx<nop; pidx++)
   {
     pcls[pidx].set( ParticleID ? ParticleID[pidx] : 0,
diff --git a/utility/debug.cpp b/utility/debug.cpp
index 44d79626..0d5d4908 100644
--- a/utility/debug.cpp
+++ b/utility/debug.cpp
@@ -70,7 +70,7 @@ void fprintf_fileLine(FILE * fptr,
   const char *type, const char *func, const char *file, int line_number,
   const char *format, ...)
 {
-  //if(!is_output_thread()) return; // temporary
+  if(!is_output_thread()) return; // temporary
 
   // writing directly to fptr would avoid limiting the length
   // of the output string, but by first writing to a string

From 4d5156e5ed65562e707dd46a127cc940fee25a5e Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 23 Jan 2014 01:47:49 +0100
Subject: [PATCH 090/118] added TimeTasks: TRANSPOSE_PCLS_TO_SOA and
 TRANSPOSE_PCLS_TO_AOS

---
 include/TimeTasks.h           | 2 ++
 particles/Particles3Dcomm.cpp | 3 +++
 utility/TimeTasks.cpp         | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/include/TimeTasks.h b/include/TimeTasks.h
index 3ac67b23..ff07f26c 100644
--- a/include/TimeTasks.h
+++ b/include/TimeTasks.h
@@ -32,6 +32,8 @@ class TimeTasks
     MOMENT_REDUCTION,
     MOVER_PCL_SORTING,
     MOVER_PCL_MOVING,
+    TRANSPOSE_PCLS_TO_AOS,
+    TRANSPOSE_PCLS_TO_SOA,
     NUMBER_OF_TASKS // this line should be last
   };
 
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 12adb178..4533266d 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -33,6 +33,7 @@ developers: Stefano Markidis, Giovanni Lapenta.
 #include <vector>
 #include <complex>
 #include "debug.h"
+#include "TimeTasks.h"
 
 using std::cout;
 using std::cerr;
@@ -1236,6 +1237,7 @@ void Particles3Dcomm::sort_particles_serial(
 
 void Particles3Dcomm::copyParticlesToSoA()
 {
+  timeTasks_set_task(TimeTasks::TRANSPOSE_PCLS_TO_SOA);
   dprintf("copying to struct of arrays");
   #pragma omp for
   for(int pidx=0; pidx<nop; pidx++)
@@ -1254,6 +1256,7 @@ void Particles3Dcomm::copyParticlesToSoA()
 
 void Particles3Dcomm::copyParticlesToAoS()
 {
+  timeTasks_set_task(TimeTasks::TRANSPOSE_PCLS_TO_AOS);
   dprintf("copying to array of structs");
   #pragma omp for
   for(int pidx=0; pidx<nop; pidx++)
diff --git a/utility/TimeTasks.cpp b/utility/TimeTasks.cpp
index 13001f88..b37ac4e5 100644
--- a/utility/TimeTasks.cpp
+++ b/utility/TimeTasks.cpp
@@ -23,6 +23,8 @@ static const char *taskNames[] = // order must agree with Tasks in TimeTasks.h
   "moment_reduction",
   "mover_pcl_sorting",
   "mover_pcl_moving",
+  "transpose_pcls_to_AoS",
+  "transpose_pcls_to_SoA",
   "number_of_tasks"
 };
 

From ba6773783b4a677e3e6f939974fae81dee81ce24 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 23 Jan 2014 01:49:25 +0100
Subject: [PATCH 091/118] restricted TimeTasks to master thread of rank 0
 process

---
 include/TimeTasks.h   | 37 ++++---------------------
 utility/TimeTasks.cpp | 64 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 67 insertions(+), 34 deletions(-)

diff --git a/include/TimeTasks.h b/include/TimeTasks.h
index ff07f26c..a00d8d76 100644
--- a/include/TimeTasks.h
+++ b/include/TimeTasks.h
@@ -120,16 +120,8 @@ class TimeTasks_caller_to_set_main_task_for_scope
   double start_time;
   TimeTasks::Tasks task;
  public:
-  TimeTasks_caller_to_set_main_task_for_scope(TimeTasks::Tasks _task) :
-    task(_task)
-  {
-    start_time = MPI_Wtime();
-    timeTasks.start_main_task(task);
-  }
-  ~TimeTasks_caller_to_set_main_task_for_scope()
-  {
-    timeTasks.end_main_task(task, start_time);
-  }
+  TimeTasks_caller_to_set_main_task_for_scope(TimeTasks::Tasks _task);
+  ~TimeTasks_caller_to_set_main_task_for_scope();
 };
 
 class TimeTasks_caller_to_set_task_for_scope
@@ -138,27 +130,8 @@ class TimeTasks_caller_to_set_task_for_scope
   double start_time;
   TimeTasks::Tasks task;
  public:
-  TimeTasks_caller_to_set_task_for_scope(TimeTasks::Tasks _task) :
-    task(_task)
-  {
-    already_active = timeTasks.is_active(task);
-    if(!already_active)
-    {
-      start_time = MPI_Wtime();
-      timeTasks.start_task(task);
-    }
-  }
-  ~TimeTasks_caller_to_set_task_for_scope()
-  {
-    if(already_active)
-    {
-      assert(timeTasks.is_active(task));
-    }
-    else
-    {
-      timeTasks.end_task(task, start_time);
-    }
-  }
+  TimeTasks_caller_to_set_task_for_scope(TimeTasks::Tasks _task);
+  ~TimeTasks_caller_to_set_task_for_scope();
 };
 
 class TimeTasks_caller_to_set_communication_mode_for_scope
@@ -174,7 +147,7 @@ class TimeTasks_caller_to_set_communication_mode_for_scope
 // These macros could be changed to provide file and line number
 //
 // We need to create nonanonymous instances so that the destructor
-// will not be called until the end of the scope, so we use the preprocessor.
+// will not be called until the end of the scope, so we use the preprocessor
 // to generate unique names of nonanonymous instances.
 //
 #define timeTasks_set_main_task(task) \
diff --git a/utility/TimeTasks.cpp b/utility/TimeTasks.cpp
index b37ac4e5..ec908d73 100644
--- a/utility/TimeTasks.cpp
+++ b/utility/TimeTasks.cpp
@@ -4,6 +4,7 @@
 #include "TimeTasks.h"
 #include "asserts.h"
 #include "MPIdata.h" // for get_rank
+#include "parallel.h"
 #include "debug.h"
 
 /** implementation of declarations in utility/TimeTasks.h **/
@@ -49,6 +50,7 @@ void TimeTasks::resetCycle()
 }
 void TimeTasks::start_main_task(TimeTasks::Tasks taskid)
 {
+  if(!is_output_thread()) return;
   assert(is_exclusive(taskid));
   assert_ne(active_task, taskid);
   active_task = taskid;
@@ -57,6 +59,7 @@ void TimeTasks::start_main_task(TimeTasks::Tasks taskid)
 }
 void TimeTasks::start_task(TimeTasks::Tasks taskid)
 {
+  if(!is_output_thread()) return;
   assert(!is_exclusive(taskid));
   assert(!active[taskid]);
   active[taskid]=true;
@@ -64,6 +67,7 @@ void TimeTasks::start_task(TimeTasks::Tasks taskid)
 // have to manage the task stack explicitly
 void TimeTasks::start_task(TimeTasks::Tasks taskid, double start_time)
 {
+  if(!is_output_thread()) return;
   if(stack_depth[taskid]==0)
   {
     start_times[taskid]=start_time;
@@ -73,11 +77,13 @@ void TimeTasks::start_task(TimeTasks::Tasks taskid, double start_time)
 }
 void TimeTasks::end_main_task(TimeTasks::Tasks taskid, double start_time)
 {
+  if(!is_output_thread()) return;
   end_task(taskid, start_time);
   active_task = NONE;
 }
 void TimeTasks::end_task(TimeTasks::Tasks taskid, double start_time)
 {
+  if(!is_output_thread()) return;
   assert(active[taskid]);
   double now = MPI_Wtime();
   // compute time spent on task
@@ -87,6 +93,7 @@ void TimeTasks::end_task(TimeTasks::Tasks taskid, double start_time)
 // have to manage the task stack explicitly
 void TimeTasks::end_task(TimeTasks::Tasks taskid)
 {
+  if(!is_output_thread()) return;
   stack_depth[taskid]--;
   assert_ge(stack_depth[taskid],0);
   if(stack_depth[taskid]==0)
@@ -96,6 +103,7 @@ void TimeTasks::end_task(TimeTasks::Tasks taskid)
 }
 void TimeTasks::end_communicating(double start_time)
 {
+  if(!is_output_thread()) return;
   assert(active_task);
   assert(communicating);
   double additional_communication_time = MPI_Wtime()-start_time;
@@ -105,9 +113,10 @@ void TimeTasks::end_communicating(double start_time)
 #define TIMING_PREFIX "| "
 void TimeTasks::print_cycle_times(int cycle)
 {
+  if(!is_output_thread()) return;
   FILE* file = stdout;
   // we could report average for all processes
-  if(!MPIdata::get_rank())
+  //if(!MPIdata::get_rank())
   {
     fflush(file);
     fprintf(file,"=== times for cycle %d for rank %d === \n",
@@ -147,9 +156,14 @@ void TimeTasks::print_cycle_times(int cycle)
     fprintf(file, TIMING_PREFIX "time   subtask\n");
     for(int e=LAST+1; e<NUMBER_OF_TASKS; e++)
     {
+      // do not show tasks that are not executed
+      double elapsed_time = get_time(e);
+      if(!elapsed_time)
+        continue;
+
       assert_eq(stack_depth[e],0);
       fprintf(file, TIMING_PREFIX "%6.3f %s\n",
-      get_time(e),
+      elapsed_time,
       get_taskname(e));
     }
     
@@ -162,6 +176,7 @@ void TimeTasks::print_cycle_times(int cycle)
 // 
 void TimeTasks::operator/=(int num)
 {
+  assert(false); // this method is not in use.
   for(int e=NONE+1;e<NUMBER_OF_TASKS;e++)
   {
     task_duration[e]/=num;
@@ -171,6 +186,7 @@ void TimeTasks::operator/=(int num)
 }
 void TimeTasks::operator+=(const TimeTasks& arg)
 {
+  assert(false); // this method is not in use.
   active_task = arg.active_task;
   communicating = arg.communicating;
   for(int e=NONE+1;e<NUMBER_OF_TASKS;e++)
@@ -184,6 +200,7 @@ void TimeTasks::operator+=(const TimeTasks& arg)
 }
 void TimeTasks::operator=(const TimeTasks& arg)
 {
+  assert(false); // this method is not in use.
   active_task = arg.active_task;
   communicating = arg.communicating;
   for(int e=NONE+1;e<NUMBER_OF_TASKS;e++)
@@ -196,9 +213,51 @@ void TimeTasks::operator=(const TimeTasks& arg)
   }
 }
 
+TimeTasks_caller_to_set_main_task_for_scope::
+TimeTasks_caller_to_set_main_task_for_scope(TimeTasks::Tasks _task) :
+  task(_task)
+{
+  if(!is_output_thread()) return;
+  start_time = MPI_Wtime();
+  timeTasks.start_main_task(task);
+}
+TimeTasks_caller_to_set_main_task_for_scope::
+~TimeTasks_caller_to_set_main_task_for_scope()
+{
+  if(!is_output_thread()) return;
+  timeTasks.end_main_task(task, start_time);
+}
+
+TimeTasks_caller_to_set_task_for_scope::
+TimeTasks_caller_to_set_task_for_scope(TimeTasks::Tasks _task)
+{
+  if(!is_output_thread()) return;
+  task = _task;
+  already_active = timeTasks.is_active(task);
+  if(!already_active)
+  {
+    start_time = MPI_Wtime();
+    timeTasks.start_task(task);
+  }
+}
+TimeTasks_caller_to_set_task_for_scope::
+~TimeTasks_caller_to_set_task_for_scope()
+{
+  if(!is_output_thread()) return;
+  if(already_active)
+  {
+    assert(timeTasks.is_active(task));
+  }
+  else
+  {
+    timeTasks.end_task(task, start_time);
+  }
+}
+
 TimeTasks_caller_to_set_communication_mode_for_scope::
 TimeTasks_caller_to_set_communication_mode_for_scope()
 {
+  if(!is_output_thread()) return;
   already_communicating = timeTasks.get_communicating();
   if(!already_communicating)
   {
@@ -209,6 +268,7 @@ TimeTasks_caller_to_set_communication_mode_for_scope()
 TimeTasks_caller_to_set_communication_mode_for_scope::
 ~TimeTasks_caller_to_set_communication_mode_for_scope()
 {
+  if(!is_output_thread()) return;
   if(!already_communicating)
   {
     timeTasks.end_communicating(start_time);

From 1a5b13124bd6eafe874ee99c7b5483ed82a4a296 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 23 Jan 2014 05:27:13 +0100
Subject: [PATCH 092/118] created sumMoments_AoS(). code blowing up at 10
 cycles.

---
 fields/EMfields3D.cpp     | 167 +++++++++++++++++++++++++++++++++++++-
 include/EMfields3D.h      |   1 +
 include/Particles3Dcomm.h |   1 +
 main/Parameters.cpp       |   2 +-
 main/iPic3Dlib.cpp        |   7 +-
 5 files changed, 174 insertions(+), 4 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index c5af87f0..38f544c4 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -395,7 +395,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
   for (int i = 0; i < ns; i++)
   {
     const Particles3Dcomm& pcls = part[i];
-    assert_eq(pcls.get_particleType(), ParticleType::AoS);
+    assert_eq(pcls.get_particleType(), ParticleType::SoA);
     const int is = pcls.get_ns();
     assert_eq(i,is);
 
@@ -586,6 +586,171 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
   }
 }
 
+/*! sum moments (interp_P2G) of AoS particles: per-thread accumulation, then reduction */
+void EMfields3D::sumMoments_AoS(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct)
+{
+  const double inv_dx = 1.0 / dx;
+  const double inv_dy = 1.0 / dy;
+  const double inv_dz = 1.0 / dz;
+  const int nxn = grid->getNXN();
+  const int nyn = grid->getNYN();
+  const int nzn = grid->getNZN();
+  const double xstart = grid->getXstart();
+  const double ystart = grid->getYstart();
+  const double zstart = grid->getZstart();
+  // To make memory use scale to a large number of threads, we
+  // could first apply an efficient parallel sorting algorithm
+  // to the particles and then accumulate moments in smaller
+  // subarrays.
+  //#ifdef _OPENMP
+  #pragma omp parallel
+  {
+  for (int species_idx = 0; species_idx < ns; species_idx++)
+  {
+    const Particles3Dcomm& pcls = part[species_idx];
+    assert_eq(pcls.get_particleType(), ParticleType::AoS);
+    const int is = pcls.get_ns();
+    assert_eq(species_idx,is);
+
+    const int nop = pcls.getNOP();
+
+    int thread_num = omp_get_thread_num();
+    { timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION); }
+    Moments10& speciesMoments10 = fetch_moments10Array(thread_num);
+    arr4_double moments = speciesMoments10.fetch_arr();
+    // clear the accumulation array fetched for this thread
+    // moments.setmode(ompmode::mine);
+    // moments.setall(0.);
+    // (done by hand through a flat pointer instead of the calls above)
+    double *moments1d = &moments[0][0][0][0];
+    int moments1dsize = moments.get_size();
+    for(int i=0; i<moments1dsize; i++) moments1d[i]=0;
+    // no "nowait" on the particle loop: the reduction below reads every thread's array
+    #pragma omp barrier
+    #pragma omp for
+    for (int pidx = 0; pidx < nop; pidx++)
+    {
+      const SpeciesParticle& pcl = pcls.get_pcl(pidx);
+      // compute the quadratic moments of velocity
+      // velmoments = {1, u, v, w, uu, uv, uw, vv, vw, ww}
+      const double ui=pcl.get_u();
+      const double vi=pcl.get_v();
+      const double wi=pcl.get_w();
+      const double uui=ui*ui;
+      const double uvi=ui*vi;
+      const double uwi=ui*wi;
+      const double vvi=vi*vi;
+      const double vwi=vi*wi;
+      const double wwi=wi*wi;
+      double velmoments[10];
+      velmoments[0] = 1.;
+      velmoments[1] = ui;
+      velmoments[2] = vi;
+      velmoments[3] = wi;
+      velmoments[4] = uui;
+      velmoments[5] = uvi;
+      velmoments[6] = uwi;
+      velmoments[7] = vvi;
+      velmoments[8] = vwi;
+      velmoments[9] = wwi;
+
+      //
+      // compute the weights to distribute the moments
+      // (the offset of 2 presumably accounts for ghost nodes -- TODO confirm)
+      const int ix = 2 + int (floor((pcl.get_x() - xstart) * inv_dx));
+      const int iy = 2 + int (floor((pcl.get_y() - ystart) * inv_dy));
+      const int iz = 2 + int (floor((pcl.get_z() - zstart) * inv_dz));
+      const double xi0   = pcl.get_x() - grid->getXN(ix-1);
+      const double eta0  = pcl.get_y() - grid->getYN(iy-1);
+      const double zeta0 = pcl.get_z() - grid->getZN(iz-1);
+      const double xi1   = grid->getXN(ix) - pcl.get_x();
+      const double eta1  = grid->getYN(iy) - pcl.get_y();
+      const double zeta1 = grid->getZN(iz) - pcl.get_z();
+      const double qi = pcl.get_q();
+      const double invVOLqi = invVOL*qi;
+      const double weight0 = invVOLqi * xi0;
+      const double weight1 = invVOLqi * xi1;
+      const double weight00 = weight0*eta0;
+      const double weight01 = weight0*eta1;
+      const double weight10 = weight1*eta0;
+      const double weight11 = weight1*eta1;
+      double weights[8];
+      weights[0] = weight00*zeta0; // weight000
+      weights[1] = weight00*zeta1; // weight001
+      weights[2] = weight01*zeta0; // weight010
+      weights[3] = weight01*zeta1; // weight011
+      weights[4] = weight10*zeta0; // weight100
+      weights[5] = weight10*zeta1; // weight101
+      weights[6] = weight11*zeta0; // weight110
+      weights[7] = weight11*zeta1; // weight111
+      //weights[0] = xi0 * eta0 * zeta0 * qi * invVOL; // weight000
+      //weights[1] = xi0 * eta0 * zeta1 * qi * invVOL; // weight001
+      //weights[2] = xi0 * eta1 * zeta0 * qi * invVOL; // weight010
+      //weights[3] = xi0 * eta1 * zeta1 * qi * invVOL; // weight011
+      //weights[4] = xi1 * eta0 * zeta0 * qi * invVOL; // weight100
+      //weights[5] = xi1 * eta0 * zeta1 * qi * invVOL; // weight101
+      //weights[6] = xi1 * eta1 * zeta0 * qi * invVOL; // weight110
+      //weights[7] = xi1 * eta1 * zeta1 * qi * invVOL; // weight111
+
+      // add particle to moments
+      {
+        arr1_double_fetch momentsArray[8];
+        arr2_double_fetch moments00 = moments[ix  ][iy  ];
+        arr2_double_fetch moments01 = moments[ix  ][iy-1];
+        arr2_double_fetch moments10 = moments[ix-1][iy  ];
+        arr2_double_fetch moments11 = moments[ix-1][iy-1];
+        momentsArray[0] = moments00[iz  ]; // moments000
+        momentsArray[1] = moments00[iz-1]; // moments001
+        momentsArray[2] = moments01[iz  ]; // moments010
+        momentsArray[3] = moments01[iz-1]; // moments011
+        momentsArray[4] = moments10[iz  ]; // moments100
+        momentsArray[5] = moments10[iz-1]; // moments101
+        momentsArray[6] = moments11[iz  ]; // moments110
+        momentsArray[7] = moments11[iz-1]; // moments111
+
+        for(int m=0; m<10; m++)
+        for(int c=0; c<8; c++)
+        {
+          momentsArray[c][m] += velmoments[m]*weights[c];
+        }
+      }
+    }
+    if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION);
+
+    // reduction
+    if(!thread_num) timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION);
+
+    // reduce moments in parallel
+    // (arrays are added one at a time; each "omp for" ends with an implicit barrier)
+    for(int thread_num=0;thread_num<get_sizeMomentsArray();thread_num++)
+    {
+      arr4_double moments = fetch_moments10Array(thread_num).fetch_arr();
+      #pragma omp for collapse(2)
+      for(int i=0;i<nxn;i++)
+      for(int j=0;j<nyn;j++)
+      for(int k=0;k<nzn;k++)
+      {
+        rhons[is][i][j][k] += invVOL*moments[i][j][k][0];
+        Jxs  [is][i][j][k] += invVOL*moments[i][j][k][1];
+        Jys  [is][i][j][k] += invVOL*moments[i][j][k][2];
+        Jzs  [is][i][j][k] += invVOL*moments[i][j][k][3];
+        pXXsn[is][i][j][k] += invVOL*moments[i][j][k][4];
+        pXYsn[is][i][j][k] += invVOL*moments[i][j][k][5];
+        pXZsn[is][i][j][k] += invVOL*moments[i][j][k][6];
+        pYYsn[is][i][j][k] += invVOL*moments[i][j][k][7];
+        pYZsn[is][i][j][k] += invVOL*moments[i][j][k][8];
+        pZZsn[is][i][j][k] += invVOL*moments[i][j][k][9];
+      }
+    }
+    if(!thread_num) timeTasks_end_task(TimeTasks::MOMENT_REDUCTION);
+  }
+  }
+  for (int i = 0; i < ns; i++)
+  {
+    communicateGhostP2G(i, 0, 0, 0, 0, vct);
+  }
+}
+
 inline void compute_moments(double velmoments[10], double weights[8],
   int i,
   double const * const x,
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 8a93172f..32599b9f 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -125,6 +125,7 @@ class EMfields3D                // :public Field
     void communicateGhostP2G(int ns, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, VirtualTopology3D * vct);
     /*! sum moments (interp_P2G) versions */
     void sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
+    void sumMoments_AoS(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
     void sumMoments_vectorized(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
     void sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
     /*! add accumulated moments to the moments for a given species */
diff --git a/include/Particles3Dcomm.h b/include/Particles3Dcomm.h
index 4b6fb098..79e3fb0e 100644
--- a/include/Particles3Dcomm.h
+++ b/include/Particles3Dcomm.h
@@ -121,6 +121,7 @@ class Particles3Dcomm // :public Particles
   // inline get accessors
   //
   ParticleType::Type get_particleType()const { return particleType; }
+  const SpeciesParticle& get_pcl(int pidx)const{ return pcls[pidx]; }
   double *getXall()  const { return (x); }
   double *getYall()  const { return (y); }
   double *getZall()  const { return (z); }
diff --git a/main/Parameters.cpp b/main/Parameters.cpp
index 9a51e13f..00d99b7a 100644
--- a/main/Parameters.cpp
+++ b/main/Parameters.cpp
@@ -11,7 +11,7 @@ void Parameters::init_parameters()
 
 //bool Parameters::get_SORTING_PARTICLES() { return SORTING_PARTICLES; }
 bool Parameters::get_SORTING_PARTICLES() { return true; }
-bool Parameters::get_VECTORIZE_MOMENTS() { return true; }
+bool Parameters::get_VECTORIZE_MOMENTS() { return false; }
 bool Parameters::get_VECTORIZE_MOVER() { return false; }
 // this must also return true if we communicate particles per iteration
 //bool Parameters::get_USING_XAVG() { return get_VECTORIZE_MOVER(); }
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 79dce395..6676b8e4 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -209,6 +209,8 @@ void c_Solver::CalculateMoments() {
     EMf->setZeroPrimaryMoments();
     convertParticlesToSoA();
     EMf->sumMoments(part, grid, vct);
+    //convertParticlesToAoS();
+    //EMf->sumMoments_AoS(part, grid, vct);
   }
   //for (int i = 0; i < ns; i++)
   //{
@@ -265,12 +267,13 @@ bool c_Solver::ParticlesMover() {
       if(Parameters::get_VECTORIZE_MOVER())
       {
         part[i].mover_PC_vectorized(grid, vct, EMf);
+        //part[i].mover_PC_AoS_XeonVec(grid, vct, EMf);
       }
       else
       {
-        //part[i].mover_PC(grid, vct, EMf);
+        part[i].mover_PC(grid, vct, EMf);
+        //part[i].mover_PC_AoS(grid, vct, EMf);
         //part[i].mover_PC_AoS2(grid, vct, EMf);
-        part[i].mover_PC_AoS(grid, vct, EMf);
       }
     }
     }

From 9164960b345889329951190eb0dc72a4804fc5e4 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 29 Jan 2014 19:55:52 +0100
Subject: [PATCH 093/118] Fixed compile errors on Xeon Phi

---
 CMakeLists.txt | 116 +++++++++++++++++++------------------------------
 1 file changed, 44 insertions(+), 72 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fa72f8ee..43a72698 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 2.8.8) 
 # compiler set in ../cmake/cmake_template.cmake.XeonPhi
-message ("for Xeon Phi:")
-message ("cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/cmake_template.cmake.XeonPhi")
+#message ("for Xeon Phi:")
+#message ("cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/cmake_template.cmake.XeonPhi")
 #message ("for Xeon:")
 #message ("cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/cmake_template.cmake.Xeon")
 #
@@ -21,13 +21,26 @@ set(LIBRARY_OUTPUT_PATH lib)
 # Set compiler flags per system
 #
 if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "k1om") ## Xeon Phi
-   set(CMAKE_CXX_FLAGS "-O3 -openmp -fno-exceptions -vec-report -mmic")
+   option(IPIC_XEONPHI "ipic xeon phi standard compile flags" on)
+   if(IPIC_XEONPHI) 
+     set(CMAKE_CXX_FLAGS "-O3 -openmp -fno-exceptions -vec-report -mmic")
+   else()
+     set(CMAKE_CXX_FLAGS "-mmic")
+   endif()
+   #set(CMAKE_CXX_FLAGS "$(CMAKE_CXX_FLAGS) -mmic")
+   #set(CMAKE_CXX_FLAGS "-O3 -openmp -fno-exceptions -vec-report -mmic")
    #set(CMAKE_CXX_FLAGS "-openmp -g -mmic") # set flags for Xeon Phi, totalview
 elseif (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") ## Xeon
-   set(CMAKE_CXX_COMPILER "icpc")  
-   set(CMAKE_CXX_FLAGS "-O3 -openmp -fno-exceptions -vec-report")
+   option(IPIC_XEON "icpc with optimization" on)
+   if(IPIC_XEON) 
+     # reporting: -g -vec-report
+     # optimization: -O3 -xHost -fno-exceptions
+     set(CMAKE_CXX_FLAGS "-openmp -fno-exceptions -O3 -xHost -vec-report")
+     set(CMAKE_CXX_COMPILER "icpc")  
+   endif()
+   #set(CMAKE_CXX_FLAGS "-O3 -openmp -g -xHost -fno-exceptions -vec-report")
 else()
-   set(CMAKE_CXX_FLAGS "-O3")
+   #set(CMAKE_CXX_FLAGS "-O3")
 endif()
 
 #
@@ -86,7 +99,6 @@ file(
         communication/*.cpp
         fields/*.cpp
         grids/*.cpp
-        iPIC3D.cpp/*.cpp
         inputoutput/*.cpp
         mathlib/*.cpp
         mpidata/*.cpp
@@ -99,50 +111,34 @@ file(
 )
 
 #
-# Compilation options
+# Macro definitions
 #
 
-set(IPIC_TESTS_DIR "${CMAKE_BINARY_DIR}/tests" CACHE STRING "Location of the source files for iPic3D")
-
-option(IPIC_PARALLEL_HDF5 "Output is done using the parallel HDF5 library" OFF)
-if(IPIC_PARALLEL_HDF5)
-  add_definitions(-DPHDF5)
-endif()
-
-option(IPIC_BATSRUS "Compile library with coupling code for BATS-R-US" OFF)
-if(IPIC_BATSRUS)
+set(TEST_B $ENV{BATSRUS})
+if(DEFINED TEST_B)
   add_definitions( -DBATSRUS )
-endif()
-
-option(IPIC_XEONPHI "Compile options for Xeon Phi" OFF)
-if(IPIC_XEONPHI)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-endif()
-
-option(IPIC_TESTS "Set up the code tests" OFF)
-
-option(IPIC_BUILD_SHARED "Compile shared library" OFF)
-if(IPIC_BUILD_SHARED)
-  set(IPIC_BUILD_TYPE SHARED)
-else()
-  set(IPIC_BUILD_TYPE STATIC)
-endif()
+  message(" WARNING: BATSRUS flag is active.")
+else(DEFINED TEST_B)
+  message(" INFO: BATSRUS is not active.")
+endif(DEFINED TEST_B)
 
 #
 # Executable declaration
 #
 
+# Particle solver
 add_executable(
         iPic3D
         iPic3D.cpp
 )
 
+
 #build iPic as a library also
 #libiPic3Dlib.so in folder lib
 add_library(
-        iPic3Dlib
-        ${IPIC_BUILD_TYPE}
-        ${inc_files}
+        iPic3Dlib  #name of the library
+        SHARED	   #type of the library
+        ${inc_files}    # stuff to build the library
         ${src_files}
 )
 
@@ -163,40 +159,16 @@ target_link_libraries(
          iPic3Dlib
 )
 
-## ## to save the executable in the folder where the CMakeLists.txt file is, i.e. CMAKE_CURRENT_SOURCE_DIR
-## #set_target_properties(iPic3D PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-## 
-## ## debug releases have a _d appended to the executable
-## set_target_properties(iPic3D PROPERTIES DEBUG_POSTFIX "_d")
-## 
-## 
-## message("Which system am I compiling for:")
-## message("CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
-## 
-## message("Compiler & compiler flags:")
-## message("CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER}")
-## message("CMAKE_CXX_FLAGS is ${CMAKE_CXX_FLAGS}")
-
-#
-# Code testing
-#
-
-if(IPIC_TESTS)
-  enable_testing()
-
-  add_test(NAME GEM-test
-           COMMAND ${CMAKE_COMMAND}
-           -DIPIC_TESTS_DIR=${IPIC_TESTS_DIR}
-           -DIPIC_SOURCE_DIR=${CMAKE_SOURCE_DIR}
-           -DIPICEXEC=$<TARGET_FILE:iPic3D>
-           -DMPIEXEC=${MPIEXEC}
-           -DMPIEXEC_NUMPROC_FLAG=${MPIEXEC_NUMPROC_FLAG}
-           -DMPIEXEC_POSTFLAGS=${MPIEXEC_POSTFLAGS}
-           -DIPIC_TESTS_DIR=${IPIC_TESTS_DIR}
-           -P ${CMAKE_SOURCE_DIR}/testfiles/CMakeRunTest-GEM.txt)
-
-  add_test(NAME uname-test
-           COMMAND ${CMAKE_COMMAND}
-           -P ${CMAKE_SOURCE_DIR}/testfiles/CMakeRunTest-uname.txt)
-
-endif(IPIC_TESTS)
+## to save the executable in the folder where the CMakeLists.txt file is, i.e. CMAKE_CURRENT_SOURCE_DIR
+#set_target_properties(iPic3D PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+
+## debug releases have a _d appended to the executable
+set_target_properties(iPic3D PROPERTIES DEBUG_POSTFIX "_d")
+
+
+message("Which system am I compiling for:")
+message("CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
+
+message("Compiler & compiler flags:")
+message("CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER}")
+message("CMAKE_CXX_FLAGS is ${CMAKE_CXX_FLAGS}")

From 6b150e824f546de5aa78bcfea2c3b8796813fc7a Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 30 Jan 2014 00:18:11 +0100
Subject: [PATCH 094/118] Implemented support for 'ipic run' and 'ipic show'
 commands

---
 scripts/ipic.py | 351 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 296 insertions(+), 55 deletions(-)

diff --git a/scripts/ipic.py b/scripts/ipic.py
index 1c9d90ad..069ed895 100755
--- a/scripts/ipic.py
+++ b/scripts/ipic.py
@@ -1,78 +1,278 @@
 #!/usr/bin/env python
 
+import os
 import sys
+import subprocess
+import socket # gethostname()
+import re # regular expression
+#from optparse import OptionParser
 import getopt
 # http://docs.python.org/2/library/collections.html#collections.deque
 from collections import deque # double-ended queue
-import os
-#from optparse import OptionParser
-
+import inspect
+#
 # useful documentation:
 #
 # http://effbot.org/zone/python-list.htm
 # http://pymotw.com/2/subprocess/
 # http://stackoverflow.com/questions/3777301/how-to-call-a-shell-script-from-python-code
 
+def getdims(inputfile):
+    """Return [XLEN, YLEN, ZLEN] read from inputfile; each defaults to 1 if absent."""
+    dims = [1, 1, 1]
+    pattern = re.compile(r'^\s*([\w]+)\s*=\s*([\w]+)')
+    f = open(inputfile)
+    for line in f:
+        # only lines that start with "NAME = value" are considered;
+        # anything else (comments, blank lines) does not match
+        match = re.search(pattern, line)
+        if match:
+            var = match.group(1)
+            val = match.group(2)
+            if var == 'XLEN':
+                dims[0]=int(val)
+            elif var == 'YLEN':
+                dims[1] = int(val)
+            elif var == 'ZLEN':
+                dims[2] = int(val)
+    f.close()  # must precede the return, otherwise it is never reached
+    return dims
+
+def lineno():
+    """Returns the current line number in our program."""
+    return inspect.currentframe().f_back.f_lineno
+
+def issue_command(command):
+  if(show):
+    print ' '.join(command)
+  else:
+    print '+', ' '.join(command)
+    subprocess.call(command);
+
+def issue_shell_command(command):
+  if(show):
+    print command
+  else:
+    print '+', command
+    os.system(command)
+
+def construct_run_command(args):
+    """Build the argv list (mpirun, its options, iPic3D, input file) for 'ipic run'."""
+    # convert from deque to list for getopts
+    args = list(args)
+
+    # set default values
+    num_max_threads = 1
+    output = 'data'
+    inputfile = 'src/inputfiles/GEM.inp'
+    hostname = ''
+    mpirun = 'mpiexec'
+    global system
+    if system == 'xeon':
+        mpirun = 'mpiexec.hydra' # is this line needed?
+        num_max_threads = 4
+    elif system == 'mic':
+        mpirun = 'mpiexec.hydra'
+        # this should be user configurable
+        num_max_threads = 50
+        hostname = socket.gethostname()
+        micnum = 0
+        hostname = hostname + '-mic' + str(micnum)
+
+    try:
+      opts, args = getopt.getopt(args, 'i:o:s:t:h:', \
+        ['input=', 'output=', 'system=', 'threads=', 'host='])
+    except getopt.GetoptError, e:
+      if e.opt == 'h' and 'requires argument' in e.msg:
+        print 'ERROR: -h requires host name'
+      elif e.opt == 'i' and 'requires argument' in e.msg:
+        print 'ERROR: -i requires input filename'
+      elif e.opt == 'o' and 'requires argument' in e.msg:
+        print 'ERROR: -o requires directory name'
+      elif e.opt == 't' and 'requires argument' in e.msg:
+        print 'ERROR: -t requires max number of threads'
+      elif e.opt == 's' and 'requires argument' in e.msg:
+        print 'ERROR: -s requires system name (e.g. "mic" or "xeon")'
+      else:
+        usage()
+      sys.exit(-1)  # opts is unbound after a parse error, so every branch must exit
+
+    for o, a in opts:
+        if o in ("-h", "--host"):
+          hostname = a
+        elif o in ("-i", "--input"):
+          inputfile = a
+        elif o in ("-o", "--output"):
+          output = a
+          print 'ERROR: -o is not yet supported'
+          sys.exit(1)
+        elif o in ("-t", "--threads"):
+          num_max_threads = int(a)
+        elif o in ("-s", "--system"):
+          system = a
+        #else:
+        #  assert False, "unhandled option"
+
+    if len(args)!=0:
+      usage();
+      sys.exit(-1)  # stray positional arguments are an error, do not run anyway
+    # determine num_procs
+    dims = getdims(inputfile)
+    XLEN = dims[0]
+    YLEN = dims[1]
+    ZLEN = dims[2]
+    num_procs = XLEN*YLEN*ZLEN
+    # num_procs = 4
+
+    arguments = ['exec/iPic3D', inputfile];
+    options = ['-n', str(num_procs)]
+    if hostname!="":
+        options.extend(['-host', hostname])
+
+    if num_max_threads > 1:
+        omp_string = 'OMP_NUM_THREADS=' + str(num_max_threads)
+        omp = ['-env', omp_string]
+        options.extend(omp)
+
+    command = [mpirun]
+    command.extend(options)
+    command.extend(arguments)
+    return command
+
+def ipic_run(args):
+    command = construct_run_command(args);
+    issue_command(command)
+
+def ipic_show_run(args):
+    command = construct_run_command(args);
+    print ' '.join(command);
+
+def ipic_make_data():
+    # create data subdirectory
+    create_data_command = '''mkdir -p data''';
+    issue_shell_command(create_data_command)
+
+def ipic_cmake(args):
+    """Link 'src' to the source directory, create 'data', then run cmake on 'src'."""
+    # make src a link to the code
+    numargs = len(args)
+    if numargs==0:
+      sourcedir = '..'
+    elif numargs==1:
+      sourcedir = deque.popleft(args)
+    else:
+      usage()
+      sys.exit()
+
+    if sourcedir!='src':
+      # each argv element must be separate: 'rm -f' as one element names a program "rm -f"
+      rm_command = ['rm', '-f', 'src'];
+      issue_command(rm_command);
+      ln_command = ['ln', '-s', str(sourcedir), 'src'];
+      issue_command(ln_command)
+    ipic_make_data();
+    # invoke cmake 
+    cmake_command = ['cmake'];
+    if system == 'general':
+      pass  # no extra cmake arguments for the default system
+    elif system == 'mic':
+      cmake_command.extend(['-DCMAKE_TOOLCHAIN_FILE=src/cmake/cmake_template.cmake.XeonPhi'])
+    else:
+        print "--system", system, "is not supported"
+        sys.exit(-1)
+    # issue the command
+    cmake_command.extend(['src'])
+    issue_command(cmake_command)
+
 def ipic_ctags(args):
     # create tags file using ctags
     create_tags_command = \
         '''find . -name '*.cpp' -or -name '*.h' | grep -v unused | xargs ctags --extra=+qf'''
-    print create_tags_command
-    os.system(create_tags_command)
+    issue_shell_command(create_tags_command)
     # sort tags file
     sort_tags_command = '''LC_ALL=C sort -u tags -o tags'''
-    print sort_tags_command
-    os.system(sort_tags_command)
+    issue_shell_command(sort_tags_command)
 
-def ipic_help():
+def ipic_show(args):
+    if len(args) == 0:
+      ipic_help_show(args)
+      sys.exit()
+    
+    command = deque.popleft(args)
+    if command == "run":
+      ipic_show_run(args)
+    #elif command == "cmake":
+    #  ipic_show_cmake(args)
+    #elif command == "ctags":
+    #  ipic_show_ctags(args)
+    else:
+        print "ipic show", command, "is not supported"
+        sys.exit(-1)
+
+def ipic_basic_help():
     print '''
-  To build, in the iPic3D directory you can use:
+  To build, you can use:
   
-    rm -rf build # if necessary
     mkdir build
     cd build
-    cmake ..
+   ''', progname, '''cmake /path/to/ipic3d
     make # or "make -j" to compile in parallel
   
-  To run the code you can use
-  
-    mkdir data
-    mpiexec -n 4 exec/iPic3D ../inputfiles/GEM.inp
+  Then to run the code, use:
   
-  where 4 = XLEN times YLEN times ZLEN (defined in GEM.inp).
+    ipic run
+
+  If you prefer, use e.g. "ipic show run" to see the shell commands
+  that will be executed and then execute them directly yourself.
   
   Available subcommands:
 
-    ''', progname, '''help ctags
-    ''', progname, '''help mic
-    ''', progname, '''help deep
+    ''', progname, '''help show    # show what a command would do
+    ''', progname, '''help run     # execute iPic3D
+    ''', progname, '''help cmake   # execute cmake and create subdirectories
+    ''', progname, '''help ctags   # create ctags file to navigate code
+    ''', progname, '''help mic     # help for running on mic
+    ''', progname, '''help deep    # help for running on deep
   '''
 
+def ipic_help_show(args):
+    print '''
+  ''', progname, '''show [command]
+
+    show the shell command that would be executed by
+      ipic [command]
+    '''
+
+def ipic_help_run(args):
+    print '''
+ ''', progname, '''[-s <mic|xeon>] run [options]
+
+    run iPic3D with appropriate arguments.
+
+    options:
+    -t <num_max_threads>: set maximum number of threads
+       (default is 1 unless -s <mic|xeon> is set)
+    -i <inputfile>: set input file (default is "src/inputfiles/GEM.inp")
+    -o <outputdir>: set output directory (default is "data")
+    -h <host>: spawn processes on specified host
+    '''
+
 def ipic_help_mic(args):
     print '''
   See "ipic help".  Modifications are as follows.
 
-  To run on the Xeon host processor, use something like:
+  On the Xeon host processor, use:
   
-    mpiexec.hydra -n 8 -env OMP_NUM_THREADS=4 exec/iPic3D ../inputfiles/GEM.inp
+    ipic -s xeon [command]
   
-  where 8 = XLEN times YLEN times ZLEN.
-  
-  If you want to cross-compile for the MIC, then the instructions are
-  different:
-  
-      mkdir build.phi
-      cd build.phi
-      cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/cmake_template.cmake.XeonPhi
-      make -j
-  
-  And to run you use, e.g.:
-  
-    mkdir data
-    mpiexec.hydra -host knc2-mic0 -n 50 -env OMP_NUM_THREADS=4 exec/iPic3D ../inputfiles/GEM.inp
-  
-  where 50 = XLEN times YLEN times ZLEN.
+  On the MIC, use
+
+    ipic -s mic [command]
 
+  To show what a command will do, use e.g.:
+
+    ipic show -s mic [command]
+  
   See also:
     ''', progname, '''help deep
     '''
@@ -89,6 +289,14 @@ def ipic_help_deep(args):
     ''', progname, '''help mic
     '''
 
+def ipic_help_cmake(args):
+    print '''
+  ''', progname, '''[-s mic] cmake [sourcedir]
+
+     [sourcedir]: the source code directory; by default ".."
+     [-s mic]: cross-compile for the mic system
+  '''
+
 def ipic_help_ctags(args):
     print '''
   Make sure that you are in the source code directory
@@ -164,16 +372,22 @@ def ipic_help_git(args):
     undo-commit = reset --soft HEAD~1
     '''
 
-def help(args):
+def ipic_help(args):
     if len(args) == 0:
-      ipic_help()
+      ipic_basic_help()
       sys.exit()
     
     command = deque.popleft(args)
-    if command == "mic":
+    if command == "show":
+      ipic_help_show(args)
+    elif command == "run":
+      ipic_help_run(args)
+    elif command == "mic":
       ipic_help_mic(args)
     elif command == "deep":
       ipic_help_deep(args)
+    elif command == "cmake":
+      ipic_help_cmake(args)
     elif command == "ctags":
       ipic_help_ctags(args)
     elif command == "git":
@@ -183,20 +397,23 @@ def help(args):
         sys.exit(-1)
 
 def usage():
+    theline = inspect.currentframe().f_back.f_lineno
+    print '  usage() called from ipic.py line ', str(theline)
+
     print '''
-  usage: ''', progname, ''' [options] <command>
+  usage: ''', progname, ''' [show] <command>
 
   Available commands:
-    ''', progname, '''ctags
     ''', progname, '''help
+    ''', progname, '''show
+    ''', progname, '''cmake
+    ''', progname, '''ctags
       '''
 
-def main():
+def ipic_command(argv1):
 
-    global progname
-    progname = os.path.basename(sys.argv[0])
-    global dirname
-    dirname = os.path.dirname(sys.argv[0])
+    global system
+    system = 'general'
 
     # it might be better to use the argparse module rather than getopt,
     # but unfortunately argparse is only available beginning with python 2.7
@@ -206,10 +423,10 @@ def main():
     # before giving up on backward compatibility.
     #
     try:
-      opts, args = getopt.getopt(sys.argv[1:], 'ho:', ['help', 'output='])
+      opts, args = getopt.getopt(argv1, 'hs:', ['help', 'system='])
     except getopt.GetoptError, e:
-      if e.opt == 'o' and 'requires argument' in e.msg:
-        print 'ERROR: -o requires filename'
+      if e.opt == 's' and 'requires argument' in e.msg:
+        print 'ERROR: -s requires system name (e.g. "mic" or "xeon")'
       else:
         usage()
         sys.exit(-1)
@@ -218,8 +435,8 @@ def main():
         if o in ("-h", "--help"):
           usage()
           sys.exit()
-        elif o in ("-o", "--output"):
-          output = a
+        elif o in ("-s", "--system"):
+          system = a
         #else:
         #  assert False, "unhandled option"
 
@@ -234,17 +451,41 @@ def main():
     #print list(args)
 
     if command == "help":
-        help(args)
+        ipic_help(args)
+    # elif command == "show":
+    #     ipic_show(args)
     elif command == "ctags":
         ipic_ctags(args)
-        #print "ctags not yet implemented"
+    elif command == "cmake":
+        ipic_cmake(args)
+    elif command == "run":
+        ipic_run(args)
     else:
-        print progname, command, "not supported"
+        print progname, command, "is not supported"
         sys.exit(-1)
 
     #print os.path.basename(__file__)
     #print os.path.dirname(__file__)
 
+def main():
+    """Set the script-wide globals, strip an optional leading 'show', and dispatch the command."""
+    global progname
+    progname = os.path.basename(sys.argv[0])
+    global dirname
+    dirname = os.path.dirname(sys.argv[0])
+    global show
+    show=0
+
+    argv1 = sys.argv[1:]
+    if len(argv1)==0:
+      usage()
+      sys.exit(-1) # usage() only prints; without this, argv1[0] below raises IndexError
+    if argv1[0]=='show':
+      show=1
+      argv1=argv1[1:]
+
+    ipic_command(argv1)
+
 if __name__ == '__main__':
     main()
 

From 9f0425123547fd6c19a7934e76fce65ad979c82d Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 30 Jan 2014 14:52:32 +0100
Subject: [PATCH 095/118] ipic run: calculate number of threads based on number
 of processes

---
 scripts/ipic.py | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/scripts/ipic.py b/scripts/ipic.py
index 069ed895..92b2bffc 100755
--- a/scripts/ipic.py
+++ b/scripts/ipic.py
@@ -62,23 +62,46 @@ def construct_run_command(args):
     args = list(args)
 
     # set default values
+    num_nodes = 1
     num_max_threads = 1
+    num_threads_per_node = 1
     output = 'data'
     inputfile = 'src/inputfiles/GEM.inp'
     hostname = ''
     mpirun = 'mpiexec'
     global system
-    if system == 'xeon':
+    if system == 'xeon' or system == 'mic':
+      if system == 'xeon':
         mpirun = 'mpiexec.hydra' # is this line needed?
-        num_max_threads = 4
-    elif system == 'mic':
+        # calculate number of threads per process
+        # - should extract this stuff from /proc/cpuinfo
+        num_nodes = 1
+        num_processors_per_node = 2
+        num_cores_per_processor = 8
+        num_threads_per_core = 2
+        num_threads_per_node = (
+          num_threads_per_core *
+          num_cores_per_processor *
+          num_processors_per_node)
+      elif system == 'mic':
         mpirun = 'mpiexec.hydra'
-        # this should be user configurable
-        num_max_threads = 50
+        # calculate number of threads per process
+        # - could use ssh to extract this stuff from /proc/cpuinfo
+        #   on the machine we will run on
+        num_nodes = 1
+        num_processors_per_node = 1
+        num_cores_per_processor = 57 # 57 on knc2, 60 on knc1
+        num_threads_per_core = 4
+        num_threads_per_node = (
+          num_threads_per_core *
+          num_cores_per_processor *
+          num_processors_per_node)
+        #
         hostname = socket.gethostname()
         micnum = 0
         hostname = hostname + '-mic' + str(micnum)
 
+    num_threads_is_given_by_user = 0
     try:
       opts, args = getopt.getopt(args, 'i:o:s:t:h:', \
         ['input=', 'output=', 'system=', 'threads=', 'host='])
@@ -107,6 +130,7 @@ def construct_run_command(args):
           print 'ERROR: -o is not yet supported'
           sys.exit(1)
         elif o in ("-t", "--threads"):
+          num_threads_is_given_by_user = 1
           num_max_threads = int(a)
         elif o in ("-s", "--system"):
           system = a
@@ -122,7 +146,12 @@ def construct_run_command(args):
     YLEN = dims[1]
     ZLEN = dims[2]
     num_procs = XLEN*YLEN*ZLEN
-    # num_procs = 4
+    num_procs_per_node = num_procs/num_nodes
+    num_threads_per_proc = num_threads_per_node/num_procs_per_node
+
+    if not num_threads_is_given_by_user:
+      # round down, but never to zero (happens when processes outnumber hardware threads)
+      num_max_threads = max(1, int(num_threads_per_proc))
 
     arguments = ['exec/iPic3D', inputfile];
     options = ['-n', str(num_procs)]

From ed23f3e0c8076a42a498d1e62ec0538c4a2f08e2 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Thu, 23 Jan 2014 18:17:42 +0100
Subject: [PATCH 096/118] regularized Parameters options for mover_PC

---
 fields/EMfields3D.cpp         | 209 +++++++++++++++++++++++++++
 include/EMfields3D.h          |   1 +
 include/Parameters.h          |  19 ++-
 include/Particles3Dcomm.h     |  68 ++++++---
 include/iPic3D.h              |   2 +
 main/Parameters.cpp           |  39 +++--
 main/iPic3Dlib.cpp            |  51 +++----
 particles/Particles3D.cpp     |   8 ++
 particles/Particles3Dcomm.cpp | 258 +++++++++++++++++++++++++---------
 9 files changed, 532 insertions(+), 123 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 38f544c4..779c2948 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -1155,6 +1155,215 @@ void EMfields3D::sumMoments_vectorized(
   }
 }
 
+void EMfields3D::sumMoments_vectorized_AoS(
+  const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct)
+{
+  const double inv_dx = grid->get_invdx();
+  const double inv_dy = grid->get_invdy();
+  const double inv_dz = grid->get_invdz();
+  const int nxn = grid->getNXN();
+  const int nyn = grid->getNYN();
+  const int nzn = grid->getNZN();
+  const double xstart = grid->getXstart();
+  const double ystart = grid->getYstart();
+  const double zstart = grid->getZstart();
+  #pragma omp parallel
+  {
+  for (int species_idx = 0; species_idx < ns; species_idx++)
+  {
+    const Particles3Dcomm& pcls = part[species_idx];
+    assert_eq(pcls.get_particleType(), ParticleType::AoS);
+    const int is = pcls.get_ns();
+    assert_eq(species_idx,is);
+
+    const int nop = pcls.getNOP();
+    #pragma omp master
+    { timeTasks_begin_task(TimeTasks::MOMENT_ACCUMULATION); }
+    Moments10& speciesMoments10 = fetch_moments10Array(0);
+    arr4_double moments = speciesMoments10.fetch_arr();
+    //
+    // moments.setmode(ompmode::ompfor);
+    //moments.setall(0.);
+    double *moments1d = &moments[0][0][0][0];
+    int moments1dsize = moments.get_size();
+    #pragma omp for // because shared
+    for(int i=0; i<moments1dsize; i++) moments1d[i]=0;
+    
+    // prevent threads from writing to the same location
+    for(int cxmod2=0; cxmod2<2; cxmod2++)
+    for(int cymod2=0; cymod2<2; cymod2++)
+    // each mesh cell is handled by its own thread
+    #pragma omp for collapse(2)
+    for(int cx=cxmod2;cx<nxc;cx+=2)
+    for(int cy=cymod2;cy<nyc;cy+=2)
+    for(int cz=0;cz<nzc;cz++)
+    {
+     //dprint(cz);
+     // index of interface to right of cell
+     const int ix = cx + 1;
+     const int iy = cy + 1;
+     const int iz = cz + 1;
+     {
+      // reference the 8 nodes to which we will
+      // write moment data for particles in this mesh cell.
+      //
+      arr1_double_fetch momentsArray[8];
+      arr2_double_fetch moments00 = moments[ix][iy];
+      arr2_double_fetch moments01 = moments[ix][cy];
+      arr2_double_fetch moments10 = moments[cx][iy];
+      arr2_double_fetch moments11 = moments[cx][cy];
+      momentsArray[0] = moments00[iz]; // moments000 
+      momentsArray[1] = moments00[cz]; // moments001 
+      momentsArray[2] = moments01[iz]; // moments010 
+      momentsArray[3] = moments01[cz]; // moments011 
+      momentsArray[4] = moments10[iz]; // moments100 
+      momentsArray[5] = moments10[cz]; // moments101 
+      momentsArray[6] = moments11[iz]; // moments110 
+      momentsArray[7] = moments11[cz]; // moments111 
+
+      const int numpcls_in_cell = pcls.get_numpcls_in_bucket(cx,cy,cz);
+      const int bucket_offset = pcls.get_bucket_offset(cx,cy,cz);
+      const int bucket_end = bucket_offset+numpcls_in_cell;
+
+      // data is not stride-1, so we do *not* use
+      // #pragma simd
+      {
+        // accumulators for moments per each of 8 threads
+        double momentsAcc[8][10];
+        memset(momentsAcc,0,sizeof(double)*8*10);
+        for(int pidx=bucket_offset; pidx<bucket_end; pidx++)
+        {
+          const SpeciesParticle& pcl = pcls.get_pcl(pidx);
+          // This depends on the fact that the memory
+          // occupied by a particle coincides with
+          // the alignment interval (64 bytes)
+          ALIGNED(&pcl);
+          double velmoments[10];
+          double weights[8];
+          // compute the quadratic moments of velocity
+          //
+          const double ui=pcl.get_u();
+          const double vi=pcl.get_v();
+          const double wi=pcl.get_w();
+          const double uui=ui*ui;
+          const double uvi=ui*vi;
+          const double uwi=ui*wi;
+          const double vvi=vi*vi;
+          const double vwi=vi*wi;
+          const double wwi=wi*wi;
+          //double velmoments[10];
+          velmoments[0] = 1.;
+          velmoments[1] = ui;
+          velmoments[2] = vi;
+          velmoments[3] = wi;
+          velmoments[4] = uui;
+          velmoments[5] = uvi;
+          velmoments[6] = uwi;
+          velmoments[7] = vvi;
+          velmoments[8] = vwi;
+          velmoments[9] = wwi;
+        
+          // compute the weights to distribute the moments
+          //
+          //double weights[8];
+          const double abs_xpos = pcl.get_x();
+          const double abs_ypos = pcl.get_y();
+          const double abs_zpos = pcl.get_z();
+          const double rel_xpos = abs_xpos - xstart;
+          const double rel_ypos = abs_ypos - ystart;
+          const double rel_zpos = abs_zpos - zstart;
+          const double cxm1_pos = rel_xpos * inv_dx;
+          const double cym1_pos = rel_ypos * inv_dy;
+          const double czm1_pos = rel_zpos * inv_dz;
+          //if(true)
+          //{
+          //  const int cx_inf = int(floor(cxm1_pos));
+          //  const int cy_inf = int(floor(cym1_pos));
+          //  const int cz_inf = int(floor(czm1_pos));
+          //  assert_eq(cx-1,cx_inf);
+          //  assert_eq(cy-1,cy_inf);
+          //  assert_eq(cz-1,cz_inf);
+          //}
+          // fraction of the distance from the right of the cell
+          const double w1x = cx - cxm1_pos;
+          const double w1y = cy - cym1_pos;
+          const double w1z = cz - czm1_pos;
+          // fraction of distance from the left
+          const double w0x = 1-w1x;
+          const double w0y = 1-w1y;
+          const double w0z = 1-w1z;
+          // we are calculating a charge moment.
+          const double qi=pcl.get_q();
+          const double weight0 = qi*w0x;
+          const double weight1 = qi*w1x;
+          const double weight00 = weight0*w0y;
+          const double weight01 = weight0*w1y;
+          const double weight10 = weight1*w0y;
+          const double weight11 = weight1*w1y;
+          weights[0] = weight00*w0z; // weight000
+          weights[1] = weight00*w1z; // weight001
+          weights[2] = weight01*w0z; // weight010
+          weights[3] = weight01*w1z; // weight011
+          weights[4] = weight10*w0z; // weight100
+          weights[5] = weight10*w1z; // weight101
+          weights[6] = weight11*w0z; // weight110
+          weights[7] = weight11*w1z; // weight111
+        
+          // add moments for this particle
+          {
+            // which is the superior order for the following loop?
+            for(int c=0; c<8; c++)
+            for(int m=0; m<10; m++)
+            {
+              momentsAcc[c][m] += velmoments[m]*weights[c];
+            }
+          }
+        }
+        for(int c=0; c<8; c++)
+        for(int m=0; m<10; m++)
+        {
+          momentsArray[c][m] += momentsAcc[c][m];
+        }
+      }
+     }
+    }
+    #pragma omp master
+    { timeTasks_end_task(TimeTasks::MOMENT_ACCUMULATION); }
+
+    // reduction
+    #pragma omp master
+    { timeTasks_begin_task(TimeTasks::MOMENT_REDUCTION); }
+    {
+      #pragma omp for collapse(2)
+      for(int i=0;i<nxn;i++){
+      for(int j=0;j<nyn;j++){
+      for(int k=0;k<nzn;k++)
+      {
+        rhons[is][i][j][k] = invVOL*moments[i][j][k][0];
+        Jxs  [is][i][j][k] = invVOL*moments[i][j][k][1];
+        Jys  [is][i][j][k] = invVOL*moments[i][j][k][2];
+        Jzs  [is][i][j][k] = invVOL*moments[i][j][k][3];
+        pXXsn[is][i][j][k] = invVOL*moments[i][j][k][4];
+        pXYsn[is][i][j][k] = invVOL*moments[i][j][k][5];
+        pXZsn[is][i][j][k] = invVOL*moments[i][j][k][6];
+        pYYsn[is][i][j][k] = invVOL*moments[i][j][k][7];
+        pYZsn[is][i][j][k] = invVOL*moments[i][j][k][8];
+        pZZsn[is][i][j][k] = invVOL*moments[i][j][k][9];
+      }}}
+    }
+    #pragma omp master
+    { timeTasks_end_task(TimeTasks::MOMENT_REDUCTION); }
+    // uncomment this and remove the loop below
+    // when we change to use asynchronous communication.
+    // communicateGhostP2G(is, 0, 0, 0, 0, vct);
+  }
+  }
+  for (int i = 0; i < ns; i++)
+  {
+    communicateGhostP2G(i, 0, 0, 0, 0, vct);
+  }
+}
+
 /*! Calculate Electric field with the implicit solver: the Maxwell solver method is called here */
 void EMfields3D::calculateE(Grid * grid, VirtualTopology3D * vct, Collective *col) {
   if (vct->getCartesian_rank() == 0)
diff --git a/include/EMfields3D.h b/include/EMfields3D.h
index 32599b9f..eb261311 100644
--- a/include/EMfields3D.h
+++ b/include/EMfields3D.h
@@ -127,6 +127,7 @@ class EMfields3D                // :public Field
     void sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
     void sumMoments_AoS(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
     void sumMoments_vectorized(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
+    void sumMoments_vectorized_AoS(const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct);
     void sumMomentsOld(const Particles3Dcomm& pcls, Grid * grid, VirtualTopology3D * vct);
     /*! add accumulated moments to the moments for a given species */
     //void addToSpeciesMoments(const TenMoments & in, int is);
diff --git a/include/Parameters.h b/include/Parameters.h
index ba4e980d..84f6f00e 100644
--- a/include/Parameters.h
+++ b/include/Parameters.h
@@ -5,11 +5,26 @@
 //
 namespace Parameters
 {
+  enum MoverType
+  {
+    SoA=0,
+    AoS,
+    SoAvec_onesort,
+    AoSvec_onesort,
+    SoAvec_resort,
+    AoSvec_resort,
+  };
+
   void init_parameters();
 
+  bool get_USING_AOS();
+  bool get_SORTING_SOA();
   bool get_SORTING_PARTICLES();
+  // for resorting particles with each iteration of mover
+  bool get_RESORTING_PARTICLES();
+  inline bool get_USING_XAVG() { return get_RESORTING_PARTICLES(); }
   bool get_VECTORIZE_MOMENTS();
-  bool get_VECTORIZE_MOVER();
-  bool get_USING_XAVG();
+  //bool get_VECTORIZE_MOVER();
+  MoverType get_MOVER_TYPE();
 }
 #endif
diff --git a/include/Particles3Dcomm.h b/include/Particles3Dcomm.h
index 79e3fb0e..c1e72289 100644
--- a/include/Particles3Dcomm.h
+++ b/include/Particles3Dcomm.h
@@ -73,6 +73,7 @@ class Particles3Dcomm // :public Particles
 
   /*! sort particles for vectorized push (needs to be parallelized) */
   void sort_particles_serial(Grid * grid, VirtualTopology3D * vct);
+  void sort_particles_serial_AoS(Grid * grid, VirtualTopology3D * vct);
   /*! sort particles with respect to provided position data */
   void sort_particles_serial(
     pfloat *xpos, pfloat *ypos, pfloat *zpos,
@@ -118,10 +119,29 @@ class Particles3Dcomm // :public Particles
     assert_le(cz,nzc);
   }
 
+  // get accessors for optional arrays
+  //
+  SpeciesParticle *fetch_pcls(){ return _pcls; }
+  SpeciesParticle *fetch_pclstmp(){ return _pclstmp; }
+  double * fetch_xavg() { return _xavg; }
+  double * fetch_yavg() { return _yavg; }
+  double * fetch_zavg() { return _zavg; }
+  double * fetch_xtmp() { return _xtmp; }
+  double * fetch_ytmp() { return _ytmp; }
+  double * fetch_ztmp() { return _ztmp; }
+  double * fetch_utmp() { return _utmp; }
+  double * fetch_vtmp() { return _vtmp; }
+  double * fetch_wtmp() { return _wtmp; }
+  double * fetch_qtmp() { return _qtmp; }
+  double * fetch_xavgtmp() { return _xavgtmp; }
+  double * fetch_yavgtmp() { return _yavgtmp; }
+  double * fetch_zavgtmp() { return _zavgtmp; }
+  long long *fetch_ParticleIDtmp(){ return _ParticleIDtmp; }
+
   // inline get accessors
   //
   ParticleType::Type get_particleType()const { return particleType; }
-  const SpeciesParticle& get_pcl(int pidx)const{ return pcls[pidx]; }
+  const SpeciesParticle& get_pcl(int pidx)const{ return _pcls[pidx]; }
   double *getXall()  const { return (x); }
   double *getYall()  const { return (y); }
   double *getZall()  const { return (z); }
@@ -202,8 +222,6 @@ class Particles3Dcomm // :public Particles
   ParticleType::Type particleType;
   // particles data
   //
-  // AoS representation
-  SpeciesParticle *pcls;
   // SoA representation
   //
   /** Positions array - X component */
@@ -224,30 +242,34 @@ class Particles3Dcomm // :public Particles
   bool TrackParticleID;
   /** ParticleID */
   long long *ParticleID;
-  /** Average position data (used during particle push) **/
-  double *xavg;
-  double *yavg;
-  double *zavg;
+  //
+  // AoS representation
+  //
+  SpeciesParticle *_pcls;
 
   // structures for sorting particles
   //
-  // alternate storage for sorting particles
+  /** Average position data (used during particle push) **/
+  //
+  double *_xavg;
+  double *_yavg;
+  double *_zavg;
+  //
+  // alternate temporary storage for sorting particles
+  //
+  long long *_ParticleIDtmp;
+  double *_xtmp;
+  double *_ytmp;
+  double *_ztmp;
+  double *_utmp;
+  double *_vtmp;
+  double *_wtmp;
+  double *_qtmp;
+  SpeciesParticle *_pclstmp;
+  double *_xavgtmp;
+  double *_yavgtmp;
+  double *_zavgtmp;
   //
-  double *xtmp;
-  double *ytmp;
-  double *ztmp;
-  double *utmp;
-  double *vtmp;
-  double *wtmp;
-  double *qtmp;
-  long long *ParticleIDtmp;
-  double *xavgtmp;
-  double *yavgtmp;
-  double *zavgtmp;
-  //int *xcell;
-  //int *ycell;
-  //int *zcell;
-
   // references for buckets
   //
   array3_int* numpcls_in_bucket;
diff --git a/include/iPic3D.h b/include/iPic3D.h
index 6d7d1063..02e7e604 100644
--- a/include/iPic3D.h
+++ b/include/iPic3D.h
@@ -49,6 +49,8 @@ namespace iPic3D {
 
     void convertParticlesToSoA();
     void convertParticlesToAoS();
+  private:
+    void sortParticles();
 
   private:
     static MPIdata * mpi;
diff --git a/main/Parameters.cpp b/main/Parameters.cpp
index 00d99b7a..e220e387 100644
--- a/main/Parameters.cpp
+++ b/main/Parameters.cpp
@@ -2,17 +2,40 @@
 
 using namespace Parameters;
 
+//********** edit these parameters *********
+//
+bool Parameters::get_VECTORIZE_MOMENTS() { return false; }
+// options: SoA AoS SoAvec_onesort AoSvec_onesort SoAvec_resort AoSvec_resort
+Parameters::MoverType Parameters::get_MOVER_TYPE() { return SoA; }
+
+//********** derived parameters *********
+
 static bool SORTING_PARTICLES;
+static bool RESORTING_PARTICLES;
+static bool USING_AOS;
+static bool SORTING_SOA;
 
 void Parameters::init_parameters()
 {
-  SORTING_PARTICLES = get_VECTORIZE_MOMENTS() || get_VECTORIZE_MOVER();
+  RESORTING_PARTICLES = 
+       get_MOVER_TYPE()==SoAvec_resort
+    || get_MOVER_TYPE()==AoSvec_resort;
+  SORTING_PARTICLES = get_VECTORIZE_MOMENTS()
+    || get_MOVER_TYPE()==SoAvec_onesort
+    || get_MOVER_TYPE()==AoSvec_onesort
+    || get_MOVER_TYPE()==SoAvec_resort
+    || get_MOVER_TYPE()==AoSvec_resort;
+  USING_AOS =
+       get_MOVER_TYPE()==AoS
+    || get_MOVER_TYPE()==AoSvec_onesort
+    || get_MOVER_TYPE()==AoSvec_resort;
+  SORTING_SOA = get_VECTORIZE_MOMENTS()
+    || get_MOVER_TYPE()==SoAvec_onesort
+    || get_MOVER_TYPE()==SoAvec_resort;
 }
 
-//bool Parameters::get_SORTING_PARTICLES() { return SORTING_PARTICLES; }
-bool Parameters::get_SORTING_PARTICLES() { return true; }
-bool Parameters::get_VECTORIZE_MOMENTS() { return false; }
-bool Parameters::get_VECTORIZE_MOVER() { return false; }
-// this must also return true if we communicate particles per iteration
-//bool Parameters::get_USING_XAVG() { return get_VECTORIZE_MOVER(); }
-bool Parameters::get_USING_XAVG() { return get_SORTING_PARTICLES(); }
+bool Parameters::get_RESORTING_PARTICLES() { return RESORTING_PARTICLES; }
+bool Parameters::get_SORTING_PARTICLES() { return SORTING_PARTICLES; }
+bool Parameters::get_SORTING_SOA() { return SORTING_SOA; }
+bool Parameters::get_USING_AOS() { return USING_AOS; }
+//
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 6676b8e4..3f7115c7 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -176,36 +176,35 @@ int c_Solver::Init(int argc, char **argv) {
   return 0;
 }
 
-void c_Solver::CalculateMoments() {
+void c_Solver::sortParticles() {
+  timeTasks_begin_task(TimeTasks::MOMENT_PCL_SORTING);
+  for(int species_idx=0; species_idx<ns; species_idx++)
+    part[species_idx].sort_particles_serial(grid,vct);
+  timeTasks_end_task(TimeTasks::MOMENT_PCL_SORTING);
+}
 
-  convertParticlesToSoA();
+void c_Solver::CalculateMoments() {
 
   timeTasks_set_main_task(TimeTasks::MOMENTS);
 
   EMf->updateInfoFields(grid,vct,col);
 
-  if(Parameters::get_SORTING_PARTICLES())
-  {
-    // sort particles
-    //#pragma omp master
-    {
-      //dprint(omp_get_thread_num());
-      timeTasks_begin_task(TimeTasks::MOMENT_PCL_SORTING);
-      for(int species_idx=0; species_idx<ns; species_idx++)
-        part[species_idx].sort_particles_serial(grid,vct);
-      timeTasks_end_task(TimeTasks::MOMENT_PCL_SORTING);
-    }
-  }
-
   if(Parameters::get_VECTORIZE_MOMENTS())
   {
     // since particles are sorted,
     // we can vectorize interpolation of particles to grid
     convertParticlesToSoA();
+	sortParticles();
     EMf->sumMoments_vectorized(part, grid, vct);
+    //convertParticlesToAoS();
+	//sortParticles();
+    //EMf->sumMoments_vectorized_AoS(part, grid, vct);
   }
   else
   {
+    if(Parameters::get_SORTING_PARTICLES())
+	  sortParticles();
+
     EMf->setZeroPrimaryMoments();
     convertParticlesToSoA();
     EMf->sumMoments(part, grid, vct);
@@ -264,16 +263,20 @@ bool c_Solver::ParticlesMover() {
       //
       // should merely pass EMf->get_fieldForPcls() rather than EMf.
       // use the Predictor Corrector scheme to move particles
-      if(Parameters::get_VECTORIZE_MOVER())
-      {
-        part[i].mover_PC_vectorized(grid, vct, EMf);
-        //part[i].mover_PC_AoS_XeonVec(grid, vct, EMf);
-      }
-      else
+      switch(Parameters::get_MOVER_TYPE())
       {
-        part[i].mover_PC(grid, vct, EMf);
-        //part[i].mover_PC_AoS(grid, vct, EMf);
-        //part[i].mover_PC_AoS2(grid, vct, EMf);
+        case Parameters::SoA:
+          part[i].mover_PC(grid, vct, EMf);
+          break;
+        case Parameters::SoAvec_resort:
+          part[i].mover_PC_vectorized(grid, vct, EMf);
+          break;
+        case Parameters::AoS:
+          part[i].mover_PC_AoS(grid, vct, EMf);
+          //part[i].mover_PC_AoS2(grid, vct, EMf);
+          break;
+        default:
+          unsupported_value_error(Parameters::get_MOVER_TYPE());
       }
     }
     }
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 1d1f567b..3a4a2e80 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -460,6 +460,7 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
 void Particles3D::mover_PC_AoS2(Grid * grid, VirtualTopology3D * vct, Field * EMf)
 {
   convertParticlesToAoS();
+  SpeciesParticle * pcls = fetch_pcls();
   #pragma omp master
   if (vct->getCartesian_rank() == 0) {
     cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
@@ -611,6 +612,7 @@ void Particles3D::mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf
   }
   const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
 
+  SpeciesParticle * pcls = fetch_pcls();
   #pragma omp master
   { timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING); }
   const double dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
@@ -765,6 +767,9 @@ void Particles3D::mover_PC_vectorized(
   Grid * grid, VirtualTopology3D * vct, Field * EMf)
 {
   convertParticlesToSoA();
+  double* xavg = fetch_xavg();
+  double* yavg = fetch_yavg();
+  double* zavg = fetch_zavg();
   assert_eq(nxc,nxn-1);
   assert_eq(nyc,nyn-1);
   assert_eq(nzc,nzn-1);
@@ -840,6 +845,9 @@ void Particles3D::mover_PC_vectorized(
       ALIGNED(u);
       ALIGNED(v);
       ALIGNED(w);
+      ALIGNED(xavg);
+      ALIGNED(yavg);
+      ALIGNED(zavg);
       // This pragma help on Xeon but hurts on Xeon Phi.
       // On the Phi we could accelerate by processing two particles at a time.
       #pragma simd
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 4533266d..f5f49e63 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -58,7 +58,6 @@ Particles3Dcomm::Particles3Dcomm(){
 }
 /** deallocate particles */
 Particles3Dcomm::~Particles3Dcomm() {
-  delete[]pcls;
   delete[]x;
   delete[]y;
   delete[]z;
@@ -67,21 +66,26 @@ Particles3Dcomm::~Particles3Dcomm() {
   delete[]w;
   delete[]q;
   delete[]ParticleID;
-  delete[]xavg;
-  delete[]yavg;
-  delete[]zavg;
+  // AoS representation
+  delete[]_pcls;
+  // average position used in particle advance
+  delete[]_xavg;
+  delete[]_yavg;
+  delete[]_zavg;
   // deallocate alternate storage
-  delete[]xtmp;
-  delete[]ytmp;
-  delete[]ztmp;
-  delete[]utmp;
-  delete[]vtmp;
-  delete[]wtmp;
-  delete[]qtmp;
-  delete[]ParticleIDtmp;
-  delete[]xavgtmp;
-  delete[]yavgtmp;
-  delete[]zavgtmp;
+  delete[]_xtmp;
+  delete[]_ytmp;
+  delete[]_ztmp;
+  delete[]_utmp;
+  delete[]_vtmp;
+  delete[]_wtmp;
+  delete[]_qtmp;
+  delete[]_ParticleIDtmp;
+  delete[] _pclstmp;
+  // extra xavg for sort
+  delete[]_xavgtmp;
+  delete[]_yavgtmp;
+  delete[]_zavgtmp;
   // deallocate buffers
   delete[]b_X_RIGHT;
   delete[]b_X_LEFT;
@@ -190,11 +194,11 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   // intel new allocates with 64-byte alignment
   // since particles are 64 bytes wide, every particle
   // is aligned.
-  pcls = new SpeciesParticle[npmax];
+  _pcls = 0; // AoS storage is allocated later in allocate(), only if get_USING_AOS(); allocating here leaked
   particleType = ParticleType::SoA;
   #ifdef __INTEL_COMPILER
     assert_eq(sizeof(SpeciesParticle),64);
-    ALIGNED(pcls);
+    ALIGNED(_pcls);
   #endif
   //
   // SoA particle representation
@@ -210,50 +214,66 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   // charge
   q = new double[npmax];
   // average positions, used in iterative particle advance
-  xavg = 0;
-  yavg = 0;
-  zavg = 0;
+  _xavg = 0;
+  _yavg = 0;
+  _zavg = 0;
+  //if(Parameters::get_USING_XAVG())
+  //{
+  //  xavg = new double[npmax];
+  //  yavg = new double[npmax];
+  //  zavg = new double[npmax];
+  //}
+  _xtmp = 0;
+  _ytmp = 0;
+  _ztmp = 0;
+  _utmp = 0;
+  _vtmp = 0;
+  _wtmp = 0;
+  _qtmp = 0;
+  _xavgtmp = 0;
+  _yavgtmp = 0;
+  _zavgtmp = 0;
+  _ParticleIDtmp = 0; // cleared here, before the sort buffers are allocated, so the allocation below survives
+  _pcls = 0;
+  _pclstmp = 0;
+  // accessors for data that should be allocated only if needed
+  //
   if(Parameters::get_USING_XAVG())
   {
-    xavg = new double[npmax];
-    yavg = new double[npmax];
-    zavg = new double[npmax];
+    _xavg=new double[npmax];
+    _yavg=new double[npmax];
+    _zavg=new double[npmax];
   }
-  //
-  xtmp = 0;
-  ytmp = 0;
-  ztmp = 0;
-  utmp = 0;
-  vtmp = 0;
-  wtmp = 0;
-  qtmp = 0;
-  xavgtmp = 0;
-  yavgtmp = 0;
-  zavgtmp = 0;
-  if(Parameters::get_SORTING_PARTICLES())
+  if(Parameters::get_SORTING_SOA())
+  {
+    _xtmp=new double[npmax];
+    _ytmp=new double[npmax];
+    _ztmp=new double[npmax];
+    _utmp=new double[npmax];
+    _vtmp=new double[npmax];
+    _wtmp=new double[npmax];
+    _qtmp=new double[npmax];
+    _xavgtmp=new double[npmax];
+    _yavgtmp=new double[npmax];
+    _zavgtmp=new double[npmax];
+    if(TrackParticleID)
+    {
+      _ParticleIDtmp = new long long[npmax];
+    }
+  }
+  if(Parameters::get_USING_AOS())
   {
-    xtmp = new double[npmax];
-    ytmp = new double[npmax];
-    ztmp = new double[npmax];
-    // velocities
-    utmp = new double[npmax];
-    vtmp = new double[npmax];
-    wtmp = new double[npmax];
-    // charge
-    qtmp = new double[npmax];
-    // average positions, used in iterative particle advance
-    xavgtmp = new double[npmax];
-    yavgtmp = new double[npmax];
-    zavgtmp = new double[npmax];
+    assert_eq(sizeof(SpeciesParticle),64);
+    _pcls = AlignedAlloc(SpeciesParticle,npmax);
+    _pclstmp = AlignedAlloc(SpeciesParticle,npmax);
   }
 
   ParticleID = 0;
-  ParticleIDtmp = 0;
   // ID
   if (TrackParticleID) {
     ParticleID = new long long[npmax];
-    if(Parameters::get_SORTING_PARTICLES())
-      ParticleIDtmp = new long long[npmax];
+    //if(Parameters::get_SORTING_PARTICLES())
+    //  _ParticleIDtmp = new long long[npmax];
     BirthRank[0] = vct->getCartesian_rank();
     if (vct->getNprocs() > 1)
       BirthRank[1] = (int) ceil(log10((double) (vct->getNprocs())));  // Number of digits needed for # of process in ID
@@ -999,7 +1019,89 @@ void Particles3Dcomm::PrintNp(VirtualTopology3D * ptVCT)  const {
 
 void Particles3Dcomm::sort_particles_serial(Grid * grid, VirtualTopology3D * vct)
 {
-  sort_particles_serial(x,y,z, grid,vct);
+  switch(particleType)
+  {
+    case ParticleType::AoS:
+      sort_particles_serial_AoS(grid,vct);
+      break;
+    case ParticleType::SoA:
+      sort_particles_serial(x,y,z, grid,vct);
+      break;
+    default:
+      unsupported_value_error(particleType);
+  }
+}
+
+// need to sort and communicate particles after each iteration
+void Particles3Dcomm::sort_particles_serial_AoS(
+  Grid * grid, VirtualTopology3D * vct)
+{
+  SpeciesParticle* pcls = fetch_pcls();
+  SpeciesParticle* pclstmp = fetch_pclstmp();
+  {
+    numpcls_in_bucket->setall(0);
+    // iterate through particles and count where they will go
+    for (int pidx = 0; pidx < nop; pidx++)
+    {
+      const SpeciesParticle& pcl = get_pcl(pidx);
+      // get the cell indices of the particle
+      int cx,cy,cz;
+      get_safe_cell_for_pos(cx,cy,cz,pcl.get_x(),pcl.get_y(),pcl.get_z());
+
+      // increment the number of particles in bucket of this particle
+      (*numpcls_in_bucket)[cx][cy][cz]++;
+    }
+
+    // compute prefix sum to determine initial position
+    // of each bucket (could parallelize this)
+    //
+    int accpcls=0;
+    for(int cx=0;cx<nxc;cx++)
+    for(int cy=0;cy<nyc;cy++)
+    for(int cz=0;cz<nzc;cz++)
+    {
+      (*bucket_offset)[cx][cy][cz] = accpcls;
+      accpcls += (*numpcls_in_bucket)[cx][cy][cz];
+    }
+    assert_eq(accpcls,nop);
+
+    numpcls_in_bucket_now->setall(0);
+    // put the particles where they are supposed to go
+    for (int pidx = 0; pidx < nop; pidx++)
+    {
+      const SpeciesParticle& pcl = get_pcl(pidx);
+      // get the cell indices of the particle
+      int cx,cy,cz;
+      get_safe_cell_for_pos(cx,cy,cz,pcl.get_x(),pcl.get_y(),pcl.get_z());
+
+      // compute where the data should go
+      const int numpcls_now = (*numpcls_in_bucket_now)[cx][cy][cz]++;
+      const int outpidx = (*bucket_offset)[cx][cy][cz] + numpcls_now;
+      assert_lt(outpidx, nop);
+      assert_ge(outpidx, 0);
+      assert_lt(pidx, nop);
+      assert_ge(pidx, 0);
+
+      // copy particle data to new location
+      //
+      pclstmp[outpidx] = pcl;
+    }
+    // swap the tmp particle memory with the official particle memory
+    {
+      swap(_pclstmp,_pcls);
+    }
+
+    // check if the particles were sorted incorrectly
+    if(true)
+    {
+      for(int cx=0;cx<nxc;cx++)
+      for(int cy=0;cy<nyc;cy++)
+      for(int cz=0;cz<nzc;cz++)
+      {
+        assert_eq((*numpcls_in_bucket_now)[cx][cy][cz], (*numpcls_in_bucket)[cx][cy][cz]);
+      }
+    }
+  }
 }
 
 // need to sort and communicate particles after each iteration
@@ -1007,6 +1109,17 @@ void Particles3Dcomm::sort_particles_serial(
   double *xpos, double *ypos, double *zpos,
   Grid * grid, VirtualTopology3D * vct)
 {
+  double * xtmp = fetch_xtmp();
+  double * ytmp = fetch_ytmp();
+  double * ztmp = fetch_ztmp();
+  double * utmp = fetch_utmp();
+  double * vtmp = fetch_vtmp();
+  double * wtmp = fetch_wtmp();
+  double * qtmp = fetch_qtmp();
+  long long* ParticleIDtmp = 0;
+  if (TrackParticleID) ParticleIDtmp = fetch_ParticleIDtmp();
+
+  // sort the particles
   {
     numpcls_in_bucket->setall(0);
     // iterate through particles and count where they will go
@@ -1071,24 +1184,35 @@ void Particles3Dcomm::sort_particles_serial(
       wtmp[outpidx] = w[pidx];
       qtmp[outpidx] = q[pidx];
       if (TrackParticleID)
+      {
         ParticleIDtmp[outpidx] = ParticleID[pidx];
-      xavgtmp[outpidx] = xavg[pidx];
-      yavgtmp[outpidx] = yavg[pidx];
-      zavgtmp[outpidx] = zavg[pidx];
+      }
+      if(_xavg)
+      {
+        double* xavg = fetch_xavg();
+        double* yavg = fetch_yavg();
+        double* zavg = fetch_zavg();
+        double* xavgtmp = fetch_xavgtmp();
+        double* yavgtmp = fetch_yavgtmp();
+        double* zavgtmp = fetch_zavgtmp();
+        xavgtmp[outpidx] = xavg[pidx];
+        yavgtmp[outpidx] = yavg[pidx];
+        zavgtmp[outpidx] = zavg[pidx];
+      }
     }
     // swap the tmp particle memory with the official particle memory
     {
-      swap(xtmp,x);
-      swap(ytmp,y);
-      swap(ztmp,z);
-      swap(utmp,u);
-      swap(vtmp,v);
-      swap(wtmp,w);
-      swap(qtmp,q);
-      swap(ParticleIDtmp,ParticleID);
-      swap(xavgtmp,xavg);
-      swap(yavgtmp,yavg);
-      swap(zavgtmp,zavg);
+      swap(_xtmp,x);
+      swap(_ytmp,y);
+      swap(_ztmp,z);
+      swap(_utmp,u);
+      swap(_vtmp,v);
+      swap(_wtmp,w);
+      swap(_qtmp,q);
+      swap(_ParticleIDtmp,ParticleID);
+      swap(_xavgtmp,_xavg);
+      swap(_yavgtmp,_yavg);
+      swap(_zavgtmp,_zavg);
     }
 
     // check if the particles were sorted incorrectly
@@ -1239,6 +1363,7 @@ void Particles3Dcomm::copyParticlesToSoA()
 {
   timeTasks_set_task(TimeTasks::TRANSPOSE_PCLS_TO_SOA);
   dprintf("copying to struct of arrays");
+  SpeciesParticle const*const pcls = fetch_pcls();
   #pragma omp for
   for(int pidx=0; pidx<nop; pidx++)
   {
@@ -1256,6 +1381,7 @@ void Particles3Dcomm::copyParticlesToSoA()
 
 void Particles3Dcomm::copyParticlesToAoS()
 {
+  SpeciesParticle * pcls = fetch_pcls();
   timeTasks_set_task(TimeTasks::TRANSPOSE_PCLS_TO_AOS);
   dprintf("copying to array of structs");
   #pragma omp for

From e5a84d374a87cb60742560c766bfc6b3d2692fa6 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Sat, 25 Jan 2014 15:23:43 +0100
Subject: [PATCH 097/118] fixed AoS bugs and turned off sorting and
 vectorization in Parameters.cpp

---
 fields/EMfields3D.cpp         |  21 +--
 include/Parameters.h          |  13 +-
 include/Particle.h            |  68 +++----
 include/Particles3D.h         |   1 +
 include/Particles3Dcomm.h     |   6 +-
 include/ompdefs.h             |  22 +++
 main/Parameters.cpp           |  21 ++-
 main/iPic3Dlib.cpp            |  50 ++++--
 particles/Particles3D.cpp     | 322 ++++++++++++++++++++++++++--------
 particles/Particles3Dcomm.cpp | 285 ++++++++++++++++++++++++++----
 10 files changed, 613 insertions(+), 196 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 779c2948..8034f792 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -425,7 +425,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
     // The following loop is expensive, so it is wise to assume that the
     // compiler is stupid.  Therefore we should on the one hand
     // expand things out and on the other hand avoid repeating computations.
-    #pragma omp for nowait
+    #pragma omp for // used nowait with the old way
     for (int i = 0; i < nop; i++)
     {
       // compute the quadratic moments of velocity
@@ -1158,6 +1158,7 @@ void EMfields3D::sumMoments_vectorized(
 void EMfields3D::sumMoments_vectorized_AoS(
   const Particles3Dcomm* part, Grid * grid, VirtualTopology3D * vct)
 {
+  dprint("entering")
   const double inv_dx = grid->get_invdx();
   const double inv_dy = grid->get_invdy();
   const double inv_dz = grid->get_invdz();
@@ -1233,18 +1234,18 @@ void EMfields3D::sumMoments_vectorized_AoS(
         memset(momentsAcc,0,sizeof(double)*8*10);
         for(int pidx=bucket_offset; pidx<bucket_end; pidx++)
         {
-          const SpeciesParticle& pcl = pcls.get_pcl(pidx);
+          const SpeciesParticle* pcl = &pcls.get_pcl(pidx);
           // This depends on the fact that the memory
           // occupied by a particle coincides with
           // the alignment interval (64 bytes)
-          ALIGNED(&pcl);
+          ALIGNED(pcl);
           double velmoments[10];
           double weights[8];
           // compute the quadratic moments of velocity
           //
-          const double ui=pcl.get_u();
-          const double vi=pcl.get_v();
-          const double wi=pcl.get_w();
+          const double ui=pcl->get_u();
+          const double vi=pcl->get_v();
+          const double wi=pcl->get_w();
           const double uui=ui*ui;
           const double uvi=ui*vi;
           const double uwi=ui*wi;
@@ -1266,9 +1267,9 @@ void EMfields3D::sumMoments_vectorized_AoS(
           // compute the weights to distribute the moments
           //
           //double weights[8];
-          const double abs_xpos = pcl.get_x();
-          const double abs_ypos = pcl.get_y();
-          const double abs_zpos = pcl.get_z();
+          const double abs_xpos = pcl->get_x();
+          const double abs_ypos = pcl->get_y();
+          const double abs_zpos = pcl->get_z();
           const double rel_xpos = abs_xpos - xstart;
           const double rel_ypos = abs_ypos - ystart;
           const double rel_zpos = abs_zpos - zstart;
@@ -1293,7 +1294,7 @@ void EMfields3D::sumMoments_vectorized_AoS(
           const double w0y = 1-w1y;
           const double w0z = 1-w1z;
           // we are calculating a charge moment.
-          const double qi=pcl.get_q();
+          const double qi=pcl->get_q();
           const double weight0 = qi*w0x;
           const double weight1 = qi*w1x;
           const double weight00 = weight0*w0y;
diff --git a/include/Parameters.h b/include/Parameters.h
index 84f6f00e..fac04e9e 100644
--- a/include/Parameters.h
+++ b/include/Parameters.h
@@ -5,10 +5,14 @@
 //
 namespace Parameters
 {
-  enum MoverType
+  enum Enum
   {
-    SoA=0,
-    AoS,
+    SoA=0, // struct of arrays
+    AoS, // array of structs
+    // for moments type
+    AoSvec,
+    SoAvec,
+    // for mover type
     SoAvec_onesort,
     AoSvec_onesort,
     SoAvec_resort,
@@ -25,6 +29,7 @@ namespace Parameters
   inline bool get_USING_XAVG() { return get_RESORTING_PARTICLES(); }
   bool get_VECTORIZE_MOMENTS();
   //bool get_VECTORIZE_MOVER();
-  MoverType get_MOVER_TYPE();
+  Enum get_MOVER_TYPE();
+  Enum get_MOMENTS_TYPE();
 }
 #endif
diff --git a/include/Particle.h b/include/Particle.h
index a17c5368..660ce9f4 100644
--- a/include/Particle.h
+++ b/include/Particle.h
@@ -61,55 +61,39 @@ class SpeciesParticle
 
 // intended to occupy 64 bytes
 //
-// to be used when sorting with every particle advance
-struct CellParticle
+// species particle for second-order-accuracy implicit advance
+class ISpcl // data members are declared before "public:" and are therefore private
 {
-  long long ID; // 8 bytes
-  int cx[3]; // mesh cell
-  float fx[3]; // mesh cell position (fraction)
+  long long ID; // read and written through get_ID/set_ID
+  double x[3]; // accessed per component through get_x(i)/set_x(i,.)
+  float hdx[3]; // xavg = x + hdx
   float u[3];
-  float fxavg[3]; // for implicit push
-  float q; // float m would be better for stitching to MHD for dusty plasma
-  float qom; // for dusty plasma
+  double q; // NOTE(review): 8+24+12+12+8 = 64 bytes assumes 8-byte long long/double and 4-byte float -- confirm on target
  public:
   // accessors
-  //
-  // read access
   long long get_ID()const{ return ID; }
-  float get_fx()const{ return fx[0]; }
-  float get_fy()const{ return fx[1]; }
-  float get_fz()const{ return fx[2]; }
-  float get_u()const{ return u[0]; }
-  float get_v()const{ return u[1]; }
-  float get_w()const{ return u[2]; }
-  float get_q()const{ return q; }
+  double get_x(int i)const{ return x[i]; } // i indexes the 3-element array and is not range-checked
+  double get_hdx(int i)const{ return hdx[i]; } // float storage is widened to double on return
+  float get_u(int i)const{ return u[i]; }
+  double get_q()const{ return q; }
   void set_ID(long long in){ ID=in; }
-  // write access
-  void set_u(float in){ u[0]=in; }
-  void set_v(float in){ u[1]=in; }
-  void set_w(float in){ u[2]=in; }
-
-  void init(const SpeciesParticle& pcl,
-    double cxstart[3], // starting position of cell coordinates
-    float dx_inv[3],
-    float _qom)
-  {
-    ID = pcl.get_ID();
-    // position in mesh coordinates
-    //
+  void set_x(int i, double in) { x[i] = in; }
+  void set_hdx(int i, float in) { hdx[i] = in; }
+  void set_u(int i, double in) { u[i] = in; } // the double argument is narrowed to the float storage of u
+  void set_q(double in) { q = in; }
+};
 
-    float xpos[3];
-    for(int i=0;i<3;i++)
-    {
-      float xpos = (pcl.get_x(i)-cxstart[i])*dx_inv[i];
-      float cxpos = floor(xpos);
-      cx[i] = int(cxpos);
-      fxavg[i] = fx[i] = cxpos - cx[i];
-      u[i] = pcl.get_u(i);
-    }
-    q = pcl.get_q();
-    qom = _qom;
-  }
+// intended to occupy 64 bytes
+// (8+24+12+12+4+4, assuming 8-byte long long/double and 4-byte float -- TODO confirm on target)
+// dust particle for second-order-accuracy implicit advance
+class IDpcl // NOTE(review): declared with 'class' and no public section, so no member is reachable from outside yet
+{
+  long long ID;
+  double x[3]; // could replace with cell index and float x
+  float hdx[3]; // xavg = x + hdx
+  float u[3];
+  float qom; // charge to mass ratio of particle
+  float m; // mass of particle
 };
 
 #endif
diff --git a/include/Particles3D.h b/include/Particles3D.h
index 05a8701a..58c8fb49 100644
--- a/include/Particles3D.h
+++ b/include/Particles3D.h
@@ -61,6 +61,7 @@ class Particles3D:public Particles3Dcomm {
     /** array-of-structs version of mover_PC */
     void mover_PC_AoS2(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     void mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf);
+    void mover_PC_AoS_vec(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** vectorized version of mover_PC **/
     void mover_PC_vectorized(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** communicate particle after moving them */
diff --git a/include/Particles3Dcomm.h b/include/Particles3Dcomm.h
index c1e72289..3299d243 100644
--- a/include/Particles3Dcomm.h
+++ b/include/Particles3Dcomm.h
@@ -72,12 +72,10 @@ class Particles3Dcomm // :public Particles
   void convertParticlesToSoA();
 
   /*! sort particles for vectorized push (needs to be parallelized) */
+  void sort_particles_serial_SoA_by_xavg(Grid * grid, VirtualTopology3D * vct);
   void sort_particles_serial(Grid * grid, VirtualTopology3D * vct);
   void sort_particles_serial_AoS(Grid * grid, VirtualTopology3D * vct);
-  /*! sort particles with respect to provided position data */
-  void sort_particles_serial(
-    pfloat *xpos, pfloat *ypos, pfloat *zpos,
-    Grid * grid, VirtualTopology3D * vct);
+  void sort_particles_serial_SoA(Grid * grid, VirtualTopology3D * vct);
   void get_safe_cell_for_pos(
     int& cx, int& cy, int& cz, 
     pfloat xpos, pfloat ypos, pfloat zpos)
diff --git a/include/ompdefs.h b/include/ompdefs.h
index 2c16779f..f3640c36 100644
--- a/include/ompdefs.h
+++ b/include/ompdefs.h
@@ -9,6 +9,28 @@
 #else
 inline int omp_get_thread_num() { return 0;}
 inline int omp_get_max_threads(){ return 1;}
+#define omp_set_num_threads(num_threads)
 #endif
 
+class Caller_to_SetMaxThreadsForScope{ // scope guard: sets the OpenMP thread count when constructed and restores the saved value when destroyed
+ int max_threads; // value of omp_get_max_threads() captured by the constructor
+ public:
+  Caller_to_SetMaxThreadsForScope(int i) // i: thread count to request while this object is alive
+  {
+    max_threads = omp_get_max_threads();
+    // the OpenMP routine is named omp_set_num_threads, although
+    // omp_set_max_threads would describe what it does more accurately
+    omp_set_num_threads(i);
+  }
+  ~Caller_to_SetMaxThreadsForScope() // NOTE(review): the class is implicitly copyable; a copy would run this restore a second time
+  {
+    // restore the original maximum number of threads
+    omp_set_num_threads(max_threads);
+  }
+};
+// declares a guard object with a fixed name, so it can be used once per scope; the expansion already ends in ';'
+#define set_max_threads_for_scope(num_threads) \
+  Caller_to_SetMaxThreadsForScope \
+  instanceOfCaller_to_SetMaxThreadsForScope(num_threads);
+
 #endif
diff --git a/main/Parameters.cpp b/main/Parameters.cpp
index e220e387..ca8ed480 100644
--- a/main/Parameters.cpp
+++ b/main/Parameters.cpp
@@ -5,9 +5,10 @@ using namespace Parameters;
 //********** edit these parameters *********
 //
 bool Parameters::get_VECTORIZE_MOMENTS() { return false; }
-// options: SoA AoS SoAvec_onesort AoSvec_onesort SoAvec_resort AoSvec_resort
-Parameters::MoverType Parameters::get_MOVER_TYPE() { return SoA; }
-
+// supported options: SoA AoS
+Parameters::Enum Parameters::get_MOMENTS_TYPE() { return SoA; }
+// supported options: SoA AoS AoSvec_onesort SoAvec_resort
+Parameters::Enum Parameters::get_MOVER_TYPE() { return SoA; }
 //********** derived parameters *********
 
 static bool SORTING_PARTICLES;
@@ -25,17 +26,23 @@ void Parameters::init_parameters()
     || get_MOVER_TYPE()==AoSvec_onesort
     || get_MOVER_TYPE()==SoAvec_resort
     || get_MOVER_TYPE()==AoSvec_resort;
-  USING_AOS =
-       get_MOVER_TYPE()==AoS
-    || get_MOVER_TYPE()==AoSvec_onesort
-    || get_MOVER_TYPE()==AoSvec_resort;
   SORTING_SOA = get_VECTORIZE_MOMENTS()
     || get_MOVER_TYPE()==SoAvec_onesort
     || get_MOVER_TYPE()==SoAvec_resort;
+  USING_AOS =
+       get_MOMENTS_TYPE()==AoS
+    || get_MOVER_TYPE()==AoS
+    || get_MOVER_TYPE()==AoSvec_onesort
+    || get_MOVER_TYPE()==AoSvec_resort;
 }
 
 bool Parameters::get_RESORTING_PARTICLES() { return RESORTING_PARTICLES; }
 bool Parameters::get_SORTING_PARTICLES() { return SORTING_PARTICLES; }
 bool Parameters::get_SORTING_SOA() { return SORTING_SOA; }
 bool Parameters::get_USING_AOS() { return USING_AOS; }
+
+//bool Parameters::get_RESORTING_PARTICLES() { return true; }
+//bool Parameters::get_SORTING_PARTICLES() { return true; }
+//bool Parameters::get_SORTING_SOA() { return true; }
+//bool Parameters::get_USING_AOS() { return true; }
 //
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 3f7115c7..9f60f114 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -191,25 +191,42 @@ void c_Solver::CalculateMoments() {
 
   if(Parameters::get_VECTORIZE_MOMENTS())
   {
-    // since particles are sorted,
-    // we can vectorize interpolation of particles to grid
-    convertParticlesToSoA();
-	sortParticles();
-    EMf->sumMoments_vectorized(part, grid, vct);
-    //convertParticlesToAoS();
-	//sortParticles();
-    //EMf->sumMoments_vectorized_AoS(part, grid, vct);
+    switch(Parameters::get_MOMENTS_TYPE())
+    {
+      case Parameters::SoA:
+        // since particles are sorted,
+        // we can vectorize interpolation of particles to grid
+        convertParticlesToSoA();
+        sortParticles();
+        EMf->sumMoments_vectorized(part, grid, vct);
+        break;
+      case Parameters::AoS:
+        convertParticlesToAoS();
+        sortParticles();
+        EMf->sumMoments_vectorized_AoS(part, grid, vct);
+        break;
+      default:
+        unsupported_value_error(Parameters::get_MOMENTS_TYPE());
+    }
   }
   else
   {
     if(Parameters::get_SORTING_PARTICLES())
-	  sortParticles();
-
-    EMf->setZeroPrimaryMoments();
-    convertParticlesToSoA();
-    EMf->sumMoments(part, grid, vct);
-    //convertParticlesToAoS();
-    //EMf->sumMoments_AoS(part, grid, vct);
+      sortParticles();
+    switch(Parameters::get_MOMENTS_TYPE())
+    {
+      case Parameters::SoA:
+        EMf->setZeroPrimaryMoments();
+        convertParticlesToSoA();
+        EMf->sumMoments(part, grid, vct);
+        break;
+      case Parameters::AoS:
+        convertParticlesToAoS();
+        EMf->sumMoments_AoS(part, grid, vct);
+        break;
+      default:
+        unsupported_value_error(Parameters::get_MOMENTS_TYPE());
+    }
   }
   //for (int i = 0; i < ns; i++)
   //{
@@ -275,6 +292,9 @@ bool c_Solver::ParticlesMover() {
           part[i].mover_PC_AoS(grid, vct, EMf);
           //part[i].mover_PC_AoS2(grid, vct, EMf);
           break;
+        case Parameters::AoSvec_onesort:
+          part[i].mover_PC_AoS_vec(grid, vct, EMf);
+          break;
         default:
           unsupported_value_error(Parameters::get_MOVER_TYPE());
       }
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 3a4a2e80..dd0af36b 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -317,34 +317,34 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
   if (vct->getCartesian_rank() == 0) {
     cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
   }
-  const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
+  const_arr4_double fieldForPcls = EMf->get_fieldForPcls();
 
   #pragma omp master
   { timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING); }
-  const pfloat dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
+  const double dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
   #pragma omp for schedule(static)
   // why does single precision make no difference in execution speed?
   //#pragma simd vectorlength(VECTOR_WIDTH)
   for (int pidx = 0; pidx < nop; pidx++) {
     // copy the particle
-    const pfloat xorig = x[pidx];
-    const pfloat yorig = y[pidx];
-    const pfloat zorig = z[pidx];
-    const pfloat uorig = u[pidx];
-    const pfloat vorig = v[pidx];
-    const pfloat worig = w[pidx];
-    pfloat xavg = xorig;
-    pfloat yavg = yorig;
-    pfloat zavg = zorig;
-    pfloat uavg;
-    pfloat vavg;
-    pfloat wavg;
+    const double xorig = x[pidx];
+    const double yorig = y[pidx];
+    const double zorig = z[pidx];
+    const double uorig = u[pidx];
+    const double vorig = v[pidx];
+    const double worig = w[pidx];
+    double xavg = xorig;
+    double yavg = yorig;
+    double zavg = zorig;
+    double uavg;
+    double vavg;
+    double wavg;
     // calculate the average velocity iteratively
     for (int innter = 0; innter < NiterMover; innter++) {
       // interpolation G-->P
-      const pfloat ixd = floor((xavg - xstart) * inv_dx);
-      const pfloat iyd = floor((yavg - ystart) * inv_dy);
-      const pfloat izd = floor((zavg - zstart) * inv_dz);
+      const double ixd = floor((xavg - xstart) * inv_dx);
+      const double iyd = floor((yavg - ystart) * inv_dy);
+      const double izd = floor((zavg - zstart) * inv_dz);
       // interface of index to right of cell
       int ix = 2 + int(ixd);
       int iy = 2 + int(iyd);
@@ -363,12 +363,12 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       //const int cy = iy - 1;
       //const int cz = iz - 1;
 
-      const pfloat xi0   = xavg - grid->get_pfloat_XN(ix-1);
-      const pfloat eta0  = yavg - grid->get_pfloat_YN(iy-1);
-      const pfloat zeta0 = zavg - grid->get_pfloat_ZN(iz-1);
-      const pfloat xi1   = grid->get_pfloat_XN(ix) - xavg;
-      const pfloat eta1  = grid->get_pfloat_YN(iy) - yavg;
-      const pfloat zeta1 = grid->get_pfloat_ZN(iz) - zavg;
+      const double xi0   = xavg - grid->getXN(ix-1);
+      const double eta0  = yavg - grid->getYN(iy-1);
+      const double zeta0 = zavg - grid->getZN(iz-1);
+      const double xi1   = grid->getXN(ix) - xavg;
+      const double eta1  = grid->getYN(iy) - yavg;
+      const double zeta1 = grid->getZN(iz) - zavg;
 
       pfloat Exl = 0.0;
       pfloat Eyl = 0.0;
@@ -404,7 +404,7 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       // creating these aliases seems to accelerate this method by about 30%
       // on the Xeon host, processor, suggesting deficiency in the optimizer.
       //
-      arr1_pfloat_get field_components[8];
+      arr1_double_get field_components[8];
       field_components[0] = fieldForPcls[ix  ][iy  ][iz  ]; // field000
       field_components[1] = fieldForPcls[ix  ][iy  ][iz-1]; // field001
       field_components[2] = fieldForPcls[ix  ][iy-1][iz  ]; // field010
@@ -551,7 +551,7 @@ void Particles3D::mover_PC_AoS2(Grid * grid, VirtualTopology3D * vct, Field * EM
       // creating these aliases seems to accelerate this method by about 30%
       // on the Xeon host, processor, suggesting deficiency in the optimizer.
       //
-      arr1_pfloat_get field_components[8];
+      arr1_double_get field_components[8];
       field_components[0] = fieldForPcls[ix  ][iy  ][iz  ]; // field000
       field_components[1] = fieldForPcls[ix  ][iy  ][iz-1]; // field001
       field_components[2] = fieldForPcls[ix  ][iy-1][iz  ]; // field010
@@ -603,6 +603,7 @@ void Particles3D::mover_PC_AoS2(Grid * grid, VirtualTopology3D * vct, Field * EM
   #pragma omp master
   { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
 }
+
 void Particles3D::mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf)
 {
   convertParticlesToAoS();
@@ -621,13 +622,14 @@ void Particles3D::mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf
   //#pragma simd vectorlength(VECTOR_WIDTH)
   for (int pidx = 0; pidx < nop; pidx++) {
     // copy the particle
-    SpeciesParticle& pcl = pcls[pidx];
-    const double xorig = pcl.get_x();
-    const double yorig = pcl.get_y();
-    const double zorig = pcl.get_z();
-    const double uorig = pcl.get_u();
-    const double vorig = pcl.get_v();
-    const double worig = pcl.get_w();
+    SpeciesParticle* pcl = &pcls[pidx];
+    ALIGNED(pcl);
+    const double xorig = pcl->get_x();
+    const double yorig = pcl->get_y();
+    const double zorig = pcl->get_z();
+    const double uorig = pcl->get_u();
+    const double vorig = pcl->get_v();
+    const double worig = pcl->get_w();
     double xavg = xorig;
     double yavg = yorig;
     double zavg = zorig;
@@ -709,7 +711,7 @@ void Particles3D::mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf
       // creating these aliases seems to accelerate this method by about 30%
       // on the Xeon host, processor, suggesting deficiency in the optimizer.
       //
-      arr1_pfloat_get field_components[8];
+      arr1_double_get field_components[8];
       field_components[0] = fieldForPcls[ix][iy][iz]; // field000
       field_components[1] = fieldForPcls[ix][iy][cz]; // field001
       field_components[2] = fieldForPcls[ix][cy][iz]; // field010
@@ -751,25 +753,189 @@ void Particles3D::mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf
       zavg = zorig + wavg * dto2;
     }                           // end of iteration
     // update the final position and velocity
-    pcl.set_x(xorig + uavg * dt);
-    pcl.set_y(yorig + vavg * dt);
-    pcl.set_z(zorig + wavg * dt);
-    pcl.set_u(2.0 * uavg - uorig);
-    pcl.set_v(2.0 * vavg - vorig);
-    pcl.set_w(2.0 * wavg - worig);
+    pcl->set_x(xorig + uavg * dt);
+    pcl->set_y(yorig + vavg * dt);
+    pcl->set_z(zorig + wavg * dt);
+    pcl->set_u(2.0 * uavg - uorig);
+    pcl->set_v(2.0 * vavg - vorig);
+    pcl->set_w(2.0 * wavg - worig);
   }                             // END OF ALL THE PARTICLES
   #pragma omp master
   { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
 }
 
+// Experimental array-of-structs mover that loops over mesh cells and pushes the particles
+// of each bucket.  It currently computes garbage, but its execution time suggests a bound
+// on performance.  For correct execution it would need to sort by xavg with each iteration,
+// like in mover_PC_vectorized.
+void Particles3D::mover_PC_AoS_vec(
+  Grid * grid, VirtualTopology3D * vct, Field * EMf)
+{
+  convertParticlesToAoS();
+  #pragma omp master
+  if (vct->getCartesian_rank() == 0) {
+    cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
+  }
+  const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
+  // NOTE(review): declared with the pfloat array type while the slices below use arr1_double_get; presumably pfloat is double -- confirm
+  SpeciesParticle * pcls = fetch_pcls();
+  #pragma omp master
+  { timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING); }
+  const double dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
+  // collapse(2) merges only the cx and cy loops into the work-shared iteration space; the cz loop runs inside each iteration
+  #pragma omp for collapse(2) // schedule(static)
+  for(int cx=0;cx<nxc;cx++)
+  for(int cy=0;cy<nyc;cy++)
+  for(int cz=0;cz<nzc;cz++)
+  //for(int cell=0; cell<ncells; cell++)
+  {
+    // interface to the right of cell
+    const int ix = cx+1;
+    const int iy = cy+1;
+    const int iz = cz+1;
+    // field values at the eight (cx|ix, cy|iy, cz|iz) index combinations, fetched once per cell
+    arr1_double_get field_components[8];
+    field_components[0] = fieldForPcls[ix][iy][iz]; // field000
+    field_components[1] = fieldForPcls[ix][iy][cz]; // field001
+    field_components[2] = fieldForPcls[ix][cy][iz]; // field010
+    field_components[3] = fieldForPcls[ix][cy][cz]; // field011
+    field_components[4] = fieldForPcls[cx][iy][iz]; // field100
+    field_components[5] = fieldForPcls[cx][iy][cz]; // field101
+    field_components[6] = fieldForPcls[cx][cy][iz]; // field110
+    field_components[7] = fieldForPcls[cx][cy][cz]; // field111
+    // the particles of this cell occupy the index range [bucket_offset, bucket_end) of pcls
+    // push all particles in mesh cell
+    //
+    //const int numpcls_in_cell = numpcls_in_bucket_1d[cell];
+    const int numpcls_in_cell = get_numpcls_in_bucket(cx,cy,cz);
+    const int bucket_offset = get_bucket_offset(cx,cy,cz);
+    const int bucket_end = bucket_offset+numpcls_in_cell;
+    for(int pidx=bucket_offset; pidx<bucket_end; pidx++)
+    {
+      SpeciesParticle* pcl = &pcls[pidx];
+      ALIGNED(pcl);
+      // copy the particle
+      const pfloat xorig = pcl->get_x();
+      const pfloat yorig = pcl->get_y();
+      const pfloat zorig = pcl->get_z();
+      const pfloat uorig = pcl->get_u();
+      const pfloat vorig = pcl->get_v();
+      const pfloat worig = pcl->get_w();
+      double xavg = xorig;
+      double yavg = yorig;
+      double zavg = zorig;
+      double uavg; // uavg, vavg, wavg are assigned only inside the loop below,
+      double vavg; // so they stay uninitialized if NiterMover < 1
+      double wavg;
+      // calculate the average velocity iteratively
+      for (int innter = 0; innter < NiterMover; innter++) {
+        // the weights follow the current xavg, but field_components still belongs to the outer cell
+        // compute weights for field components
+        //
+        double weights[8];
+        // xstart marks start of domain excluding ghosts
+        const double rel_xpos = xavg - xstart;
+        const double rel_ypos = yavg - ystart;
+        const double rel_zpos = zavg - zstart;
+        // cell position minus 1 (due to ghost cells)
+        const double cxm1_pos = rel_xpos * inv_dx;
+        const double cym1_pos = rel_ypos * inv_dy;
+        const double czm1_pos = rel_zpos * inv_dz;
+        // these locals shadow the cx,cy,cz of the enclosing cell loops
+        int cx = 1 + int(floor(cxm1_pos));
+        int cy = 1 + int(floor(cym1_pos));
+        int cz = 1 + int(floor(czm1_pos));
+
+        // if the cell is outside the domain, then treat it as
+        // in the nearest ghost cell.
+        //
+        if (cx < 0) cx = 0;
+        if (cy < 0) cy = 0;
+        if (cz < 0) cz = 0;
+        // number of cells in x direction including ghosts is nxc
+        if (cx >= nxc) cx = nxc-1;
+        if (cy >= nyc) cy = nyc-1;
+        if (cz >= nzc) cz = nzc-1;
+
+        // index of interface to right of cell
+        const int ix = cx + 1;
+        const int iy = cy + 1;
+        const int iz = cz + 1;
+        // (the ix,iy,iz computed just above are not used in the rest of this iteration)
+        // fraction of the distance from the right of the cell
+        const double w1x = cx - cxm1_pos;
+        const double w1y = cy - cym1_pos;
+        const double w1z = cz - czm1_pos;
+        // fraction of distance from the left
+        const double w0x = 1-w1x;
+        const double w0y = 1-w1y;
+        const double w0z = 1-w1z;
+        // weights[c] multiplies field_components[c] in the accumulation loop below
+        weights[0] = w0x*w0y*w0z; // weight000
+        weights[1] = w0x*w0y*w1z; // weight001
+        weights[2] = w0x*w1y*w0z; // weight010
+        weights[3] = w0x*w1y*w1z; // weight011
+        weights[4] = w1x*w0y*w0z; // weight100
+        weights[5] = w1x*w0y*w1z; // weight101
+        weights[6] = w1x*w1y*w0z; // weight110
+        weights[7] = w1x*w1y*w1z; // weight111
+
+        pfloat Exl = 0.0;
+        pfloat Eyl = 0.0;
+        pfloat Ezl = 0.0;
+        pfloat Bxl = 0.0;
+        pfloat Byl = 0.0;
+        pfloat Bzl = 0.0;
+        // entries 0..2 of each field slice accumulate into Bxl,Byl,Bzl and entries 3..5 into Exl,Eyl,Ezl
+        for(int c=0; c<8; c++)
+        {
+          Bxl += weights[c] * field_components[c][0];
+          Byl += weights[c] * field_components[c][1];
+          Bzl += weights[c] * field_components[c][2];
+          Exl += weights[c] * field_components[c][3];
+          Eyl += weights[c] * field_components[c][4];
+          Ezl += weights[c] * field_components[c][5];
+        }
+        const double Omx = qdto2mc*Bxl;
+        const double Omy = qdto2mc*Byl;
+        const double Omz = qdto2mc*Bzl;
+
+        // end interpolation
+        const pfloat omsq = (Omx * Omx + Omy * Omy + Omz * Omz);
+        const pfloat denom = 1.0 / (1.0 + omsq);
+        // solve the position equation
+        const pfloat ut = uorig + qdto2mc * Exl;
+        const pfloat vt = vorig + qdto2mc * Eyl;
+        const pfloat wt = worig + qdto2mc * Ezl;
+        //const pfloat udotb = ut * Bxl + vt * Byl + wt * Bzl;
+        const pfloat udotOm = ut * Omx + vt * Omy + wt * Omz;
+        // solve the velocity equation
+        uavg = (ut + (vt * Omz - wt * Omy + udotOm * Omx)) * denom;
+        vavg = (vt + (wt * Omx - ut * Omz + udotOm * Omy)) * denom;
+        wavg = (wt + (ut * Omy - vt * Omx + udotOm * Omz)) * denom;
+        // update average position
+        xavg = xorig + uavg * dto2;
+        yavg = yorig + vavg * dto2;
+        zavg = zorig + wavg * dto2;
+      }
+      // update the final position and velocity
+      pcl->set_x(xorig + uavg * dt);
+      pcl->set_y(yorig + vavg * dt);
+      pcl->set_z(zorig + wavg * dt);
+      pcl->set_u(2.0 * uavg - uorig);
+      pcl->set_v(2.0 * vavg - vorig);
+      pcl->set_w(2.0 * wavg - worig);
+    }
+  }
+  #pragma omp master
+  { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
+}
+
 /** mover with a Predictor-Corrector scheme */
 void Particles3D::mover_PC_vectorized(
   Grid * grid, VirtualTopology3D * vct, Field * EMf)
 {
   convertParticlesToSoA();
-  double* xavg = fetch_xavg();
-  double* yavg = fetch_yavg();
-  double* zavg = fetch_zavg();
   assert_eq(nxc,nxn-1);
   assert_eq(nyc,nyn-1);
   assert_eq(nzc,nzn-1);
@@ -783,9 +949,9 @@ void Particles3D::mover_PC_vectorized(
   #pragma omp for schedule(static)
   for(int pidx = 0; pidx < nop; pidx++)
   {
-    xavg[pidx] = x[pidx];
-    yavg[pidx] = y[pidx];
-    zavg[pidx] = z[pidx];
+    _xavg[pidx] = x[pidx];
+    _yavg[pidx] = y[pidx];
+    _zavg[pidx] = z[pidx];
   }
 
   const pfloat dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
@@ -797,7 +963,8 @@ void Particles3D::mover_PC_vectorized(
       #pragma omp master
       {
         timeTasks_begin_task(TimeTasks::MOVER_PCL_SORTING);
-        sort_particles_serial(xavg, yavg, zavg, grid,vct);
+        // this changes the definitions of x,y,z,u,v,w,_xavg,_yavg,_zavg,etc.
+        sort_particles_serial_SoA_by_xavg(grid,vct);
         timeTasks_end_task(TimeTasks::MOVER_PCL_SORTING);
       }
       #pragma omp barrier
@@ -811,6 +978,15 @@ void Particles3D::mover_PC_vectorized(
     //const int ncells=nxc*nyc*nzc;
     //int *numpcls_in_bucket_1d = &numpcls_in_bucket[0][0][0];
     //int *bucket_offset_1d = &bucket_offset[0][0][0];
+    ALIGNED(x);
+    ALIGNED(y);
+    ALIGNED(z);
+    ALIGNED(u);
+    ALIGNED(v);
+    ALIGNED(w);
+    ALIGNED(_xavg);
+    ALIGNED(_yavg);
+    ALIGNED(_zavg);
     int serial_pidx = 0;
     #pragma omp for collapse(2) // schedule(static)
     for(int cx=0;cx<nxc;cx++)
@@ -823,7 +999,7 @@ void Particles3D::mover_PC_vectorized(
       const int iy = cy+1;
       const int iz = cz+1;
 
-      arr1_pfloat_get field_components[8];
+      arr1_double_get field_components[8];
       field_components[0] = fieldForPcls[ix][iy][iz]; // field000
       field_components[1] = fieldForPcls[ix][iy][cz]; // field001
       field_components[2] = fieldForPcls[ix][cy][iz]; // field010
@@ -839,39 +1015,33 @@ void Particles3D::mover_PC_vectorized(
       const int numpcls_in_cell = get_numpcls_in_bucket(cx,cy,cz);
       const int bucket_offset = get_bucket_offset(cx,cy,cz);
       const int bucket_end = bucket_offset+numpcls_in_cell;
-      ALIGNED(x);
-      ALIGNED(y);
-      ALIGNED(z);
-      ALIGNED(u);
-      ALIGNED(v);
-      ALIGNED(w);
-      ALIGNED(xavg);
-      ALIGNED(yavg);
-      ALIGNED(zavg);
       // This pragma help on Xeon but hurts on Xeon Phi.
       // On the Phi we could accelerate by processing two particles at a time.
+      // there should be no function calls in this loop (except inlined calls)
       #pragma simd
-      //for(int pidx=bucket_offset_1d[cell]; pidx<numpcls_in_cell; pidx++)
       for(int pidx=bucket_offset; pidx<bucket_end; pidx++)
       {
         // serial case: check that pidx is correct
-        //assert_eq(pidx,serial_pidx++);
+        //assert_eq(pidx,serial_pidx);
+        //serial_pidx++;
         // confirm that particle is in correct cell
         //if(true)
         //{
         //  int cx_,cy_,cz_;
-        //  get_safe_cell_for_pos(cx_,cy_,cz_,xavg[pidx],yavg[pidx],zavg[pidx]);
-        //  //if((cx_!=cx)
-        //  // ||(cy_!=cy)
-        //  // ||(cz_!=cz))
-        //  //{
-        //  //  dprintf("\n\t cx =%d, cy =%d, cz =%d"
-        //  //          "\n\t cx_=%d, cy_=%d, cz_=%d"
-        //  //          "\n\t x=%g, y=%g, z_=%g",
-        //  //          cx,cy,cz,
-        //  //          cx_,cy_,cz_,
-        //  //          xavg[pidx], yavg[pidx], zavg[pidx]);
-        //  //}
+        //  get_safe_cell_for_pos(cx_,cy_,cz_,_xavg[pidx],_yavg[pidx],_zavg[pidx]);
+        //  if((cx_!=cx)
+        //   ||(cy_!=cy)
+        //   ||(cz_!=cz))
+        //  {
+        //    dprintf("\n\t cx =%d, cy =%d, cz =%d"
+        //            "\n\t cx_=%d, cy_=%d, cz_=%d"
+        //            "\n\t cxf=%g, cyf=%g, czf=%g",
+        //            cx,cy,cz,
+        //            cx_,cy_,cz_,
+        //            1+(_xavg[pidx]-xstart)*inv_dx,
+        //            1+(_yavg[pidx]-ystart)*inv_dy,
+        //            1+(_zavg[pidx]-zstart)*inv_dz);
+        //  }
         //  assert_eq(cx_,cx);
         //  assert_eq(cy_,cy);
         //  assert_eq(cz_,cz);
@@ -888,9 +1058,9 @@ void Particles3D::mover_PC_vectorized(
         // compute weights for field components
         //
         double weights[8];
-        const double abs_xpos = xavg[pidx];
-        const double abs_ypos = yavg[pidx];
-        const double abs_zpos = zavg[pidx];
+        const double abs_xpos = _xavg[pidx];
+        const double abs_ypos = _yavg[pidx];
+        const double abs_zpos = _zavg[pidx];
         // xstart marks start of domain excluding ghosts
         const double rel_xpos = abs_xpos - xstart;
         const double rel_ypos = abs_ypos - ystart;
@@ -959,9 +1129,9 @@ void Particles3D::mover_PC_vectorized(
         const pfloat vavg = (vt + (wt * Omx - ut * Omz + udotOm * Omy)) * denom;
         const pfloat wavg = (wt + (ut * Omy - vt * Omx + udotOm * Omz)) * denom;
         // update average position
-        xavg[pidx] = xorig + uavg * dto2;
-        yavg[pidx] = yorig + vavg * dto2;
-        zavg[pidx] = zorig + wavg * dto2;
+        _xavg[pidx] = xorig + uavg * dto2;
+        _yavg[pidx] = yorig + vavg * dto2;
+        _zavg[pidx] = zorig + wavg * dto2;
 
         // if it is the last iteration, update the position and velocity
         // (hopefully this will not compromise vectorization...)
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index f5f49e63..f3bdf39a 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -189,21 +189,10 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   // ////////////// ALLOCATE ARRAYS /////////////////////////
   // //////////////////////////////////////////////////////////////
   //
-  // AoS particle representation
-  //
-  // intel new allocates with 64-byte alignment
-  // since particles are 64 bytes wide, every particle
-  // is aligned.
-  _pcls = new SpeciesParticle[npmax];
-  particleType = ParticleType::SoA;
-  #ifdef __INTEL_COMPILER
-    assert_eq(sizeof(SpeciesParticle),64);
-    ALIGNED(_pcls);
-  #endif
-  //
   // SoA particle representation
   //
   // positions
+  particleType = ParticleType::SoA;
   x = new double[npmax];
   y = new double[npmax];
   z = new double[npmax];
@@ -235,6 +224,7 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   _zavgtmp = 0;
   _pcls = 0;
   _pclstmp = 0;
+  _ParticleIDtmp = 0;
   // accessors for data that should be allocated only if needed
   //
   if(Parameters::get_USING_XAVG())
@@ -263,12 +253,20 @@ void Particles3Dcomm::allocate(int species, CollectiveIO * col, VirtualTopology3
   if(Parameters::get_USING_AOS())
   {
     assert_eq(sizeof(SpeciesParticle),64);
-    _pcls = AlignedAlloc(SpeciesParticle,npmax);
-    _pclstmp = AlignedAlloc(SpeciesParticle,npmax);
+    //_pcls = AlignedAlloc(SpeciesParticle,npmax);
+    _pcls = new SpeciesParticle[npmax];
+    if(Parameters::get_SORTING_PARTICLES())
+    {
+      //_pclstmp = AlignedAlloc(SpeciesParticle,npmax);
+      _pclstmp = new SpeciesParticle[npmax];
+    }
+    #ifdef __INTEL_COMPILER
+      assert_eq(sizeof(SpeciesParticle),64);
+      ALIGNED(_pcls);
+    #endif
   }
 
   ParticleID = 0;
-  _ParticleIDtmp = 0;
   // ID
   if (TrackParticleID) {
     ParticleID = new long long[npmax];
@@ -1025,7 +1023,7 @@ void Particles3Dcomm::sort_particles_serial(Grid * grid, VirtualTopology3D * vct
       sort_particles_serial_AoS(grid,vct);
       break;
     case ParticleType::SoA:
-      sort_particles_serial(x,y,z, grid,vct);
+      sort_particles_serial_SoA(grid,vct);
       break;
     default:
       unsupported_value_error(particleType);
@@ -1105,8 +1103,7 @@ void Particles3Dcomm::sort_particles_serial_AoS(
 }
 
 // need to sort and communicate particles after each iteration
-void Particles3Dcomm::sort_particles_serial(
-  double *xpos, double *ypos, double *zpos,
+void Particles3Dcomm::sort_particles_serial_SoA(
   Grid * grid, VirtualTopology3D * vct)
 {
   double * xtmp = fetch_xtmp();
@@ -1116,8 +1113,15 @@ void Particles3Dcomm::sort_particles_serial(
   double * vtmp = fetch_vtmp();
   double * wtmp = fetch_wtmp();
   double * qtmp = fetch_qtmp();
+
   long long* ParticleIDtmp = 0;
-  if (TrackParticleID) ParticleIDtmp = fetch_ParticleIDtmp();
+  if (TrackParticleID)
+  {
+    assert(ParticleID);
+    ParticleIDtmp = fetch_ParticleIDtmp();
+    assert(fetch_ParticleIDtmp());
+    assert(ParticleIDtmp);
+  }
 
   // sort the particles
   {
@@ -1128,7 +1132,7 @@ void Particles3Dcomm::sort_particles_serial(
       // get the cell indices of the particle
       //
       int cx,cy,cz;
-      get_safe_cell_for_pos(cx,cy,cz,xpos[pidx],ypos[pidx],zpos[pidx]);
+      get_safe_cell_for_pos(cx,cy,cz,x[pidx],y[pidx],z[pidx]);
       //
       // is it better just to recompute this?
       //
@@ -1154,13 +1158,14 @@ void Particles3Dcomm::sort_particles_serial(
     assert_eq(accpcls,nop);
 
     numpcls_in_bucket_now->setall(0);
+
     // put the particles where they are supposed to go
     for (int pidx = 0; pidx < nop; pidx++)
     {
       // get the cell indices of the particle
       //
       int cx,cy,cz;
-      get_safe_cell_for_pos(cx,cy,cz,xpos[pidx],ypos[pidx],zpos[pidx]);
+      get_safe_cell_for_pos(cx,cy,cz,x[pidx],y[pidx],z[pidx]);
       //
       //cx = xcell[pidx];
       //cy = ycell[pidx];
@@ -1187,17 +1192,161 @@ void Particles3Dcomm::sort_particles_serial(
       {
         ParticleIDtmp[outpidx] = ParticleID[pidx];
       }
-      if(_xavg)
+    }
+    // swap the tmp particle memory with the official particle memory
+    {
+      swap(_xtmp,x);
+      swap(_ytmp,y);
+      swap(_ztmp,z);
+      swap(_utmp,u);
+      swap(_vtmp,v);
+      swap(_wtmp,w);
+      swap(_qtmp,q);
+      swap(_ParticleIDtmp,ParticleID);
+    }
+
+    // check that the number of bins was correct
+    //
+    if(true)
+    {
+      for(int cx=0;cx<nxc;cx++)
+      for(int cy=0;cy<nyc;cy++)
+      for(int cz=0;cz<nzc;cz++)
       {
-        double* xavg = fetch_xavg();
-        double* yavg = fetch_yavg();
-        double* zavg = fetch_zavg();
-        double* xavgtmp = fetch_xavgtmp();
-        double* yavgtmp = fetch_yavgtmp();
-        double* zavgtmp = fetch_zavgtmp();
-        xavgtmp[outpidx] = xavg[pidx];
-        yavgtmp[outpidx] = yavg[pidx];
-        zavgtmp[outpidx] = zavg[pidx];
+        assert_eq((*numpcls_in_bucket_now)[cx][cy][cz], (*numpcls_in_bucket)[cx][cy][cz]);
+      }
+    }
+    // confirm that the particles were sorted correctly
+    if(false)
+    {
+      for(int cx=0;cx<nxc;cx++)
+      for(int cy=0;cy<nyc;cy++)
+      for(int cz=0;cz<nzc;cz++)
+      {
+        const int numpcls_in_cell = get_numpcls_in_bucket(cx,cy,cz);
+        const int bucket_offset = get_bucket_offset(cx,cy,cz);
+        const int bucket_end = bucket_offset+numpcls_in_cell;
+        for(int pidx=bucket_offset; pidx<bucket_end; pidx++)
+        {
+          // confirm that particle is in correct cell
+          {
+            int cx_,cy_,cz_;
+            get_safe_cell_for_pos(cx_,cy_,cz_,x[pidx],y[pidx],z[pidx]);
+            if((cx_!=cx)
+             ||(cy_!=cy)
+             ||(cz_!=cz))
+            {
+              dprintf("\n\t cx =%d, cy =%d, cz =%d"
+                      "\n\t cx_=%d, cy_=%d, cz_=%d"
+                      "\n\t cxf=%f, cyf=%f, czf=%f",
+                      cx,cy,cz,
+                      cx_,cy_,cz_,
+                      1.+(x[pidx]-xstart)*inv_dx,
+                      1.+(y[pidx]-ystart)*inv_dy,
+                      1.+(z[pidx]-zstart)*inv_dz);
+            }
+            assert_eq(cx_,cx);
+            assert_eq(cy_,cy);
+            assert_eq(cz_,cz);
+          }
+        }
+      }
+    }
+  }
+}
+
+// need to sort and communicate particles after each iteration
+void Particles3Dcomm::sort_particles_serial_SoA_by_xavg(
+  Grid * grid, VirtualTopology3D * vct)
+{
+  double * xtmp = fetch_xtmp();
+  double * ytmp = fetch_ytmp();
+  double * ztmp = fetch_ztmp();
+  double * utmp = fetch_utmp();
+  double * vtmp = fetch_vtmp();
+  double * wtmp = fetch_wtmp();
+  double * qtmp = fetch_qtmp();
+  double* xavg = fetch_xavg();
+  double* yavg = fetch_yavg();
+  double* zavg = fetch_zavg();
+  double* xavgtmp = fetch_xavgtmp();
+  double* yavgtmp = fetch_yavgtmp();
+  double* zavgtmp = fetch_zavgtmp();
+
+  long long* ParticleIDtmp = 0;
+  if (TrackParticleID) ParticleIDtmp = fetch_ParticleIDtmp();
+
+  // sort the particles
+  {
+    numpcls_in_bucket->setall(0);
+    // iterate through particles and count where they will go
+    for (int pidx = 0; pidx < nop; pidx++)
+    {
+      // get the cell indices of the particle
+      //
+      int cx,cy,cz;
+      get_safe_cell_for_pos(cx,cy,cz,xavg[pidx],yavg[pidx],zavg[pidx]);
+      //
+      // is it better just to recompute this?
+      //
+      //xcell[pidx]=cx;
+      //ycell[pidx]=cy;
+      //zcell[pidx]=cz;
+
+      // increment the number of particles in bucket of this particle
+      (*numpcls_in_bucket)[cx][cy][cz]++;
+    }
+
+    // compute prefix sum to determine initial position
+    // of each bucket (could parallelize this)
+    //
+    int accpcls=0;
+    for(int cx=0;cx<nxc;cx++)
+    for(int cy=0;cy<nyc;cy++)
+    for(int cz=0;cz<nzc;cz++)
+    {
+      (*bucket_offset)[cx][cy][cz] = accpcls;
+      accpcls += (*numpcls_in_bucket)[cx][cy][cz];
+    }
+    assert_eq(accpcls,nop);
+
+    numpcls_in_bucket_now->setall(0);
+
+    // put the particles where they are supposed to go
+    for (int pidx = 0; pidx < nop; pidx++)
+    {
+      // get the cell indices of the particle
+      //
+      int cx,cy,cz;
+      get_safe_cell_for_pos(cx,cy,cz,xavg[pidx],yavg[pidx],zavg[pidx]);
+      //
+      //cx = xcell[pidx];
+      //cy = ycell[pidx];
+      //cz = zcell[pidx];
+
+      // compute where the data should go
+      const int numpcls_now = (*numpcls_in_bucket_now)[cx][cy][cz]++;
+      const int outpidx = (*bucket_offset)[cx][cy][cz] + numpcls_now;
+      assert_lt(outpidx, nop);
+      assert_ge(outpidx, 0);
+      assert_lt(pidx, nop);
+      assert_ge(pidx, 0);
+
+      // copy particle data to new location
+      //
+      xtmp[outpidx] = x[pidx];
+      ytmp[outpidx] = y[pidx];
+      ztmp[outpidx] = z[pidx];
+      utmp[outpidx] = u[pidx];
+      vtmp[outpidx] = v[pidx];
+      wtmp[outpidx] = w[pidx];
+      qtmp[outpidx] = q[pidx];
+      xavgtmp[outpidx] = xavg[pidx];
+      yavgtmp[outpidx] = yavg[pidx];
+      zavgtmp[outpidx] = zavg[pidx];
+      if (TrackParticleID)
+      {
+        ParticleIDtmp[outpidx] = ParticleID[pidx];
       }
     }
     // swap the tmp particle memory with the official particle memory
@@ -1214,8 +1363,12 @@ void Particles3Dcomm::sort_particles_serial(
       swap(_yavgtmp,_yavg);
       swap(_zavgtmp,_zavg);
     }
+    xavg = _xavg;
+    yavg = _yavg;
+    zavg = _zavg;
 
-    // check if the particles were sorted incorrectly
+    // check that the number of bins was correct
+    //
     if(true)
     {
       for(int cx=0;cx<nxc;cx++)
@@ -1225,6 +1378,46 @@ void Particles3Dcomm::sort_particles_serial(
         assert_eq((*numpcls_in_bucket_now)[cx][cy][cz], (*numpcls_in_bucket)[cx][cy][cz]);
       }
     }
+    int serial_pidx=0;
+    // confirm that the particles were sorted correctly
+    for(int cx=0;cx<nxc;cx++)
+    for(int cy=0;cy<nyc;cy++)
+    for(int cz=0;cz<nzc;cz++)
+    {
+      const int numpcls_in_cell = get_numpcls_in_bucket(cx,cy,cz);
+      const int bucket_offset = get_bucket_offset(cx,cy,cz);
+      const int bucket_end = bucket_offset+numpcls_in_cell;
+      for(int pidx=bucket_offset; pidx<bucket_end; pidx++)
+      {
+        // serial case: check that pidx is correct
+        assert_eq(pidx,serial_pidx);
+        serial_pidx++;
+        // confirm that particle is in correct cell
+        if(true)
+        {
+          int cx_,cy_,cz_;
+          get_safe_cell_for_pos(cx_,cy_,cz_,xavg[pidx],yavg[pidx],zavg[pidx]);
+          if((cx_!=cx)
+           ||(cy_!=cy)
+           ||(cz_!=cz))
+          {
+            dprint(cx)
+            dprintf("\n\t cx =%d, cy =%d, cz =%d"
+                    "\n\t cx_=%d, cy_=%d, cz_=%d"
+                    "\n\t cxf=%f, cyf=%f, czf=%f",
+                    cx,cy,cz,
+                    cx_,cy_,cz_,
+                    1.+(xavg[pidx]-xstart)*inv_dx,
+                    1.+(yavg[pidx]-ystart)*inv_dy,
+                    1.+(zavg[pidx]-zstart)*inv_dz);
+            dprint(serial_pidx);
+          }
+          assert_eq(cx_,cx);
+          assert_eq(cy_,cy);
+          assert_eq(cz_,cz);
+        }
+      }
+    }
   }
 }
 
@@ -1384,13 +1577,29 @@ void Particles3Dcomm::copyParticlesToAoS()
   SpeciesParticle * pcls = fetch_pcls();
   timeTasks_set_task(TimeTasks::TRANSPOSE_PCLS_TO_AOS);
   dprintf("copying to array of structs");
-  #pragma omp for
-  for(int pidx=0; pidx<nop; pidx++)
+  assert(pcls);
+  if(TrackParticleID)
   {
-    pcls[pidx].set( ParticleID ? ParticleID[pidx] : 0,
-      x[pidx],y[pidx],z[pidx],
-      u[pidx],v[pidx],w[pidx],
-      q[pidx]);
+    assert(ParticleID);
+    #pragma omp for
+    for(int pidx=0; pidx<nop; pidx++)
+    {
+      pcls[pidx].set( ParticleID[pidx],
+        x[pidx],y[pidx],z[pidx],
+        u[pidx],v[pidx],w[pidx],
+        q[pidx]);
+    }
+  }
+  else
+  {
+    #pragma omp for
+    for(int pidx=0; pidx<nop; pidx++)
+    {
+      pcls[pidx].set( 0,
+        x[pidx],y[pidx],z[pidx],
+        u[pidx],v[pidx],w[pidx],
+        q[pidx]);
+    }
   }
 }
 

From 48b10315d0cf6d9f01808fd728175f52e9bf13d2 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 31 Jan 2014 01:10:24 +0100
Subject: [PATCH 098/118] fixed bug in "ipic cmake" command

---
 scripts/ipic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ipic.py b/scripts/ipic.py
index 141ed769..d043f2ff 100755
--- a/scripts/ipic.py
+++ b/scripts/ipic.py
@@ -194,7 +194,7 @@ def ipic_cmake(args):
       sys.exit()
 
     if sourcedir!='src':
-      rm_command = ['rm -f', 'src'];
+      rm_command = ['rm', '-f', 'src'];
       issue_command(rm_command);
       ln_command = ['ln', '-s', str(sourcedir), 'src'];
       issue_command(ln_command)

From af34642a03e6311bf6403c6ff095b5339f839698 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 3 Feb 2014 18:01:23 +0100
Subject: [PATCH 099/118] fixed ipic cmake

---
 scripts/ipic.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/ipic.py b/scripts/ipic.py
index d043f2ff..f00b018c 100755
--- a/scripts/ipic.py
+++ b/scripts/ipic.py
@@ -216,7 +216,10 @@ def ipic_cmake(args):
 def ipic_findcpph(args):
     # create tags file using ctags
     command = '''find . -name '*.cpp' -or -name '*.h' | grep -v unused | grep -v postprocessing_tools'''
-    issue_shell_command(cmake_command)
+    if(show):
+      print command
+    else:
+      os.system(command)
 
 def ipic_ctags(args):
     # create tags file using ctags

From 6c7cd962139078d2684c4c5df90a15da4d3bfe2d Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 31 Jan 2014 16:31:58 +0100
Subject: [PATCH 100/118] implemented mover_PC_AoS_vec

---
 fields/EMfields3D.cpp         |   1 +
 grids/Grid3DCU.cpp            |   3 +
 include/Grid3DCU.h            | 110 ++++++++
 include/Parameters.h          |   8 +-
 include/Particles3D.h         |  10 +-
 include/Particles3Dcomm.h     |  62 ++---
 include/arraysfwd.h           |   2 +
 main/Parameters.cpp           |  22 +-
 main/iPic3Dlib.cpp            |   8 +-
 particles/Particles3D.cpp     | 487 +++++++++++++++-------------------
 particles/Particles3Dcomm.cpp |  16 +-
 11 files changed, 376 insertions(+), 353 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 923985e1..2bcc6fef 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -422,6 +422,7 @@ void EMfields3D::sumMoments(const Particles3Dcomm* part, Grid * grid, VirtualTop
     int moments1dsize = moments.get_size();
     for(int i=0; i<moments1dsize; i++) moments1d[i]=0;
     //
+    // This barrier is not needed
     #pragma omp barrier
     // The following loop is expensive, so it is wise to assume that the
     // compiler is stupid.  Therefore we should on the one hand
diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index e4b7d166..f13d8aed 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -28,6 +28,9 @@ Grid3DCU::Grid3DCU(CollectiveIO * col, VirtualTopology3D * vct) {
   nxn = nxc + 1;
   nyn = nyc + 1;
   nzn = nzc + 1;
+  cxlast = nxc-1;
+  cylast = nyc-1;
+  czlast = nzc-1;
   dx = col->getLx() / col->getNxc();
   dy = col->getLy() / col->getNyc();
   dz = col->getLz() / col->getNzc();
diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index c0a60e7e..92651196 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -139,6 +139,11 @@ class Grid3DCU                  // :public Grid
   double invdz;
   /** invol = inverse of volume*/
   double invVOL;
+  /** index of last cell including ghost cells */
+  // (precomputed for speed)
+  int cxlast; // nxc-1;
+  int cylast; // nyc-1;
+  int czlast; // nzc-1;
   /** node coordinate */
   pfloat *pfloat_node_xcoord;
   pfloat *pfloat_node_ycoord;
@@ -201,6 +206,111 @@ class Grid3DCU                  // :public Grid
   double getZstart() { return (zStart); }
   double getZend() { return (zEnd); }
   double getInvVOL() { return (invVOL); }
+
+  // inline methods to calculate mesh cell and weights.
+  static void get_weights(double weights[8],
+    double w0x, double w0y, double w0z,
+    double w1x, double w1y, double w1z)
+  {
+    // which of the following is faster?
+    //
+    // this:
+    //
+    //const double weight00 = w0x*w0y;
+    //const double weight01 = w0x*w1y;
+    //const double weight10 = w1x*w0y;
+    //const double weight11 = w1x*w1y;
+    //weights[0] = weight00*w0z; // weight000
+    //weights[1] = weight00*w1z; // weight001
+    //weights[2] = weight01*w0z; // weight010
+    //weights[3] = weight01*w1z; // weight011
+    //weights[4] = weight10*w0z; // weight100
+    //weights[5] = weight10*w1z; // weight101
+    //weights[6] = weight11*w0z; // weight110
+    //weights[7] = weight11*w1z; // weight111
+    //
+    // or this:
+    //
+    weights[0] = w0x*w0y*w0z; // weight000
+    weights[1] = w0x*w0y*w1z; // weight001
+    weights[2] = w0x*w1y*w0z; // weight010
+    weights[3] = w0x*w1y*w1z; // weight011
+    weights[4] = w1x*w0y*w0z; // weight100
+    weights[5] = w1x*w0y*w1z; // weight101
+    weights[6] = w1x*w1y*w0z; // weight110
+    weights[7] = w1x*w1y*w1z; // weight111
+  }
+  void get_cell_coordinates(
+    int& cx, int& cy, int& cz,
+    double xpos, double ypos, double zpos)
+  {
+      // xStart marks start of domain excluding ghosts
+      const double rel_xpos = xpos - xStart;
+      const double rel_ypos = ypos - yStart;
+      const double rel_zpos = zpos - zStart;
+      // cell position minus 1 (due to ghost cells)
+      const double cxm1_pos = rel_xpos * invdx;
+      const double cym1_pos = rel_ypos * invdy;
+      const double czm1_pos = rel_zpos * invdz;
+      cx = 1 + int(floor(cxm1_pos));
+      cy = 1 + int(floor(cym1_pos));
+      cz = 1 + int(floor(czm1_pos));
+  }
+  void make_cell_coordinates_safe(int& cx, int& cy, int& cz)
+  {
+    // if the cell is outside the domain, then treat it as
+    // in the nearest ghost cell.
+    //
+    if (cx < 0) cx = 0;
+    if (cy < 0) cy = 0;
+    if (cz < 0) cz = 0;
+    if (cx > cxlast) cx = cxlast; //nxc-1;
+    if (cy > cylast) cy = cylast; //nyc-1;
+    if (cz > czlast) cz = czlast; //nzc-1;
+  }
+  void get_safe_cell_coordinates(
+    int& cx, int& cy, int& cz,
+    double x, double y, double z)
+  {
+    get_cell_coordinates(cx,cy,cz,x,y,z);
+    make_cell_coordinates_safe(cx,cy,cz);
+  }
+  void get_safe_cell_and_weights(
+    double xpos, double ypos, double zpos,
+    int &cx, int& cy, int& cz,
+    double weights[8])
+  {
+    //convert_xpos_to_cxpos(xpos,ypos,zpos,cxpos,cypos,czpos);
+    // xStart marks start of domain excluding ghosts
+    const double rel_xpos = xpos - xStart;
+    const double rel_ypos = ypos - yStart;
+    const double rel_zpos = zpos - zStart;
+    // cell position minus 1 (due to ghost cells)
+    const double cxm1_pos = rel_xpos * invdx;
+    const double cym1_pos = rel_ypos * invdy;
+    const double czm1_pos = rel_zpos * invdz;
+    //
+    cx = 1 + int(floor(cxm1_pos));
+    cy = 1 + int(floor(cym1_pos));
+    cz = 1 + int(floor(czm1_pos));
+  
+    make_cell_coordinates_safe(cx,cy,cz);
+  
+    // fraction of the distance from the right of the cell
+    const double w1x = cx - cxm1_pos;
+    const double w1y = cy - cym1_pos;
+    const double w1z = cz - czm1_pos;
+    // fraction of distance from the left
+    const double w0x = 1.-w1x;
+    const double w0y = 1.-w1y;
+    const double w0z = 1.-w1z;
+
+    get_weights(weights, w0x, w0y, w0z, w1x, w1y, w1z);
+  }
+  void get_safe_cell_and_weights(double xpos[3], int cx[3], double weights[8])
+  {
+    get_safe_cell_and_weights(xpos[0],xpos[1],xpos[2],cx[0],cx[1],cx[2],weights);
+  }
 };
 
 typedef Grid3DCU Grid;
diff --git a/include/Parameters.h b/include/Parameters.h
index fac04e9e..819ea7f0 100644
--- a/include/Parameters.h
+++ b/include/Parameters.h
@@ -13,10 +13,10 @@ namespace Parameters
     AoSvec,
     SoAvec,
     // for mover type
-    SoAvec_onesort,
-    AoSvec_onesort,
-    SoAvec_resort,
-    AoSvec_resort,
+    SoA_vec_onesort,
+    AoS_vec_onesort,
+    SoA_vec_resort,
+    AoS_vec_resort,
   };
 
   void init_parameters();
diff --git a/include/Particles3D.h b/include/Particles3D.h
index 58c8fb49..ce0595e6 100644
--- a/include/Particles3D.h
+++ b/include/Particles3D.h
@@ -59,9 +59,11 @@ class Particles3D:public Particles3Dcomm {
     /** mover with a Predictor-Corrector Scheme */
     void mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** array-of-structs version of mover_PC */
-    void mover_PC_AoS2(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     void mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf);
+    /* vectorized version of previous */
     void mover_PC_AoS_vec(Grid * grid, VirtualTopology3D * vct, Field * EMf);
+    /* this computes garbage */
+    void mover_PC_AoS_vec_onesort(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** vectorized version of mover_PC **/
     void mover_PC_vectorized(Grid * grid, VirtualTopology3D * vct, Field * EMf);
     /** communicate particle after moving them */
@@ -84,6 +86,12 @@ class Particles3D:public Particles3Dcomm {
     void MaxwellianFromFluidCell(Grid* grid, Collective *col, int is, int i, int j, int k, int &ip, double *x, double *y, double *z, double *q, double *vx, double *vy, double *vz, unsigned long* ParticleID);
 #endif
 
+  private:
+
+    inline void get_field_components_for_cell(
+      arr1_double_get field_components[8],
+      const_arr4_double fieldForPcls,
+      int cx,int cy,int cz);
 };
 
 
diff --git a/include/Particles3Dcomm.h b/include/Particles3Dcomm.h
index 3299d243..5c97a8ba 100644
--- a/include/Particles3Dcomm.h
+++ b/include/Particles3Dcomm.h
@@ -76,46 +76,6 @@ class Particles3Dcomm // :public Particles
   void sort_particles_serial(Grid * grid, VirtualTopology3D * vct);
   void sort_particles_serial_AoS(Grid * grid, VirtualTopology3D * vct);
   void sort_particles_serial_SoA(Grid * grid, VirtualTopology3D * vct);
-  void get_safe_cell_for_pos(
-    int& cx, int& cy, int& cz, 
-    pfloat xpos, pfloat ypos, pfloat zpos)
-  {
-    // xstart is left edge of domain excluding ghost cells
-    // cx=0 for ghost cell layer.
-    cx = 1 + int(floor((xpos - xstart) * inv_dx));
-    cy = 1 + int(floor((ypos - ystart) * inv_dy));
-    cz = 1 + int(floor((zpos - zstart) * inv_dz));
-    //
-    // if the cell is outside the domain, then treat it as
-    // in the nearest ghost cell.
-    //
-    if (cx < 0) cx = 0;
-    if (cy < 0) cy = 0;
-    if (cz < 0) cz = 0;
-    // number of cells in x direction including ghosts is nxc
-    if (cx >= nxc) cx = nxc-1;
-    if (cy >= nyc) cy = nyc-1;
-    if (cz >= nzc) cz = nzc-1;
-  }
-
-  /*! version that assumes particle is in domain */
-  void get_cell_for_pos_in_domain(
-    int& cx, int& cy, int& cz, 
-    pfloat xpos, pfloat ypos, pfloat zpos)
-  {
-    // xstart is left edge of domain excluding ghost cells
-    // cx=0 for ghost cell layer.
-    cx = 1 + int(floor((xpos - xstart) * inv_dx));
-    cy = 1 + int(floor((ypos - ystart) * inv_dy));
-    cz = 1 + int(floor((zpos - zstart) * inv_dz));
-    //
-    assert_le(0,cx);
-    assert_le(0,cy);
-    assert_le(0,cz);
-    assert_le(cx,nxc);
-    assert_le(cy,nyc);
-    assert_le(cz,nzc);
-  }
 
   // get accessors for optional arrays
   //
@@ -290,10 +250,13 @@ class Particles3Dcomm // :public Particles
   int BirthRank[2];
   /** number of variables to be stored in buffer for communication for each particle  */
   int nVar;
-  /** Simulation domain lengths */
-  double xstart, xend, ystart, yend, zstart, zend, invVOL;
   /** time step */
   double dt;
+  //
+  // Copies of grid data (should just put pointer to Grid in this class)
+  //
+  /** Simulation domain lengths */
+  double xstart, xend, ystart, yend, zstart, zend, invVOL;
   /** Lx = simulation box length - x direction   */
   double Lx;
   /** Ly = simulation box length - y direction   */
@@ -306,6 +269,13 @@ class Particles3Dcomm // :public Particles
   int nxn, nyn, nzn;
   /** number of grid cells */
   int nxc, nyc, nzc;
+  // convenience values from grid
+  double inv_dx;
+  double inv_dy;
+  double inv_dz;
+  //
+  // Communication variables
+  //
   /** buffers for communication */
   /** size of sending buffers for exiting particles, DEFINED IN METHOD "COMMUNICATE" */
   int buffer_size;
@@ -376,6 +346,9 @@ class Particles3Dcomm // :public Particles
   int bcPfaceZright;
   /** Boundary Condition Particles: FaceYleft */
   int bcPfaceZleft;
+  //
+  // Other variables
+  //
   /** speed of light in vacuum */
   double c;
   /** restart variable for loading particles from restart file */
@@ -388,11 +361,6 @@ class Particles3Dcomm // :public Particles
   double Q_removed;
   /** density of the injection of the particles */
   double Ninj;
-
-  // convenience values from grid
-  double inv_dx;
-  double inv_dy;
-  double inv_dz;
 };
 
 typedef Particles3Dcomm Particles;
diff --git a/include/arraysfwd.h b/include/arraysfwd.h
index 86dbd5d8..0afc2826 100644
--- a/include/arraysfwd.h
+++ b/include/arraysfwd.h
@@ -63,11 +63,13 @@ typedef iPic3D::array_fetch1<double> arr1_double_fetch;
 typedef iPic3D::array_get1<double> arr1_double_get;
 typedef iPic3D::array_get1<pfloat> arr1_pfloat_get;
 typedef iPic3D::array_fetch2<double> arr2_double_fetch;
+typedef iPic3D::array_fetch3<double> arr3_double_fetch;
 #else
 typedef double* arr1_double_fetch;
 typedef double* arr1_double_get;
 typedef pfloat* arr1_pfloat_get;
 typedef double** arr2_double_fetch;
+typedef double*** arr3_double_fetch;
 #endif
 
 #endif
diff --git a/main/Parameters.cpp b/main/Parameters.cpp
index ca8ed480..60c45fc5 100644
--- a/main/Parameters.cpp
+++ b/main/Parameters.cpp
@@ -7,7 +7,7 @@ using namespace Parameters;
 bool Parameters::get_VECTORIZE_MOMENTS() { return false; }
 // supported options: SoA AoS
 Parameters::Enum Parameters::get_MOMENTS_TYPE() { return SoA; }
-// supported options: SoA AoS AoSvec_onesort SoAvec_resort
+// supported options: SoA AoS AoSvec AoS_vec_onesort SoA_vec_resort
 Parameters::Enum Parameters::get_MOVER_TYPE() { return SoA; }
 //********** derived parameters *********
 
@@ -19,21 +19,21 @@ static bool SORTING_SOA;
 void Parameters::init_parameters()
 {
   RESORTING_PARTICLES = 
-       get_MOVER_TYPE()==SoAvec_resort
-    || get_MOVER_TYPE()==AoSvec_resort;
+       get_MOVER_TYPE()==SoA_vec_resort
+    || get_MOVER_TYPE()==AoS_vec_resort;
   SORTING_PARTICLES = get_VECTORIZE_MOMENTS()
-    || get_MOVER_TYPE()==SoAvec_onesort
-    || get_MOVER_TYPE()==AoSvec_onesort
-    || get_MOVER_TYPE()==SoAvec_resort
-    || get_MOVER_TYPE()==AoSvec_resort;
+    || get_MOVER_TYPE()==SoA_vec_onesort
+    || get_MOVER_TYPE()==AoS_vec_onesort
+    || get_MOVER_TYPE()==SoA_vec_resort
+    || get_MOVER_TYPE()==AoS_vec_resort;
   SORTING_SOA = get_VECTORIZE_MOMENTS()
-    || get_MOVER_TYPE()==SoAvec_onesort
-    || get_MOVER_TYPE()==SoAvec_resort;
+    || get_MOVER_TYPE()==SoA_vec_onesort
+    || get_MOVER_TYPE()==SoA_vec_resort;
   USING_AOS =
        get_MOMENTS_TYPE()==AoS
     || get_MOVER_TYPE()==AoS
-    || get_MOVER_TYPE()==AoSvec_onesort
-    || get_MOVER_TYPE()==AoSvec_resort;
+    || get_MOVER_TYPE()==AoS_vec_onesort
+    || get_MOVER_TYPE()==AoS_vec_resort;
 }
 
 bool Parameters::get_RESORTING_PARTICLES() { return RESORTING_PARTICLES; }
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 9f60f114..33f9a96c 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -285,16 +285,18 @@ bool c_Solver::ParticlesMover() {
         case Parameters::SoA:
           part[i].mover_PC(grid, vct, EMf);
           break;
-        case Parameters::SoAvec_resort:
+        case Parameters::SoA_vec_resort:
           part[i].mover_PC_vectorized(grid, vct, EMf);
           break;
         case Parameters::AoS:
           part[i].mover_PC_AoS(grid, vct, EMf);
-          //part[i].mover_PC_AoS2(grid, vct, EMf);
           break;
-        case Parameters::AoSvec_onesort:
+        case Parameters::AoSvec:
           part[i].mover_PC_AoS_vec(grid, vct, EMf);
           break;
+        case Parameters::AoS_vec_onesort:
+          part[i].mover_PC_AoS_vec_onesort(grid, vct, EMf);
+          break;
         default:
           unsupported_value_error(Parameters::get_MOVER_TYPE());
       }
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index dd0af36b..5186a4d7 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -304,6 +304,47 @@ void Particles3D::AddPerturbationJ(double deltaBoB, double kx, double ky, double
   }
 }
 
+inline void Particles3D::get_field_components_for_cell(
+  arr1_double_get field_components[8],
+  const_arr4_double fieldForPcls,
+  int cx,int cy,int cz)
+{
+  // index of the node (interface) to the right of the cell, per axis
+  const int ix = cx+1;
+  const int iy = cy+1;
+  const int iz = cz+1;
+
+  // Candidate 1 (disabled): index fieldForPcls directly. Is this faster?
+  //
+  //field_components[0] = fieldForPcls[ix][iy][iz]; // field000
+  //field_components[1] = fieldForPcls[ix][iy][cz]; // field001
+  //field_components[2] = fieldForPcls[ix][cy][iz]; // field010
+  //field_components[3] = fieldForPcls[ix][cy][cz]; // field011
+  //field_components[4] = fieldForPcls[cx][iy][iz]; // field100
+  //field_components[5] = fieldForPcls[cx][iy][cz]; // field101
+  //field_components[6] = fieldForPcls[cx][cy][iz]; // field110
+  //field_components[7] = fieldForPcls[cx][cy][cz]; // field111
+  //
+  // Candidate 2 (in use): fetch each sub-array once and reuse the alias.
+  // Slot c = 4*bx + 2*by + bz, where bit 0 picks the right-hand node
+  // (ix,iy,iz) and bit 1 picks the cell's own index (cx,cy,cz).
+  // Creating these aliases seems to accelerate this method (by about
+  // 30%?) on the Xeon host processor, suggesting optimizer deficiency.
+  arr3_double_fetch field0 = fieldForPcls[ix];
+  arr3_double_fetch field1 = fieldForPcls[cx];
+  arr2_double_fetch field00 = field0[iy];
+  arr2_double_fetch field01 = field0[cy];
+  arr2_double_fetch field10 = field1[iy];
+  arr2_double_fetch field11 = field1[cy];
+  field_components[0] = field00[iz]; // field000 
+  field_components[1] = field00[cz]; // field001 
+  field_components[2] = field01[iz]; // field010 
+  field_components[3] = field01[cz]; // field011 
+  field_components[4] = field10[iz]; // field100 
+  field_components[5] = field10[cz]; // field101 
+  field_components[6] = field11[iz]; // field110 
+  field_components[7] = field11[cz]; // field111 
+}
 
 /** explicit mover */
 void Particles3D::mover_explicit(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
@@ -359,9 +400,9 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       if (iy > nyc) iy = nyc;
       if (iz > nzc) iz = nzc;
       // index of cell of particle;
-      //const int cx = ix - 1;
-      //const int cy = iy - 1;
-      //const int cz = iz - 1;
+      const int cx = ix - 1;
+      const int cy = iy - 1;
+      const int cz = iz - 1;
 
       const double xi0   = xavg - grid->getXN(ix-1);
       const double eta0  = yavg - grid->getYN(iy-1);
@@ -405,14 +446,7 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
       // on the Xeon host, processor, suggesting deficiency in the optimizer.
       //
       arr1_double_get field_components[8];
-      field_components[0] = fieldForPcls[ix  ][iy  ][iz  ]; // field000
-      field_components[1] = fieldForPcls[ix  ][iy  ][iz-1]; // field001
-      field_components[2] = fieldForPcls[ix  ][iy-1][iz  ]; // field010
-      field_components[3] = fieldForPcls[ix  ][iy-1][iz-1]; // field011
-      field_components[4] = fieldForPcls[ix-1][iy  ][iz  ]; // field100
-      field_components[5] = fieldForPcls[ix-1][iy  ][iz-1]; // field101
-      field_components[6] = fieldForPcls[ix-1][iy-1][iz  ]; // field110
-      field_components[7] = fieldForPcls[ix-1][iy-1][iz-1]; // field111
+      get_field_components_for_cell(field_components,fieldForPcls,cx,cy,cz);
 
       for(int c=0; c<8; c++)
       {
@@ -457,110 +491,54 @@ void Particles3D::mover_PC(Grid * grid, VirtualTopology3D * vct, Field * EMf) {
   { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
 }
 
-void Particles3D::mover_PC_AoS2(Grid * grid, VirtualTopology3D * vct, Field * EMf)
+void Particles3D::mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf)
 {
   convertParticlesToAoS();
-  SpeciesParticle * pcls = fetch_pcls();
   #pragma omp master
   if (vct->getCartesian_rank() == 0) {
     cout << "*** MOVER species " << ns << " ***" << NiterMover << " ITERATIONS   ****" << endl;
   }
   const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
 
+  SpeciesParticle * pcls = fetch_pcls();
   #pragma omp master
   { timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING); }
-  const pfloat dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
+  const double dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
   #pragma omp for schedule(static)
   for (int pidx = 0; pidx < nop; pidx++) {
     // copy the particle
-    SpeciesParticle& pcl = pcls[pidx];
-    const pfloat xorig = pcl.get_x();
-    const pfloat yorig = pcl.get_y();
-    const pfloat zorig = pcl.get_z();
-    const pfloat uorig = pcl.get_u();
-    const pfloat vorig = pcl.get_v();
-    const pfloat worig = pcl.get_w();
-    pfloat xavg = xorig;
-    pfloat yavg = yorig;
-    pfloat zavg = zorig;
-    pfloat uavg;
-    pfloat vavg;
-    pfloat wavg;
+    SpeciesParticle* pcl = &pcls[pidx];
+    ALIGNED(pcl);
+    const double xorig = pcl->get_x();
+    const double yorig = pcl->get_y();
+    const double zorig = pcl->get_z();
+    const double uorig = pcl->get_u();
+    const double vorig = pcl->get_v();
+    const double worig = pcl->get_w();
+    double xavg = xorig;
+    double yavg = yorig;
+    double zavg = zorig;
+    double uavg;
+    double vavg;
+    double wavg;
     // calculate the average velocity iteratively
     for (int innter = 0; innter < NiterMover; innter++) {
-      // interpolation G-->P
-      const pfloat ixd = floor((xavg - xstart) * inv_dx);
-      const pfloat iyd = floor((yavg - ystart) * inv_dy);
-      const pfloat izd = floor((zavg - zstart) * inv_dz);
-      // interface of index to right of cell
-      int ix = 2 + int(ixd);
-      int iy = 2 + int(iyd);
-      int iz = 2 + int(izd);
 
-      // use field data of closest cell in domain
+      // compute weights for field components
       //
-      if (ix < 1) ix = 1;
-      if (iy < 1) iy = 1;
-      if (iz < 1) iz = 1;
-      if (ix > nxc) ix = nxc;
-      if (iy > nyc) iy = nyc;
-      if (iz > nzc) iz = nzc;
-      // index of cell of particle;
-      //const int cx = ix - 1;
-      //const int cy = iy - 1;
-      //const int cz = iz - 1;
-
-      const pfloat xi0   = xavg - grid->get_pfloat_XN(ix-1);
-      const pfloat eta0  = yavg - grid->get_pfloat_YN(iy-1);
-      const pfloat zeta0 = zavg - grid->get_pfloat_ZN(iz-1);
-      const pfloat xi1   = grid->get_pfloat_XN(ix) - xavg;
-      const pfloat eta1  = grid->get_pfloat_YN(iy) - yavg;
-      const pfloat zeta1 = grid->get_pfloat_ZN(iz) - zavg;
-
-      pfloat Exl = 0.0;
-      pfloat Eyl = 0.0;
-      pfloat Ezl = 0.0;
-      pfloat Bxl = 0.0;
-      pfloat Byl = 0.0;
-      pfloat Bzl = 0.0;
-
-      pfloat weights[8];
-      const pfloat weight0 = invVOL*xi0;
-      const pfloat weight1 = invVOL*xi1;
-      const pfloat weight00 = weight0*eta0;
-      const pfloat weight01 = weight0*eta1;
-      const pfloat weight10 = weight1*eta0;
-      const pfloat weight11 = weight1*eta1;
-      weights[0] = weight00*zeta0; // weight000
-      weights[1] = weight00*zeta1; // weight001
-      weights[2] = weight01*zeta0; // weight010
-      weights[3] = weight01*zeta1; // weight011
-      weights[4] = weight10*zeta0; // weight100
-      weights[5] = weight10*zeta1; // weight101
-      weights[6] = weight11*zeta0; // weight110
-      weights[7] = weight11*zeta1; // weight111
-      //weights[0] = xi0 * eta0 * zeta0 * qi * invVOL; // weight000
-      //weights[1] = xi0 * eta0 * zeta1 * qi * invVOL; // weight001
-      //weights[2] = xi0 * eta1 * zeta0 * qi * invVOL; // weight010
-      //weights[3] = xi0 * eta1 * zeta1 * qi * invVOL; // weight011
-      //weights[4] = xi1 * eta0 * zeta0 * qi * invVOL; // weight100
-      //weights[5] = xi1 * eta0 * zeta1 * qi * invVOL; // weight101
-      //weights[6] = xi1 * eta1 * zeta0 * qi * invVOL; // weight110
-      //weights[7] = xi1 * eta1 * zeta1 * qi * invVOL; // weight111
+      double weights[8];
+      int cx,cy,cz;
+      grid->get_safe_cell_and_weights(xavg,yavg,zavg,cx,cy,cz,weights);
 
-      // creating these aliases seems to accelerate this method by about 30%
-      // on the Xeon host, processor, suggesting deficiency in the optimizer.
-      //
       arr1_double_get field_components[8];
-      field_components[0] = fieldForPcls[ix  ][iy  ][iz  ]; // field000
-      field_components[1] = fieldForPcls[ix  ][iy  ][iz-1]; // field001
-      field_components[2] = fieldForPcls[ix  ][iy-1][iz  ]; // field010
-      field_components[3] = fieldForPcls[ix  ][iy-1][iz-1]; // field011
-      field_components[4] = fieldForPcls[ix-1][iy  ][iz  ]; // field100
-      field_components[5] = fieldForPcls[ix-1][iy  ][iz-1]; // field101
-      field_components[6] = fieldForPcls[ix-1][iy-1][iz  ]; // field110
-      field_components[7] = fieldForPcls[ix-1][iy-1][iz-1]; // field111
-
+      get_field_components_for_cell(field_components,fieldForPcls,cx,cy,cz);
+
+      double Exl = 0.0;
+      double Eyl = 0.0;
+      double Ezl = 0.0;
+      double Bxl = 0.0;
+      double Byl = 0.0;
+      double Bzl = 0.0;
       for(int c=0; c<8; c++)
       {
         Bxl += weights[c] * field_components[c][0];
@@ -593,18 +571,18 @@ void Particles3D::mover_PC_AoS2(Grid * grid, VirtualTopology3D * vct, Field * EM
       zavg = zorig + wavg * dto2;
     }                           // end of iteration
     // update the final position and velocity
-    pcl.set_x(xorig + uavg * dt);
-    pcl.set_y(yorig + vavg * dt);
-    pcl.set_z(zorig + wavg * dt);
-    pcl.set_u(2.0 * uavg - uorig);
-    pcl.set_v(2.0 * vavg - vorig);
-    pcl.set_w(2.0 * wavg - worig);
-  }
+    pcl->set_x(xorig + uavg * dt);
+    pcl->set_y(yorig + vavg * dt);
+    pcl->set_z(zorig + wavg * dt);
+    pcl->set_u(2.0 * uavg - uorig);
+    pcl->set_v(2.0 * vavg - vorig);
+    pcl->set_w(2.0 * wavg - worig);
+  }                             // END OF ALL THE PARTICLES
   #pragma omp master
   { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
 }
 
-void Particles3D::mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf)
+void Particles3D::mover_PC_AoS_vec(Grid * grid, VirtualTopology3D * vct, Field * EMf)
 {
   convertParticlesToAoS();
   #pragma omp master
@@ -613,162 +591,141 @@ void Particles3D::mover_PC_AoS(Grid * grid, VirtualTopology3D * vct, Field * EMf
   }
   const_arr4_pfloat fieldForPcls = EMf->get_fieldForPcls();
 
+  const int NUM_PCLS_MOVED_AT_A_TIME = 8;
+  // make sure that we won't overrun memory
+  assert_divides(NUM_PCLS_MOVED_AT_A_TIME,npmax);
+
   SpeciesParticle * pcls = fetch_pcls();
   #pragma omp master
   { timeTasks_begin_task(TimeTasks::MOVER_PCL_MOVING); }
   const double dto2 = .5 * dt, qdto2mc = qom * dto2 / c;
   #pragma omp for schedule(static)
-  // why does single precision make no difference in execution speed?
-  //#pragma simd vectorlength(VECTOR_WIDTH)
-  for (int pidx = 0; pidx < nop; pidx++) {
-    // copy the particle
-    SpeciesParticle* pcl = &pcls[pidx];
-    ALIGNED(pcl);
-    const double xorig = pcl->get_x();
-    const double yorig = pcl->get_y();
-    const double zorig = pcl->get_z();
-    const double uorig = pcl->get_u();
-    const double vorig = pcl->get_v();
-    const double worig = pcl->get_w();
-    double xavg = xorig;
-    double yavg = yorig;
-    double zavg = zorig;
-    double uavg;
-    double vavg;
-    double wavg;
+  for (int pidx = 0; pidx < nop; pidx+=NUM_PCLS_MOVED_AT_A_TIME)
+  {
+    // copy the particles
+    SpeciesParticle* pcl[NUM_PCLS_MOVED_AT_A_TIME];
+    for(int i=0;i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+    {
+      pcl[i] = &pcls[pidx+i];
+    }
+    // actually, all the particles are aligned,
+    // but the compiler should be able to see that.
+    ALIGNED(pcl[0]);
+    double xorig[NUM_PCLS_MOVED_AT_A_TIME][3];
+    double uorig[NUM_PCLS_MOVED_AT_A_TIME][3];
+    double  xavg[NUM_PCLS_MOVED_AT_A_TIME][3];
+    double  uavg[NUM_PCLS_MOVED_AT_A_TIME][3];
+    // gather data into vectors
+    // #pragma simd collapse(2)
+    for(int i=0;i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+    for(int j=0;j<3;j++)
+    {
+      xavg[i][j] = xorig[i][j] = pcl[i]->get_x(j);
+      uorig[i][j] = pcl[i]->get_u(j);
+    }
     // calculate the average velocity iteratively
     for (int innter = 0; innter < NiterMover; innter++) {
 
       // compute weights for field components
       //
-      double weights[8];
-      // xstart marks start of domain excluding ghosts
-      const double rel_xpos = xavg - xstart;
-      const double rel_ypos = yavg - ystart;
-      const double rel_zpos = zavg - zstart;
-      // cell position minus 1 (due to ghost cells)
-      const double cxm1_pos = rel_xpos * inv_dx;
-      const double cym1_pos = rel_ypos * inv_dy;
-      const double czm1_pos = rel_zpos * inv_dz;
-      //
-      int cx = 1 + int(floor(cxm1_pos));
-      int cy = 1 + int(floor(cym1_pos));
-      int cz = 1 + int(floor(czm1_pos));
-
-      // if the cell is outside the domain, then treat it as
-      // in the nearest ghost cell.
-      //
-      if (cx < 0) cx = 0;
-      if (cy < 0) cy = 0;
-      if (cz < 0) cz = 0;
-      // number of cells in x direction including ghosts is nxc
-      if (cx >= nxc) cx = nxc-1;
-      if (cy >= nyc) cy = nyc-1;
-      if (cz >= nzc) cz = nzc-1;
-
-      // index of interface to right of cell
-      const int ix = cx + 1;
-      const int iy = cy + 1;
-      const int iz = cz + 1;
-
-      // fraction of the distance from the right of the cell
-      const double w1x = cx - cxm1_pos;
-      const double w1y = cy - cym1_pos;
-      const double w1z = cz - czm1_pos;
-      // fraction of distance from the left
-      const double w0x = 1-w1x;
-      const double w0y = 1-w1y;
-      const double w0z = 1-w1z;
-      //const double weight00 = w0x*w0y;
-      //const double weight01 = w0x*w1y;
-      //const double weight10 = w1x*w0y;
-      //const double weight11 = w1x*w1y;
-      //weights[0] = weight00*w0z; // weight000
-      //weights[1] = weight00*w1z; // weight001
-      //weights[2] = weight01*w0z; // weight010
-      //weights[3] = weight01*w1z; // weight011
-      //weights[4] = weight10*w0z; // weight100
-      //weights[5] = weight10*w1z; // weight101
-      //weights[6] = weight11*w0z; // weight110
-      //weights[7] = weight11*w1z; // weight111
-      //
-      weights[0] = w0x*w0y*w0z; // weight000
-      weights[1] = w0x*w0y*w1z; // weight001
-      weights[2] = w0x*w1y*w0z; // weight010
-      weights[3] = w0x*w1y*w1z; // weight011
-      weights[4] = w1x*w0y*w0z; // weight100
-      weights[5] = w1x*w0y*w1z; // weight101
-      weights[6] = w1x*w1y*w0z; // weight110
-      weights[7] = w1x*w1y*w1z; // weight111
-
-      pfloat Exl = 0.0;
-      pfloat Eyl = 0.0;
-      pfloat Ezl = 0.0;
-      pfloat Bxl = 0.0;
-      pfloat Byl = 0.0;
-      pfloat Bzl = 0.0;
+      double weights[NUM_PCLS_MOVED_AT_A_TIME][8];
+      int cx[NUM_PCLS_MOVED_AT_A_TIME][3];
+      for(int i=0;i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+      {
+        grid->get_safe_cell_and_weights(xavg[i],cx[i],weights[i]);
+      }
 
-      // creating these aliases seems to accelerate this method by about 30%
-      // on the Xeon host, processor, suggesting deficiency in the optimizer.
-      //
-      arr1_double_get field_components[8];
-      field_components[0] = fieldForPcls[ix][iy][iz]; // field000
-      field_components[1] = fieldForPcls[ix][iy][cz]; // field001
-      field_components[2] = fieldForPcls[ix][cy][iz]; // field010
-      field_components[3] = fieldForPcls[ix][cy][cz]; // field011
-      field_components[4] = fieldForPcls[cx][iy][iz]; // field100
-      field_components[5] = fieldForPcls[cx][iy][cz]; // field101
-      field_components[6] = fieldForPcls[cx][cy][iz]; // field110
-      field_components[7] = fieldForPcls[cx][cy][cz]; // field111
+      arr1_double_get field_components[NUM_PCLS_MOVED_AT_A_TIME][8];
+      for(int i=0;i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+      {
+        get_field_components_for_cell(field_components[i],fieldForPcls,
+          cx[i][0],cx[i][1],cx[i][2]);
+      }
 
+      double E[NUM_PCLS_MOVED_AT_A_TIME][3];
+      double B[NUM_PCLS_MOVED_AT_A_TIME][3];
+      // could do this with memset
+      // #pragma simd collapse(2)
+      for(int i=0;i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+      for(int j=0;j<3;j++)
+      {
+        E[i][j]=0;
+        B[i][j]=0;
+      }
+      for(int i=0; i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+      for(int j=0;j<3;j++)
       for(int c=0; c<8; c++)
       {
-        Bxl += weights[c] * field_components[c][0];
-        Byl += weights[c] * field_components[c][1];
-        Bzl += weights[c] * field_components[c][2];
-        Exl += weights[c] * field_components[c][3];
-        Eyl += weights[c] * field_components[c][4];
-        Ezl += weights[c] * field_components[c][5];
+        B[i][j] += weights[i][c] * field_components[i][c][j];
+        E[i][j] += weights[i][c] * field_components[i][c][j+3];
+      }
+      double Om[NUM_PCLS_MOVED_AT_A_TIME][3];
+      for(int i=0; i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+      for(int j=0;j<3;j++)
+      {
+        Om[i][j] = qdto2mc*B[i][j];
       }
-      const double Omx = qdto2mc*Bxl;
-      const double Omy = qdto2mc*Byl;
-      const double Omz = qdto2mc*Bzl;
 
-      // end interpolation
-      const pfloat omsq = (Omx * Omx + Omy * Omy + Omz * Omz);
-      const pfloat denom = 1.0 / (1.0 + omsq);
+      // can these dot products vectorize if
+      // NUM_PCLS_MOVED_AT_A_TIME is large enough?
+      double omsq[NUM_PCLS_MOVED_AT_A_TIME];
+      double denom[NUM_PCLS_MOVED_AT_A_TIME];
+      for(int i=0; i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+      {
+        omsq[i] = Om[i][0] * Om[i][0]
+                + Om[i][1] * Om[i][1]
+                + Om[i][2] * Om[i][2];
+        denom[i] = 1.0 / (1.0 + omsq[i]);
+      }
       // solve the position equation
-      const pfloat ut = uorig + qdto2mc * Exl;
-      const pfloat vt = vorig + qdto2mc * Eyl;
-      const pfloat wt = worig + qdto2mc * Ezl;
-      //const pfloat udotb = ut * Bxl + vt * Byl + wt * Bzl;
-      const pfloat udotOm = ut * Omx + vt * Omy + wt * Omz;
+      double ut[NUM_PCLS_MOVED_AT_A_TIME][3];
+      for(int i=0; i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+      for(int j=0;j<3;j++)
+      {
+        ut[i][j] = uorig[i][j] + qdto2mc * E[i][j];
+      }
+      double udotOm[NUM_PCLS_MOVED_AT_A_TIME];
+      for(int i=0; i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+      {
+        udotOm[i] = ut[i][0] * Om[i][0]
+                  + ut[i][1] * Om[i][1]
+                  + ut[i][2] * Om[i][2];
+      }
       // solve the velocity equation 
-      uavg = (ut + (vt * Omz - wt * Omy + udotOm * Omx)) * denom;
-      vavg = (vt + (wt * Omx - ut * Omz + udotOm * Omy)) * denom;
-      wavg = (wt + (ut * Omy - vt * Omx + udotOm * Omz)) * denom;
+      for(int i=0;i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+      {
+        // these cross-products might not vectorize so well...
+        uavg[i][0] = (ut[i][0] + (ut[i][1] * Om[i][2] - ut[i][2] * Om[i][1] + udotOm[i] * Om[i][0])) * denom[i];
+        uavg[i][1] = (ut[i][1] + (ut[i][2] * Om[i][0] - ut[i][0] * Om[i][2] + udotOm[i] * Om[i][1])) * denom[i];
+        uavg[i][2] = (ut[i][2] + (ut[i][0] * Om[i][1] - ut[i][1] * Om[i][0] + udotOm[i] * Om[i][2])) * denom[i];
+      }
       // update average position
-      xavg = xorig + uavg * dto2;
-      yavg = yorig + vavg * dto2;
-      zavg = zorig + wavg * dto2;
-    }                           // end of iteration
-    // update the final position and velocity
-    pcl->set_x(xorig + uavg * dt);
-    pcl->set_y(yorig + vavg * dt);
-    pcl->set_z(zorig + wavg * dt);
-    pcl->set_u(2.0 * uavg - uorig);
-    pcl->set_v(2.0 * vavg - vorig);
-    pcl->set_w(2.0 * wavg - worig);
-  }                             // END OF ALL THE PARTICLES
+      // #pragma simd collapse(2)
+      for(int i=0;i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+      for(int j=0;j<3;j++)
+      {
+        xavg[i][j] = xorig[i][j] + uavg[i][j] * dto2;
+      }
+    } // end of iteration
+    // update the final position and velocity (scatter)
+    for(int i=0;i<NUM_PCLS_MOVED_AT_A_TIME;i++)
+    for(int j=0;j<3;j++)
+    {
+      pcl[i]->set_x(j, xorig[i][j] + uavg[i][j] * dt);
+      pcl[i]->set_u(j, 2.*uavg[i][j] - uorig[i][j]);
+    }
+  }
   #pragma omp master
   { timeTasks_end_task(TimeTasks::MOVER_PCL_MOVING); }
 }
 
-// this currently computes garbage but execution time
-// suggests bound on performance.  For correct execution
-// would need to sort by xavg with each iteration
-// like in mover_PC_vectorized
-void Particles3D::mover_PC_AoS_vec(
+// This currently computes extrapolated values based on field in
+// original mesh cell (unstable?), but execution time suggests
+// bound on performance.  For correct execution would need to
+// sort by xavg with each iteration like in mover_PC_vectorized.
+// But in fact this does not run any faster than mover_PC_AoS
+//
+void Particles3D::mover_PC_AoS_vec_onesort(
   Grid * grid, VirtualTopology3D * vct, Field * EMf)
 {
   convertParticlesToAoS();
@@ -789,20 +746,11 @@ void Particles3D::mover_PC_AoS_vec(
   for(int cz=0;cz<nzc;cz++)
   //for(int cell=0; cell<ncells; cell++)
   {
-    // interface to the right of cell
-    const int ix = cx+1;
-    const int iy = cy+1;
-    const int iz = cz+1;
-
+    // Idea of this function is that we only need
+    // to do this once for each group of particles.
+    //
     arr1_double_get field_components[8];
-    field_components[0] = fieldForPcls[ix][iy][iz]; // field000
-    field_components[1] = fieldForPcls[ix][iy][cz]; // field001
-    field_components[2] = fieldForPcls[ix][cy][iz]; // field010
-    field_components[3] = fieldForPcls[ix][cy][cz]; // field011
-    field_components[4] = fieldForPcls[cx][iy][iz]; // field100
-    field_components[5] = fieldForPcls[cx][iy][cz]; // field101
-    field_components[6] = fieldForPcls[cx][cy][iz]; // field110
-    field_components[7] = fieldForPcls[cx][cy][cz]; // field111
+    get_field_components_for_cell(field_components,fieldForPcls,cx,cy,cz);
 
     // push all particles in mesh cell
     //
@@ -841,26 +789,6 @@ void Particles3D::mover_PC_AoS_vec(
         const double cxm1_pos = rel_xpos * inv_dx;
         const double cym1_pos = rel_ypos * inv_dy;
         const double czm1_pos = rel_zpos * inv_dz;
-        //
-        int cx = 1 + int(floor(cxm1_pos));
-        int cy = 1 + int(floor(cym1_pos));
-        int cz = 1 + int(floor(czm1_pos));
-
-        // if the cell is outside the domain, then treat it as
-        // in the nearest ghost cell.
-        //
-        if (cx < 0) cx = 0;
-        if (cy < 0) cy = 0;
-        if (cz < 0) cz = 0;
-        // number of cells in x direction including ghosts is nxc
-        if (cx >= nxc) cx = nxc-1;
-        if (cy >= nyc) cy = nyc-1;
-        if (cz >= nzc) cz = nzc-1;
-
-        // index of interface to right of cell
-        const int ix = cx + 1;
-        const int iy = cy + 1;
-        const int iz = cz + 1;
 
         // fraction of the distance from the right of the cell
         const double w1x = cx - cxm1_pos;
@@ -871,14 +799,16 @@ void Particles3D::mover_PC_AoS_vec(
         const double w0y = 1-w1y;
         const double w0z = 1-w1z;
         //
-        weights[0] = w0x*w0y*w0z; // weight000
-        weights[1] = w0x*w0y*w1z; // weight001
-        weights[2] = w0x*w1y*w0z; // weight010
-        weights[3] = w0x*w1y*w1z; // weight011
-        weights[4] = w1x*w0y*w0z; // weight100
-        weights[5] = w1x*w0y*w1z; // weight101
-        weights[6] = w1x*w1y*w0z; // weight110
-        weights[7] = w1x*w1y*w1z; // weight111
+        Grid::get_weights(weights, w0x, w0y, w0z, w1x, w1y, w1z);
+
+        //if(false) // this would fail
+        //{
+        //   int cx_,cy_,cz_;
+        //   grid->get_safe_cell_coordinates(cx_,cy_,cz_,xavg,yavg,zavg);
+        //   assert_eq(cx,cx_);
+        //   assert_eq(cy,cy_);
+        //   assert_eq(cz,cz_);
+        //}
 
         pfloat Exl = 0.0;
         pfloat Eyl = 0.0;
@@ -886,7 +816,6 @@ void Particles3D::mover_PC_AoS_vec(
         pfloat Bxl = 0.0;
         pfloat Byl = 0.0;
         pfloat Bzl = 0.0;
-
         for(int c=0; c<8; c++)
         {
           Bxl += weights[c] * field_components[c][0];
diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index a3ac877d..7183eb16 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -1044,7 +1044,7 @@ void Particles3Dcomm::sort_particles_serial_AoS(
       const SpeciesParticle& pcl = get_pcl(pidx);
       // get the cell indices of the particle
       int cx,cy,cz;
-      get_safe_cell_for_pos(cx,cy,cz,pcl.get_x(),pcl.get_y(),pcl.get_z());
+      grid->get_safe_cell_coordinates(cx,cy,cz,pcl.get_x(),pcl.get_y(),pcl.get_z());
 
       // increment the number of particles in bucket of this particle
       (*numpcls_in_bucket)[cx][cy][cz]++;
@@ -1070,7 +1070,7 @@ void Particles3Dcomm::sort_particles_serial_AoS(
       const SpeciesParticle& pcl = get_pcl(pidx);
       // get the cell indices of the particle
       int cx,cy,cz;
-      get_safe_cell_for_pos(cx,cy,cz,pcl.get_x(),pcl.get_y(),pcl.get_z());
+      grid->get_safe_cell_coordinates(cx,cy,cz,pcl.get_x(),pcl.get_y(),pcl.get_z());
 
       // compute where the data should go
       const int numpcls_now = (*numpcls_in_bucket_now)[cx][cy][cz]++;
@@ -1132,7 +1132,7 @@ void Particles3Dcomm::sort_particles_serial_SoA(
       // get the cell indices of the particle
       //
       int cx,cy,cz;
-      get_safe_cell_for_pos(cx,cy,cz,x[pidx],y[pidx],z[pidx]);
+      grid->get_safe_cell_coordinates(cx,cy,cz,x[pidx],y[pidx],z[pidx]);
       //
       // is it better just to recompute this?
       //
@@ -1165,7 +1165,7 @@ void Particles3Dcomm::sort_particles_serial_SoA(
       // get the cell indices of the particle
       //
       int cx,cy,cz;
-      get_safe_cell_for_pos(cx,cy,cz,x[pidx],y[pidx],z[pidx]);
+      grid->get_safe_cell_coordinates(cx,cy,cz,x[pidx],y[pidx],z[pidx]);
       //
       //cx = xcell[pidx];
       //cy = ycell[pidx];
@@ -1231,7 +1231,7 @@ void Particles3Dcomm::sort_particles_serial_SoA(
           // confirm that particle is in correct cell
           {
             int cx_,cy_,cz_;
-            get_safe_cell_for_pos(cx_,cy_,cz_,x[pidx],y[pidx],z[pidx]);
+            grid->get_safe_cell_coordinates(cx_,cy_,cz_,x[pidx],y[pidx],z[pidx]);
             if((cx_!=cx)
              ||(cy_!=cy)
              ||(cz_!=cz))
@@ -1285,7 +1285,7 @@ void Particles3Dcomm::sort_particles_serial_SoA_by_xavg(
       // get the cell indices of the particle
       //
       int cx,cy,cz;
-      get_safe_cell_for_pos(cx,cy,cz,xavg[pidx],yavg[pidx],zavg[pidx]);
+      grid->get_safe_cell_coordinates(cx,cy,cz,xavg[pidx],yavg[pidx],zavg[pidx]);
       //
       // is it better just to recompute this?
       //
@@ -1318,7 +1318,7 @@ void Particles3Dcomm::sort_particles_serial_SoA_by_xavg(
       // get the cell indices of the particle
       //
       int cx,cy,cz;
-      get_safe_cell_for_pos(cx,cy,cz,xavg[pidx],yavg[pidx],zavg[pidx]);
+      grid->get_safe_cell_coordinates(cx,cy,cz,xavg[pidx],yavg[pidx],zavg[pidx]);
       //
       //cx = xcell[pidx];
       //cy = ycell[pidx];
@@ -1396,7 +1396,7 @@ void Particles3Dcomm::sort_particles_serial_SoA_by_xavg(
         if(true)
         {
           int cx_,cy_,cz_;
-          get_safe_cell_for_pos(cx_,cy_,cz_,xavg[pidx],yavg[pidx],zavg[pidx]);
+          grid->get_safe_cell_coordinates(cx_,cy_,cz_,xavg[pidx],yavg[pidx],zavg[pidx]);
           if((cx_!=cx)
            ||(cy_!=cy)
            ||(cz_!=cz))

From fe2a01ab5233ec36963ea3d97d446ec93ce987cc Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Sat, 22 Feb 2014 11:20:50 +0100
Subject: [PATCH 101/118] created utility/Basic.cpp for methods that need not
 be inline

---
 include/Basic.h   | 550 ++++++----------------------------------------
 utility/Basic.cpp | 525 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 592 insertions(+), 483 deletions(-)
 create mode 100644 utility/Basic.cpp

diff --git a/include/Basic.h b/include/Basic.h
index 2dd7da6c..09a0d54c 100644
--- a/include/Basic.h
+++ b/include/Basic.h
@@ -5,18 +5,9 @@ developers: Stefano Markidis, Giovanni Lapenta
  ********************************************************************************************/
 #ifndef Basic_H
 #define Basic_H
-
-#include <iostream>
+#include "arraysfwd.h"
 #include <math.h>
 
-#include "MPIdata.h"
-#include "EllipticF.h"
-#include "Alloc.h"
-
-using std::cout;
-using std::endl;
-
-
 /**
  *  
  * Basic operations defined. This library provides methods to calculate:
@@ -39,400 +30,120 @@ using std::endl;
 
 
 /** method to calculate the parallel dot product with vect1, vect2 having the ghost cells*/
-inline double dotP(double *vect1, double *vect2, int n) {
-  double result = 0;
-  double local_result = 0;
-  for (register int i = 0; i < n; i++)
-    local_result += vect1[i] * vect2[i];
-  MPI_Allreduce(&local_result, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  return (result);
-
-}
+double dotP(double *vect1, double *vect2, int n);
 /** method to calculate dot product */
-inline double dot(double *vect1, double *vect2, int n) {
-  double result = 0;
-  for (int i = 0; i < n; i++)
-    result += vect1[i] * vect2[i];
-  return (result);
-}
+double dot(double *vect1, double *vect2, int n);
 /** method to calculate the square norm of a vector */
-inline double norm2(double **vect, int nx, int ny) {
-  double result = 0;
-  for (int i = 0; i < nx; i++)
-    for (int j = 0; j < ny; j++)
-      result += vect[i][j] * vect[i][j];
-  return (result);
-}
+double norm2(double **vect, int nx, int ny);
 /** method to calculate the square norm of a vector */
-inline double norm2(const arr3_double vect, int nx, int ny) {
-  double result = 0;
-  for (int i = 0; i < nx; i++)
-    for (int j = 0; j < ny; j++)
-      result += vect.get(i,j,0) * vect.get(i,j,0);
-  return (result);
-}
+double norm2(const arr3_double vect, int nx, int ny);
 /** method to calculate the square norm of a vector */
-inline double norm2(double *vect, int nx) {
-  double result = 0;
-  for (int i = 0; i < nx; i++)
-    result += vect[i] * vect[i];
-  return (result);
-}
-
-
-
+double norm2(double *vect, int nx);
 /** method to calculate the parallel dot product */
-inline double norm2P(const arr3_double vect, int nx, int ny, int nz) {
-  double result = 0;
-  double local_result = 0;
-  for (int i = 0; i < nx; i++)
-    for (int j = 0; j < ny; j++)
-      for (int k = 0; k < nz; k++)
-        local_result += vect.get(i,j,k) * vect.get(i,j,k);
-
-  MPI_Allreduce(&local_result, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  return (result);
-}
+double norm2P(const arr3_double vect, int nx, int ny, int nz);
 /** method to calculate the parallel norm of a vector on different processors with the ghost cell */
-inline double norm2P(double *vect, int n) {
-  double result = 0;
-  double local_result = 0;
-  for (int i = 0; i < n; i++)
-    local_result += vect[i] * vect[i];
-  MPI_Allreduce(&local_result, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  return (result);
-}
+double norm2P(double *vect, int n);
 /** method to calculate the parallel norm of a vector on different processors with the gost cell*/
-inline double normP(double *vect, int n) {
-  double result = 0.0;
-  double local_result = 0.0;
-  for (register int i = 0; i < n; i++)
-    local_result += vect[i] * vect[i];
-
-
-  MPI_Allreduce(&local_result, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-
-  return (sqrt(result));
-
-}
+double normP(double *vect, int n);
 /** method to calculate the difference of two vectors*/
-inline void sub(double *res, double *vect1, double *vect2, int n) {
-  for (register int i = 0; i < n; i++)
-    res[i] = vect1[i] - vect2[i];
-}
+void sub(double *res, double *vect1, double *vect2, int n);
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(double *vect1, double *vect2, int n) {
-  for (register int i = 0; i < n; i++)
-    vect1[i] += vect2[i];
-
-
-}
+void sum(double *vect1, double *vect2, int n);
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) += vect2.get(i,j,k);
-}
-
+void sum(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz);
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(arr3_double vect1, const arr3_double vect2, int nx, int ny) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect1.fetch(i,j,0) += vect2.get(i,j,0);
-}
-
+void sum(arr3_double vect1, const arr3_double vect2, int nx, int ny);
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(arr3_double vect1, const arr4_double vect2, int nx, int ny, int nz, int ns) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) += vect2.get(ns,i,j,k);
-}
-
+void sum(arr3_double vect1, const arr4_double vect2, int nx, int ny, int nz, int ns);
 /** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
-inline void sum(arr3_double vect1, const arr4_double vect2, int nx, int ny, int ns) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect1.fetch(i,j,0) += vect2.get(ns,i,j,0);
-}
+void sum(arr3_double vect1, const arr4_double vect2, int nx, int ny, int ns);
 /** method to calculate the subtraction of two vectors vector1 = vector1 - vector2*/
-inline void sub(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) -= vect2.get(i,j,k);
-}
-
+void sub(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz);
 /** method to calculate the subtraction of two vectors vector1 = vector1 - vector2*/
-inline void sub(arr3_double vect1, const arr3_double vect2, int nx, int ny) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect1.fetch(i,j,0) -= vect2.get(i,j,0);
-}
-
-
+void sub(arr3_double vect1, const arr3_double vect2, int nx, int ny);
 /** method to sum 4 vectors vector1 = alfa*vector1 + beta*vector2 + gamma*vector3 + delta*vector4 */
-inline void sum4(arr3_double vect1, double alfa, const arr3_double vect2, double beta, const arr3_double vect3, double gamma, const arr3_double vect4, double delta, const arr3_double vect5, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) = alfa * (vect2.get(i,j,k) + beta * vect3.get(i,j,k) + gamma * vect4.get(i,j,k) + delta * vect5.get(i,j,k));
-
-}
+void sum4(arr3_double vect1, double alfa, const arr3_double vect2, double beta, const arr3_double vect3, double gamma, const arr3_double vect4, double delta, const arr3_double vect5, int nx, int ny, int nz);
 /** method to calculate the scalar-vector product */
-inline void scale(double *vect, double alfa, int n) {
-  for (register int i = 0; i < n; i++)
-    vect[i] *= alfa;
-}
-
+void scale(double *vect, double alfa, int n);
 /** method to calculate the scalar-vector product */
-inline void scale(arr3_double vect, double alfa, int nx, int ny) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect.fetch(i,j,0) *= alfa;
-}
-
-
+void scale(arr3_double vect, double alfa, int nx, int ny);
 /** method to calculate the scalar-vector product */
-inline void scale(arr3_double vect, double alfa, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect.fetch(i,j,k) *= alfa;
-}
-/** method to calculate the scalar product */
-inline void scale(double vect[][2][2], double alfa, int nx, int ny, int nz) {
-  for (int i = 0; i < nx; i++)
-    for (int j = 0; j < ny; j++)
-      for (int k = 0; k < nz; k++)
-        vect[i][j][k] *= alfa;
-}
+void scale(arr3_double vect, double alfa, int nx, int ny, int nz);
+///** method to calculate the scalar product */
+//inline void scale(double vect[][2][2], double alfa, int nx, int ny, int nz) {
+//  for (int i = 0; i < nx; i++)
+//    for (int j = 0; j < ny; j++)
+//      for (int k = 0; k < nz; k++)
+//        vect[i][j][k] *= alfa;
+//}
 /** method to calculate the scalar-vector product */
-inline void scale(arr3_double vect1, const arr3_double vect2, double alfa, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) = vect2.get(i,j,k) * alfa;
-}
-
+void scale(arr3_double vect1, const arr3_double vect2, double alfa, int nx, int ny, int nz);
 /** method to calculate the scalar-vector product */
-inline void scale(arr3_double vect1, const arr3_double vect2, double alfa, int nx, int ny) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect1.fetch(i,j,0) = vect2.get(i,j,0) * alfa;
-}
-
+void scale(arr3_double vect1, const arr3_double vect2, double alfa, int nx, int ny);
 /** method to calculate the scalar-vector product */
-inline void scale(double *vect1, double *vect2, double alfa, int n) {
-  for (register int i = 0; i < n; i++)
-    vect1[i] = vect2[i] * alfa;
-}
-
+void scale(double *vect1, double *vect2, double alfa, int n);
 /** method to calculate vector1 = vector1 + alfa*vector2   */
-inline void addscale(double alfa, arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) = vect1.get(i,j,k) + alfa * vect2.get(i,j,k);
-}
+void addscale(double alfa, arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz);
 /** add scale for weights */
-inline void addscale(double alfa, double vect1[][2][2], double vect2[][2][2], int nx, int ny, int nz) {
-  for (int i = 0; i < nx; i++)
-    for (int j = 0; j < ny; j++)
-      for (int k = 0; k < nz; k++)
-        vect1[i][j][k] = vect1[i][j][k] + alfa * vect2[i][j][k];
-
-}
+void addscale(double alfa, double vect1[][2][2], double vect2[][2][2], int nx, int ny, int nz);
 /** method to calculate vector1 = vector1 + alfa*vector2   */
-inline void addscale(double alfa, arr3_double vect1, const arr3_double vect2, int nx, int ny) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect1.fetch(i,j,0) += alfa * vect2.get(i,j,0);
-}
+void addscale(double alfa, arr3_double vect1, const arr3_double vect2, int nx, int ny);
 /** method to calculate vector1 = vector1 + alfa*vector2   */
-inline void addscale(double alfa, double *vect1, double *vect2, int n) {
-  for (register int i = 0; i < n; i++)
-    vect1[i] += alfa * vect2[i];
-
-}
+void addscale(double alfa, double *vect1, double *vect2, int n);
 /** method to calculate vector1 = beta*vector1 + alfa*vector2   */
-inline void addscale(double alfa, double beta, double *vect1, double *vect2, int n) {
-  for (register int i = 0; i < n; i++)
-    vect1[i] = vect1[i] * beta + alfa * vect2[i];
-
-}
+void addscale(double alfa, double beta, double *vect1, double *vect2, int n);
 /** method to calculate vector1 = beta*vector1 + alfa*vector2 */
-inline void addscale(double alfa, double beta, arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
-
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++) {
-        vect1.fetch(i,j,k) = beta * vect1.get(i,j,k) + alfa * vect2.get(i,j,k);
-      }
-
-}
+void addscale(double alfa, double beta, arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz);
 /** method to calculate vector1 = beta*vector1 + alfa*vector2 */
-inline void addscale(double alfa, double beta, arr3_double vect1, const arr3_double vect2, int nx, int ny) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect1.fetch(i,j,0) = beta * vect1.get(i,j,0) + alfa * vect2.get(i,j,0);
-
-}
-
-
+void addscale(double alfa, double beta, arr3_double vect1, const arr3_double vect2, int nx, int ny);
 /** method to calculate vector1 = alfa*vector2 + beta*vector3 */
-inline void scaleandsum(arr3_double vect1, double alfa, double beta, const arr3_double vect2, const arr3_double vect3, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) = alfa * vect2.get(i,j,k) + beta * vect3.get(i,j,k);
-}
+void scaleandsum(arr3_double vect1, double alfa, double beta, const arr3_double vect2, const arr3_double vect3, int nx, int ny, int nz);
 /** method to calculate vector1 = alfa*vector2 + beta*vector3 with vector2 depending on species*/
-inline void scaleandsum(arr3_double vect1, double alfa, double beta, const arr4_double vect2, const arr3_double vect3, int ns, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) = alfa * vect2.get(ns,i,j,k) + beta * vect3.get(i,j,k);
-}
+void scaleandsum(arr3_double vect1, double alfa, double beta, const arr4_double vect2, const arr3_double vect3, int ns, int nx, int ny, int nz);
 /** method to calculate vector1 = alfa*vector2*vector3 with vector2 depending on species*/
-inline void prod(arr3_double vect1, double alfa, const arr4_double vect2, int ns, const arr3_double vect3, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) = alfa * vect2.get(ns,i,j,k) * vect3.get(i,j,k);
-
-}
+void prod(arr3_double vect1, double alfa, const arr4_double vect2, int ns, const arr3_double vect3, int nx, int ny, int nz);
 /** method to calculate vect1 = vect2/alfa */
-inline void div(arr3_double vect1, double alfa, const arr3_double vect2, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) = vect2.get(i,j,k) / alfa;
-
-}
-inline void prod6(arr3_double vect1, const arr3_double vect2, const arr3_double vect3, const arr3_double vect4, const arr3_double vect5, const arr3_double vect6, const arr3_double vect7, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) = vect2.get(i,j,k) * vect3.get(i,j,k) + vect4.get(i,j,k) * vect5.get(i,j,k) + vect6.get(i,j,k) * vect7.get(i,j,k);
-}
+void div(arr3_double vect1, double alfa, const arr3_double vect2, int nx, int ny, int nz);
+void prod6(arr3_double vect1, const arr3_double vect2, const arr3_double vect3, const arr3_double vect4, const arr3_double vect5, const arr3_double vect6, const arr3_double vect7, int nx, int ny, int nz);
 /** method used for calculating PI */
-inline void proddiv(arr3_double vect1, const arr3_double vect2, double alfa, const arr3_double vect3, const arr3_double vect4, const arr3_double vect5, const arr3_double vect6, double beta, const arr3_double vect7, const arr3_double vect8, double gamma, const arr3_double vect9, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) = (vect2.get(i,j,k) + alfa * (vect3.get(i,j,k) * vect4.get(i,j,k) - vect5.get(i,j,k) * vect6.get(i,j,k)) + beta * vect7.get(i,j,k) * vect8.get(i,j,k)) / (1 + gamma * vect9.get(i,j,k));
-
-  // questo mi convince veramente poco!!!!!!!!!!!!!! CAZZO!!!!!!!!!!!!!!!!!!
-  // ***vect1++ = (***vect2++ + alfa*((***vect3++)*(***vect4++) - (***vect5++)*(***vect6++)) + beta*(***vect7++)*(***vect8++))/(1+gamma*(***vect9++));
-}
+void proddiv(arr3_double vect1, const arr3_double vect2, double alfa, const arr3_double vect3, const arr3_double vect4, const arr3_double vect5, const arr3_double vect6, double beta, const arr3_double vect7, const arr3_double vect8, double gamma, const arr3_double vect9, int nx, int ny, int nz);
 /** method to calculate the opposite of a vector */
-inline void neg(arr3_double vect, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect.fetch(i,j,k) = -vect.get(i,j,k);
-}
-
+void neg(arr3_double vect, int nx, int ny, int nz);
 /** method to calculate the opposite of a vector */
-inline void neg(arr3_double vect, int nx, int ny) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect.fetch(i,j,0) = -vect.get(i,j,0);
-}
+void neg(arr3_double vect, int nx, int ny);
 /** method to calculate the opposite of a vector */
-inline void neg(arr3_double vect, int nx) {
-  for (register int i = 0; i < nx; i++)
-    vect.fetch(i,0,0) = -vect.get(i,0,0);
-}
+void neg(arr3_double vect, int nx);
 /** method to calculate the opposite of a vector */
-inline void neg(double *vect, int n) {
-  for (register int i = 0; i < n; i++)
-    vect[i] = -vect[i];
-
-
-}
+void neg(double *vect, int n);
 /** method to set equal two vectors */
-inline void eq(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(i,j,k) = vect2.get(i,j,k);
-
-}
+void eq(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz);
 /** method to set equal two vectors */
-inline void eq(arr3_double vect1, const arr3_double vect2, int nx, int ny) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect1.fetch(i,j,0) = vect2.get(i,j,0);
-
-}
-
+void eq(arr3_double vect1, const arr3_double vect2, int nx, int ny);
 /** method to set equal two vectors */
-inline void eq(arr4_double vect1, const arr3_double vect2, int nx, int ny, int is) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect1.fetch(is,i,j,0) = vect2.get(i,j,0);
-
-}
+void eq(arr4_double vect1, const arr3_double vect2, int nx, int ny, int is);
 /** method to set equal two vectors */
-inline void eq(arr4_double vect1, const arr3_double vect2, int nx, int ny, int nz, int is) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect1.fetch(is,i,j,k) = vect2.get(i,j,k);
-
-}
-
-inline void eq(double *vect1, double *vect2, int n) {
+void eq(arr4_double vect1, const arr3_double vect2, int nx, int ny, int nz, int is);
+inline void eq(double *vect1, double *vect2, int n){ // element-wise copy: vect1[i] = vect2[i] for i in [0, n)
   for (register int i = 0; i < n; i++)
     vect1[i] = vect2[i];
 }
 /** method to set a vector to a Value */
-inline void eqValue(double value, arr3_double vect, int nx, int ny, int nz) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      for (register int k = 0; k < nz; k++)
-        vect.fetch(i,j,k) = value;
-
-}
-inline void eqValue(double value, double vect[][2][2], int nx, int ny, int nz) {
-  for (int i = 0; i < nx; i++)
-    for (int j = 0; j < ny; j++)
-      for (int k = 0; k < nz; k++)
-        vect[i][j][k] = value;
-
-}
+void eqValue(double value, arr3_double vect, int nx, int ny, int nz);
+//void eqValue(double value, double vect[][2][2], int nx, int ny, int nz);
 /** method to set a vector to a Value */
-inline void eqValue(double value, arr3_double vect, int nx, int ny) {
-  for (register int i = 0; i < nx; i++)
-    for (register int j = 0; j < ny; j++)
-      vect.fetch(i,j,0) = value;
-
-}
+void eqValue(double value, arr3_double vect, int nx, int ny);
 /** method to set a vector to a Value */
-inline void eqValue(double value, arr3_double vect, int nx) {
-  for (register int i = 0; i < nx; i++)
-    vect.fetch(i,0,0) = value;
-
-}
+void eqValue(double value, arr3_double vect, int nx);
 /** method to set a vector to a Value */
-inline void eqValue(double value, double *vect, int n) {
-  for (register int i = 0; i < n; i++)
-    vect[i] = value;
-}
+void eqValue(double value, double *vect, int n);
 /** method to put a column in a matrix 2D */
-inline void putColumn(double **Matrix, double *vect, int column, int n) {
-  for (int i = 0; i < n; i++)
-    Matrix[i][column] = vect[i];
-
-}
+void putColumn(double **Matrix, double *vect, int column, int n);
 /** method to get a column in a matrix 2D */
-inline void getColumn(double *vect, double **Matrix, int column, int n) {
-  for (int i = 0; i < n; i++)
-    vect[i] = Matrix[i][column];
-}
+void getColumn(double *vect, double **Matrix, int column, int n);
+
+/** method to get rid of the ghost cells */
+inline void getRidGhost(double **out, double **in, int nx, int ny);
+
 /** RIFAI QUESTA PARTE questo e' la tomba delle performance*/
 inline void MODULO(double *x, double L) {
   *x = *x - floor(*x / L) * L;
@@ -456,12 +167,6 @@ inline double eps() {
   eps = num * 2;
   return (eps);
 }
-/** method to get rid of the ghost cells */
-inline void getRidGhost(double **out, double **in, int nx, int ny) {
-  for (register int i = 1; i < nx - 1; i++)
-    for (register int j = 1; j < ny - 1; j++)
-      out[i - 1][j - 1] = in[i][j];
-}
 /** method to calculate cross product of two vectors C= A x B */
 inline void cross_product(double a1, double a2, double a3, double b1, double b2, double b3, double *c){
   c[0] = a2 * b3 - a3 * b2;
@@ -469,129 +174,8 @@ inline void cross_product(double a1, double a2, double a3, double b1, double b2,
   c[2] = a1 * b2 - a2 * b1;
 }
 
-inline void loopX(double *b, double z, double x, double y, double a, double zc, double xc, double yc, double m){
-
-  double r = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc)+(z-zc)*(z-zc));
-  double theta = acos((z-zc+1e-10)/(r+1e-10));
-  double phi = atan2(y-yc,x-xc);
-  //double Rho = r * sin(theta);
-  double Rho = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc));
-
-  double Alpha = Rho/a;
-  double Beta = (z-zc)/a;
-  double Gamma = (z-zc+1e-10)/(Rho+1e-10);
-
-  double Q = ((1 + Alpha)*(1 + Alpha) + Beta*Beta);
-  double k = sqrt(4*Alpha/Q);
-  double B0 = m / (2*a); //m * (C_LIGHT * MU0)/(2*a*a*a*M_PI);
-
-  int err = 0;
-
-  double Bz = B0*(EllipticE(k,err)*(1-Alpha*Alpha-Beta*Beta)/(Q-4*Alpha)+EllipticF(k,err))/(M_PI*sqrt(Q));
-  double BRho = B0*Gamma*(EllipticE(k,err)*(1+Alpha*Alpha+Beta*Beta)/(Q-4*Alpha)-EllipticF(k,err))/(M_PI*sqrt(Q));
-
-  if (err)
-    cout << "Err came back :" << err << endl;
-
-  if ( isnan(BRho) )
-    BRho = 0;
-  if ( isnan(Bz) )
-    Bz = 0;
-
-  double Bx = BRho * cos(phi);
-  double By = BRho * sin(phi);
-
-  //for debugging
-  /*cout << "\n\nAt (" << x << "," << y << "," << z << "), the field is :" << endl;
-    cout << "Bx: " << Bx << " T" << endl;
-    cout << "By: " << By << " T" << endl;
-    cout << "Bz: " << Bz << " T" << endl;
-    cout << "BRho: " << BRho << " T" << endl;*/
-
-  b[1] = Bx;
-  b[2] = By;
-  b[0] = Bz;
-}
-
-inline void loopY(double *b, double y, double z, double x, double a, double yc, double zc, double xc, double m){
-
-  double r = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc)+(z-zc)*(z-zc));
-  double theta = acos((z-zc+1e-10)/(r+1e-10));
-  double phi = atan2(y-yc,x-xc);
-  //double Rho = r * sin(theta);
-  double Rho = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc));
-
-  double Alpha = Rho/a;
-  double Beta = (z-zc)/a;
-  double Gamma = (z-zc+1e-10)/(Rho+1e-10);
-
-  double Q = ((1 + Alpha)*(1 + Alpha) + Beta*Beta);
-  double k = sqrt(4*Alpha/Q);
-  double B0 = m / (2*a); //m * (C_LIGHT * MU0)/(2*a*a*a*M_PI);
-
-  int err = 0;
-
-  double Bz = B0*(EllipticE(k,err)*(1-Alpha*Alpha-Beta*Beta)/(Q-4*Alpha)+EllipticF(k,err))/(M_PI*sqrt(Q));
-  double BRho = B0*Gamma*(EllipticE(k,err)*(1+Alpha*Alpha+Beta*Beta)/(Q-4*Alpha)-EllipticF(k,err))/(M_PI*sqrt(Q));
-
-  if (err)
-    cout << "Err came back :" << err << endl;
-
-  if ( isnan(BRho) )
-    BRho = 0;
-  if ( isnan(Bz) )
-    Bz = 0;
-
-  double Bx = BRho * cos(phi);
-  double By = BRho * sin(phi);
-
-  //for debugging
-  /*cout << "\n\nAt (" << x << "," << y << "," << z << "), the field is :" << endl;
-    cout << "Bx: " << Bx << " T" << endl;
-    cout << "By: " << By << " T" << endl;
-    cout << "Bz: " << Bz << " T" << endl;
-    cout << "BRho: " << BRho << " T" << endl;*/
-
-  b[2] = Bx;
-  b[0] = By;
-  b[1] = Bz;
-}
-
-inline void loopZ(double *b, double x, double y, double z, double a, double xc, double yc, double zc, double m){
-
-  double r = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc)+(z-zc)*(z-zc));
-  double theta = acos((z-zc+1e-10)/(r+1e-10));
-  double phi = atan2(y-yc,x-xc);
-
-  double Rho = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc));
-
-  double Alpha = Rho/a;
-  double Beta = (z-zc)/a;
-  double Gamma = (z-zc+1e-10)/(Rho+1e-10);
-
-  double Q = ((1 + Alpha)*(1 + Alpha) + Beta*Beta);
-  double k = sqrt(4*Alpha/Q);
-  double B0 = m / (2*a); //m * (C_LIGHT * MU0)/(2*a*a*a*M_PI);
-
-  int err = 0;
-
-  double Bz = B0*(EllipticE(k,err)*(1-Alpha*Alpha-Beta*Beta)/(Q-4*Alpha)+EllipticF(k,err))/(M_PI*sqrt(Q));
-  double BRho = B0*Gamma*(EllipticE(k,err)*(1+Alpha*Alpha+Beta*Beta)/(Q-4*Alpha)-EllipticF(k,err))/(M_PI*sqrt(Q));
-
-  if (err)
-    cout << "Err came back :" << err << endl;
-
-  if ( isnan(BRho) )
-    BRho = 0;
-  if ( isnan(Bz) )
-    Bz = 0;
-
-  double Bx = BRho * cos(phi);
-  double By = BRho * sin(phi);
-
-  b[0] = Bx;
-  b[1] = By;
-  b[2] = Bz;
-}
+void loopX(double *b, double z, double x, double y, double a, double zc, double xc, double yc, double m);
+void loopY(double *b, double y, double z, double x, double a, double yc, double zc, double xc, double m);
+void loopZ(double *b, double x, double y, double z, double a, double xc, double yc, double zc, double m);
 
 #endif
diff --git a/utility/Basic.cpp b/utility/Basic.cpp
new file mode 100644
index 00000000..7725b680
--- /dev/null
+++ b/utility/Basic.cpp
@@ -0,0 +1,525 @@
+#include "mpi.h"
+#include "Basic.h"
+
+#include <errors.h>
+
+#include "MPIdata.h"
+#include "EllipticF.h"
+#include "Alloc.h"
+
+/** method to calculate the parallel dot product with vect1, vect2 having the ghost cells*/
+double dotP(double *vect1, double *vect2, int n) {
+  double result = 0;
+  double local_result = 0;
+  for (register int i = 0; i < n; i++)
+    local_result += vect1[i] * vect2[i];
+  MPI_Allreduce(&local_result, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  return (result);
+
+}
+/** method to calculate dot product */
+double dot(double *vect1, double *vect2, int n) {
+  double result = 0;
+  for (int i = 0; i < n; i++)
+    result += vect1[i] * vect2[i];
+  return (result);
+}
+/** method to calculate the square norm of a vector */
+double norm2(double **vect, int nx, int ny) {
+  double result = 0;
+  for (int i = 0; i < nx; i++)
+    for (int j = 0; j < ny; j++)
+      result += vect[i][j] * vect[i][j];
+  return (result);
+}
+/** method to calculate the square norm of a vector */
+double norm2(const arr3_double vect, int nx, int ny) {
+  double result = 0;
+  for (int i = 0; i < nx; i++)
+    for (int j = 0; j < ny; j++)
+      result += vect.get(i,j,0) * vect.get(i,j,0);
+  return (result);
+}
+/** method to calculate the square norm of a vector */
+double norm2(double *vect, int nx) {
+  double result = 0;
+  for (int i = 0; i < nx; i++)
+    result += vect[i] * vect[i];
+  return (result);
+}
+
+
+
+/** method to calculate the parallel square norm of a 3D array (summed over all processes) */
+double norm2P(const arr3_double vect, int nx, int ny, int nz) {
+  double result = 0;
+  double local_result = 0;
+  for (int i = 0; i < nx; i++)
+    for (int j = 0; j < ny; j++)
+      for (int k = 0; k < nz; k++)
+        local_result += vect.get(i,j,k) * vect.get(i,j,k);
+
+  MPI_Allreduce(&local_result, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  return (result);
+}
+/** method to calculate the parallel square norm of a vector on different processors with the ghost cell */
+double norm2P(double *vect, int n) {
+  double result = 0;
+  double local_result = 0;
+  for (int i = 0; i < n; i++)
+    local_result += vect[i] * vect[i];
+  MPI_Allreduce(&local_result, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  return (result);
+}
+/** method to calculate the parallel norm of a vector on different processors with the ghost cell */
+double normP(double *vect, int n) {
+  double result = 0.0;
+  double local_result = 0.0;
+  for (register int i = 0; i < n; i++)
+    local_result += vect[i] * vect[i];
+
+
+  MPI_Allreduce(&local_result, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  return (sqrt(result));
+
+}
+/** method to calculate the difference of two vectors*/
+void sub(double *res, double *vect1, double *vect2, int n) {
+  for (register int i = 0; i < n; i++)
+    res[i] = vect1[i] - vect2[i];
+}
+/** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
+void sum(double *vect1, double *vect2, int n) {
+  for (register int i = 0; i < n; i++)
+    vect1[i] += vect2[i];
+
+
+}
+/** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
+void sum(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) += vect2.get(i,j,k);
+}
+
+/** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
+void sum(arr3_double vect1, const arr3_double vect2, int nx, int ny) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect1.fetch(i,j,0) += vect2.get(i,j,0);
+}
+
+/** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
+void sum(arr3_double vect1, const arr4_double vect2, int nx, int ny, int nz, int ns) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) += vect2.get(ns,i,j,k);
+}
+
+/** method to calculate the sum of two vectors vector1 = vector1 + vector2*/
+void sum(arr3_double vect1, const arr4_double vect2, int nx, int ny, int ns) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect1.fetch(i,j,0) += vect2.get(ns,i,j,0);
+}
+/** method to calculate the subtraction of two vectors vector1 = vector1 - vector2*/
+void sub(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) -= vect2.get(i,j,k);
+}
+
+/** method to calculate the subtraction of two vectors vector1 = vector1 - vector2*/
+void sub(arr3_double vect1, const arr3_double vect2, int nx, int ny) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect1.fetch(i,j,0) -= vect2.get(i,j,0);
+}
+
+
+/** method to sum 4 vectors: vector1 = alfa*(vector2 + beta*vector3 + gamma*vector4 + delta*vector5) */
+void sum4(arr3_double vect1, double alfa, const arr3_double vect2, double beta, const arr3_double vect3, double gamma, const arr3_double vect4, double delta, const arr3_double vect5, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) = alfa * (vect2.get(i,j,k) + beta * vect3.get(i,j,k) + gamma * vect4.get(i,j,k) + delta * vect5.get(i,j,k));
+
+}
+/** method to calculate the scalar-vector product */
+void scale(double *vect, double alfa, int n) {
+  for (register int i = 0; i < n; i++)
+    vect[i] *= alfa;
+}
+
+/** method to calculate the scalar-vector product */
+void scale(arr3_double vect, double alfa, int nx, int ny) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect.fetch(i,j,0) *= alfa;
+}
+
+
+/** method to calculate the scalar-vector product */
+void scale(arr3_double vect, double alfa, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect.fetch(i,j,k) *= alfa;
+}
+/** method to calculate the scalar-vector product */
+void scale(arr3_double vect1, const arr3_double vect2, double alfa, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) = vect2.get(i,j,k) * alfa;
+}
+
+/** method to calculate the scalar-vector product */
+void scale(arr3_double vect1, const arr3_double vect2, double alfa, int nx, int ny) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect1.fetch(i,j,0) = vect2.get(i,j,0) * alfa;
+}
+
+/** method to calculate the scalar-vector product */
+void scale(double *vect1, double *vect2, double alfa, int n) {
+  for (register int i = 0; i < n; i++)
+    vect1[i] = vect2[i] * alfa;
+}
+
+/** method to calculate vector1 = vector1 + alfa*vector2   */
+void addscale(double alfa, arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) = vect1.get(i,j,k) + alfa * vect2.get(i,j,k);
+}
+/** add scale for weights */
+void addscale(double alfa, double vect1[][2][2], double vect2[][2][2], int nx, int ny, int nz) {
+  for (int i = 0; i < nx; i++)
+    for (int j = 0; j < ny; j++)
+      for (int k = 0; k < nz; k++)
+        vect1[i][j][k] = vect1[i][j][k] + alfa * vect2[i][j][k];
+
+}
+/** method to calculate vector1 = vector1 + alfa*vector2   */
+void addscale(double alfa, arr3_double vect1, const arr3_double vect2, int nx, int ny) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect1.fetch(i,j,0) += alfa * vect2.get(i,j,0);
+}
+/** method to calculate vector1 = vector1 + alfa*vector2   */
+void addscale(double alfa, double *vect1, double *vect2, int n) {
+  for (register int i = 0; i < n; i++)
+    vect1[i] += alfa * vect2[i];
+
+}
+/** method to calculate vector1 = beta*vector1 + alfa*vector2   */
+void addscale(double alfa, double beta, double *vect1, double *vect2, int n) {
+  for (register int i = 0; i < n; i++)
+    vect1[i] = vect1[i] * beta + alfa * vect2[i];
+
+}
+/** method to calculate vector1 = beta*vector1 + alfa*vector2 */
+void addscale(double alfa, double beta, arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
+
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++) {
+        vect1.fetch(i,j,k) = beta * vect1.get(i,j,k) + alfa * vect2.get(i,j,k);
+      }
+
+}
+/** method to calculate vector1 = beta*vector1 + alfa*vector2 */
+void addscale(double alfa, double beta, arr3_double vect1, const arr3_double vect2, int nx, int ny) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect1.fetch(i,j,0) = beta * vect1.get(i,j,0) + alfa * vect2.get(i,j,0);
+
+}
+
+
+/** method to calculate vector1 = alfa*vector2 + beta*vector3 */
+void scaleandsum(arr3_double vect1, double alfa, double beta, const arr3_double vect2, const arr3_double vect3, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) = alfa * vect2.get(i,j,k) + beta * vect3.get(i,j,k);
+}
+/** method to calculate vector1 = alfa*vector2 + beta*vector3 with vector2 depending on species*/
+void scaleandsum(arr3_double vect1, double alfa, double beta, const arr4_double vect2, const arr3_double vect3, int ns, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) = alfa * vect2.get(ns,i,j,k) + beta * vect3.get(i,j,k);
+}
+/** method to calculate vector1 = alfa*vector2*vector3 with vector2 depending on species*/
+void prod(arr3_double vect1, double alfa, const arr4_double vect2, int ns, const arr3_double vect3, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) = alfa * vect2.get(ns,i,j,k) * vect3.get(i,j,k);
+
+}
+/** method to calculate vect1 = vect2/alfa */
+void div(arr3_double vect1, double alfa, const arr3_double vect2, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) = vect2.get(i,j,k) / alfa;
+
+}
+void prod6(arr3_double vect1, const arr3_double vect2, const arr3_double vect3, const arr3_double vect4, const arr3_double vect5, const arr3_double vect6, const arr3_double vect7, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) = vect2.get(i,j,k) * vect3.get(i,j,k) + vect4.get(i,j,k) * vect5.get(i,j,k) + vect6.get(i,j,k) * vect7.get(i,j,k);
+}
+/** method used for calculating PI */
+void proddiv(arr3_double vect1, const arr3_double vect2, double alfa, const arr3_double vect3, const arr3_double vect4, const arr3_double vect5, const arr3_double vect6, double beta, const arr3_double vect7, const arr3_double vect8, double gamma, const arr3_double vect9, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) = (vect2.get(i,j,k) + alfa * (vect3.get(i,j,k) * vect4.get(i,j,k) - vect5.get(i,j,k) * vect6.get(i,j,k)) + beta * vect7.get(i,j,k) * vect8.get(i,j,k)) / (1 + gamma * vect9.get(i,j,k));
+
+  // this really does not convince me at all! (original author's note; needs review)
+  // ***vect1++ = (***vect2++ + alfa*((***vect3++)*(***vect4++) - (***vect5++)*(***vect6++)) + beta*(***vect7++)*(***vect8++))/(1+gamma*(***vect9++));
+}
+/** method to calculate the opposite of a vector */
+void neg(arr3_double vect, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect.fetch(i,j,k) = -vect.get(i,j,k);
+}
+
+/** method to calculate the opposite of a vector */
+void neg(arr3_double vect, int nx, int ny) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect.fetch(i,j,0) = -vect.get(i,j,0);
+}
+/** method to calculate the opposite of a vector */
+void neg(arr3_double vect, int nx) {
+  for (register int i = 0; i < nx; i++)
+    vect.fetch(i,0,0) = -vect.get(i,0,0);
+}
+/** method to calculate the opposite of a vector */
+void neg(double *vect, int n) {
+  for (register int i = 0; i < n; i++)
+    vect[i] = -vect[i];
+
+
+}
+/** method to set equal two vectors */
+void eq(arr3_double vect1, const arr3_double vect2, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(i,j,k) = vect2.get(i,j,k);
+
+}
+/** method to set equal two vectors */
+void eq(arr3_double vect1, const arr3_double vect2, int nx, int ny) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect1.fetch(i,j,0) = vect2.get(i,j,0);
+
+}
+
+/** method to set equal two vectors */
+void eq(arr4_double vect1, const arr3_double vect2, int nx, int ny, int is) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect1.fetch(is,i,j,0) = vect2.get(i,j,0);
+
+}
+/** method to set equal two vectors */
+void eq(arr4_double vect1, const arr3_double vect2, int nx, int ny, int nz, int is) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect1.fetch(is,i,j,k) = vect2.get(i,j,k);
+
+}
+
+/** method to set a vector to a Value */
+void eqValue(double value, arr3_double vect, int nx, int ny, int nz) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      for (register int k = 0; k < nz; k++)
+        vect.fetch(i,j,k) = value;
+
+}
+//void eqValue(double value, double vect[][2][2], int nx, int ny, int nz) {
+//  for (int i = 0; i < nx; i++)
+//    for (int j = 0; j < ny; j++)
+//      for (int k = 0; k < nz; k++)
+//        vect[i][j][k] = value;
+//
+//}
+/** method to set a vector to a Value */
+void eqValue(double value, arr3_double vect, int nx, int ny) {
+  for (register int i = 0; i < nx; i++)
+    for (register int j = 0; j < ny; j++)
+      vect.fetch(i,j,0) = value;
+
+}
+/** method to set a vector to a Value */
+void eqValue(double value, arr3_double vect, int nx) {
+  for (register int i = 0; i < nx; i++)
+    vect.fetch(i,0,0) = value;
+
+}
+/** method to set a vector to a Value */
+void eqValue(double value, double *vect, int n) {
+  for (register int i = 0; i < n; i++)
+    vect[i] = value;
+}
+/** method to put a column in a matrix 2D */
+void putColumn(double **Matrix, double *vect, int column, int n) {
+  for (int i = 0; i < n; i++)
+    Matrix[i][column] = vect[i];
+
+}
+/** method to get a column in a matrix 2D */
+void getColumn(double *vect, double **Matrix, int column, int n) {
+  for (int i = 0; i < n; i++)
+    vect[i] = Matrix[i][column];
+}
+/** method to get rid of the ghost cells */
+void getRidGhost(double **out, double **in, int nx, int ny) {
+  for (register int i = 1; i < nx - 1; i++)
+    for (register int j = 1; j < ny - 1; j++)
+      out[i - 1][j - 1] = in[i][j];
+}
+
+void loopX(double *b, double z, double x, double y, double a, double zc, double xc, double yc, double m){
+
+  double r = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc)+(z-zc)*(z-zc));
+  double theta = acos((z-zc+1e-10)/(r+1e-10));
+  double phi = atan2(y-yc,x-xc);
+  //double Rho = r * sin(theta);
+  double Rho = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc));
+
+  double Alpha = Rho/a;
+  double Beta = (z-zc)/a;
+  double Gamma = (z-zc+1e-10)/(Rho+1e-10);
+
+  double Q = ((1 + Alpha)*(1 + Alpha) + Beta*Beta);
+  double k = sqrt(4*Alpha/Q);
+  double B0 = m / (2*a); //m * (C_LIGHT * MU0)/(2*a*a*a*M_PI);
+
+  int err = 0;
+
+  double Bz = B0*(EllipticE(k,err)*(1-Alpha*Alpha-Beta*Beta)/(Q-4*Alpha)+EllipticF(k,err))/(M_PI*sqrt(Q));
+  double BRho = B0*Gamma*(EllipticE(k,err)*(1+Alpha*Alpha+Beta*Beta)/(Q-4*Alpha)-EllipticF(k,err))/(M_PI*sqrt(Q));
+
+  if (err)
+    eprintf("Err came back :%d", err);
+
+  if ( isnan(BRho) )
+    BRho = 0;
+  if ( isnan(Bz) )
+    Bz = 0;
+
+  double Bx = BRho * cos(phi);
+  double By = BRho * sin(phi);
+
+  //for debugging
+  /*cout << "\n\nAt (" << x << "," << y << "," << z << "), the field is :" << endl;
+    cout << "Bx: " << Bx << " T" << endl;
+    cout << "By: " << By << " T" << endl;
+    cout << "Bz: " << Bz << " T" << endl;
+    cout << "BRho: " << BRho << " T" << endl;*/
+
+  b[1] = Bx;
+  b[2] = By;
+  b[0] = Bz;
+}
+
+void loopY(double *b, double y, double z, double x, double a, double yc, double zc, double xc, double m){
+
+  double r = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc)+(z-zc)*(z-zc));
+  double theta = acos((z-zc+1e-10)/(r+1e-10));
+  double phi = atan2(y-yc,x-xc);
+  //double Rho = r * sin(theta);
+  double Rho = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc));
+
+  double Alpha = Rho/a;
+  double Beta = (z-zc)/a;
+  double Gamma = (z-zc+1e-10)/(Rho+1e-10);
+
+  double Q = ((1 + Alpha)*(1 + Alpha) + Beta*Beta);
+  double k = sqrt(4*Alpha/Q);
+  double B0 = m / (2*a); //m * (C_LIGHT * MU0)/(2*a*a*a*M_PI);
+
+  int err = 0;
+
+  double Bz = B0*(EllipticE(k,err)*(1-Alpha*Alpha-Beta*Beta)/(Q-4*Alpha)+EllipticF(k,err))/(M_PI*sqrt(Q));
+  double BRho = B0*Gamma*(EllipticE(k,err)*(1+Alpha*Alpha+Beta*Beta)/(Q-4*Alpha)-EllipticF(k,err))/(M_PI*sqrt(Q));
+
+  if (err)
+    eprintf("Err came back :%d", err);
+
+  if ( isnan(BRho) )
+    BRho = 0;
+  if ( isnan(Bz) )
+    Bz = 0;
+
+  double Bx = BRho * cos(phi);
+  double By = BRho * sin(phi);
+
+  //for debugging
+  /*cout << "\n\nAt (" << x << "," << y << "," << z << "), the field is :" << endl;
+    cout << "Bx: " << Bx << " T" << endl;
+    cout << "By: " << By << " T" << endl;
+    cout << "Bz: " << Bz << " T" << endl;
+    cout << "BRho: " << BRho << " T" << endl;*/
+
+  b[2] = Bx;
+  b[0] = By;
+  b[1] = Bz;
+}
+
+void loopZ(double *b, double x, double y, double z, double a, double xc, double yc, double zc, double m){
+
+  double r = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc)+(z-zc)*(z-zc));
+  double theta = acos((z-zc+1e-10)/(r+1e-10));
+  double phi = atan2(y-yc,x-xc);
+
+  double Rho = sqrt((x-xc)*(x-xc)+(y-yc)*(y-yc));
+
+  double Alpha = Rho/a;
+  double Beta = (z-zc)/a;
+  double Gamma = (z-zc+1e-10)/(Rho+1e-10);
+
+  double Q = ((1 + Alpha)*(1 + Alpha) + Beta*Beta);
+  double k = sqrt(4*Alpha/Q);
+  double B0 = m / (2*a); //m * (C_LIGHT * MU0)/(2*a*a*a*M_PI);
+
+  int err = 0;
+
+  double Bz = B0*(EllipticE(k,err)*(1-Alpha*Alpha-Beta*Beta)/(Q-4*Alpha)+EllipticF(k,err))/(M_PI*sqrt(Q));
+  double BRho = B0*Gamma*(EllipticE(k,err)*(1+Alpha*Alpha+Beta*Beta)/(Q-4*Alpha)-EllipticF(k,err))/(M_PI*sqrt(Q));
+
+  if (err)
+    eprintf("Err came back :%d", err);
+
+  if ( isnan(BRho) )
+    BRho = 0;
+  if ( isnan(Bz) )
+    Bz = 0;
+
+  double Bx = BRho * cos(phi);
+  double By = BRho * sin(phi);
+
+  b[0] = Bx;
+  b[1] = By;
+  b[2] = Bz;
+}
+

From d3ae3cfe777b4889dc75100aab49d7dbaa1c245b Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 14:51:08 +0100
Subject: [PATCH 102/118] inserted #include "errors.h" forgotten in commit
 fe2a01ab

---
 utility/Basic.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/utility/Basic.cpp b/utility/Basic.cpp
index 7725b680..06ca236d 100644
--- a/utility/Basic.cpp
+++ b/utility/Basic.cpp
@@ -1,11 +1,9 @@
 #include "mpi.h"
+#include "ipicdefs.h"
 #include "Basic.h"
-
-#include <errors.h>
-
-#include "MPIdata.h"
 #include "EllipticF.h"
 #include "Alloc.h"
+#include "errors.h"
 
 /** method to calculate the parallel dot product with vect1, vect2 having the ghost cells*/
 double dotP(double *vect1, double *vect2, int n) {

From fb3473f0088359f43415b28291073e74969cbeed Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 13:48:51 +0100
Subject: [PATCH 103/118] created former_MPI_Barrier macro

---
 include/ipicdefs.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/ipicdefs.h b/include/ipicdefs.h
index 1031f60c..53b22cd8 100644
--- a/include/ipicdefs.h
+++ b/include/ipicdefs.h
@@ -9,6 +9,14 @@
 
 // use precprocessor to remove MPI_Barrier() calls.
 #define MPI_Barrier(args...)
+#define former_MPI_Barrier(args...)
+
+#define ipicMPI_Allreduce(args...) \
+  { \
+    static int count=0; \
+    dprint(count++); \
+    MPI_Allreduce(## args); \
+  }
 
 //#define SINGLE_PRECISION_PCLS
 //

From 091349abc43c8e791f3dfe4afcfbfc4c092ca40f Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 14:35:53 +0100
Subject: [PATCH 104/118] moved MPI_Allreduce calls from .h to .cpp

---
 communication/ComParticles3D.cpp | 18 ++++++++++++++++++
 include/ComParticles3D.h         | 17 ++---------------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/communication/ComParticles3D.cpp b/communication/ComParticles3D.cpp
index f74699ae..e78f0597 100644
--- a/communication/ComParticles3D.cpp
+++ b/communication/ComParticles3D.cpp
@@ -1,5 +1,6 @@
 
 #include "ComParticles3D.h"
+#include "ipicdefs.h"
 
 /** comunicate particles and receive particles to and from 6 processors */
 void communicateParticles(int buffer_size, double *b_Xleft, double *b_Xright, double *b_Yleft, double *b_Yright, double *b_Zleft, double *b_Zright, VirtualTopology3D * vct) {
@@ -10,3 +11,20 @@ void communicateParticles(int buffer_size, double *b_Xleft, double *b_Xright, do
   // DIR Z
   communicateParticlesDIR(buffer_size, vct->getCartesian_rank(), vct->getZright_neighbor_P(), vct->getZleft_neighbor_P(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), b_Zright, b_Zleft);
 }
+
+/** communicate the number of particles that are not in the right domain */
+int reduceNumberParticles(int rightDomain) {
+  int result = 0;
+  former_MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Allreduce(&rightDomain, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  return (result);
+}
+
+/** communicate the maximum number of particles from a domain */
+int reduceMaxNpExiting(int npExitingMax) {
+  int result = 0;
+  former_MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Allreduce(&npExitingMax, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+  return (result);
+}
+
diff --git a/include/ComParticles3D.h b/include/ComParticles3D.h
index 76090d45..44081df6 100644
--- a/include/ComParticles3D.h
+++ b/include/ComParticles3D.h
@@ -9,28 +9,15 @@ developers           : Stefano Markidis, Giovanni Lapenta
 #ifndef ComParticles3D_H
 #define ComParticles3D_H
 
-#include "MPIdata.h"
-#include "ipicdefs.h"
 #include "ComBasic3D.h"
 
 /** comunicate particles and receive particles to and from 6 processors */
 void communicateParticles(int buffer_size, double *b_Xleft, double *b_Xright, double *b_Yleft, double *b_Yright, double *b_Zleft, double *b_Zright, VirtualTopology3D * vct);
 
 /** communicate the number of particles are not in the right domain*/
-inline int reduceNumberParticles(int rightDomain) {
-  int result = 0;
-  MPI_Barrier(MPI_COMM_WORLD);
-  MPI_Allreduce(&rightDomain, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-  return (result);
-}
+int reduceNumberParticles(int rightDomain);
 
 /** communicate the maximum number of particles from a domain */
-inline int reduceMaxNpExiting(int npExitingMax) {
-  int result = 0;
-  MPI_Barrier(MPI_COMM_WORLD);
-  MPI_Allreduce(&npExitingMax, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
-  return (result);
-}
-
+int reduceMaxNpExiting(int npExitingMax);
 
 #endif

From b6639f26d056474ba34ce0dd78021b05ce6c539d Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 14:37:12 +0100
Subject: [PATCH 105/118] commented out unused declarations in MPIdata.h

---
 include/MPIdata.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/MPIdata.h b/include/MPIdata.h
index c4d535ab..36d29bcf 100644
--- a/include/MPIdata.h
+++ b/include/MPIdata.h
@@ -44,7 +44,7 @@ class MPIdata {
   /** print MPI data structure */
   void Print(void);
   /** MPI status during the communication */
-  MPI_Status status;
+  //MPI_Status status;
 public:
   static int get_rank(){return instance().rank;}
   static int get_nprocs(){return instance().nprocs;}
@@ -55,7 +55,7 @@ class MPIdata {
   static int nprocs;
 
   // evidently unused...
-  char *buffer;
-  int buffer_size;
+  //char *buffer;
+  //int buffer_size;
 };
 #endif

From 645af80aef4c6840edad8c2546de2783d98f179d Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 14:40:23 +0100
Subject: [PATCH 106/118] changed "ipic run" to call mpirun rather than mpiexec

---
 scripts/ipic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ipic.py b/scripts/ipic.py
index f00b018c..3ae87d9e 100755
--- a/scripts/ipic.py
+++ b/scripts/ipic.py
@@ -68,7 +68,7 @@ def construct_run_command(args):
     output = 'data'
     inputfile = 'src/inputfiles/GEM.inp'
     hostname = ''
-    mpirun = 'mpiexec'
+    mpirun = 'mpirun'
     global system
     if system == 'xeon' or system == 'mic':
       if system == 'xeon':

From 4bd2416cc3d3b08f79c3ba59e20dcbd93a234afb Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 14:54:34 +0100
Subject: [PATCH 107/118] added number of threads to utility/MPIdata.cpp output

---
 utility/MPIdata.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/utility/MPIdata.cpp b/utility/MPIdata.cpp
index 6baa3697..d70a2b50 100644
--- a/utility/MPIdata.cpp
+++ b/utility/MPIdata.cpp
@@ -1,10 +1,7 @@
 #include <mpi.h>
-#include <iostream>
 #include <assert.h>
 #include "MPIdata.h"
-
-using std::cout;
-using std::endl;
+#include "ompdefs.h" // for omp_get_max_threads
 
 // code to check that init() is called before instance()
 //
@@ -49,10 +46,13 @@ void MPIdata::finalize_mpi() {
 }
 
 void MPIdata::Print(void) {
-  cout << endl;
-  cout << "Number of processes = " << get_nprocs() << endl;
-  cout << "-------------------------" << endl;
-  cout << endl;
+  printf("\n"
+    "Number of processes = %d\n"
+    "-------------------------\n"
+    "Number of threads = %d\n"
+    "-------------------------\n",
+     get_nprocs(),
+     omp_get_max_threads());
 }
 
 // extern MPIdata *mpi; // instantiated in iPIC3D.cpp

From f500fad934ef339c23e3e5295debd3e5a27c6891 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 14:38:28 +0100
Subject: [PATCH 108/118] issue#62: changed getVelocityDistribution to make one
 MPI_Allreduce call (untested)

---
 particles/Particles3Dcomm.cpp | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/particles/Particles3Dcomm.cpp b/particles/Particles3Dcomm.cpp
index 7183eb16..cb5f6688 100644
--- a/particles/Particles3Dcomm.cpp
+++ b/particles/Particles3Dcomm.cpp
@@ -981,13 +981,17 @@ long long *Particles3Dcomm::getVelocityDistribution(int nBins, double maxVel) {
     else
       f[bin] += 1;
   }
-  long long localN = 0;
-  long long totalN = 0;
-  for (int i = 0; i < nBins; i++) {
-    localN = f[i];
-    MPI_Allreduce(&localN, &totalN, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
-    f[i] = totalN;
-  }
+  MPI_Allreduce(MPI_IN_PLACE, f, nBins, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
+  // This way of summing is very inefficient
+  //{
+  //  long long localN = 0;
+  //  long long totalN = 0;
+  //  for (int i = 0; i < nBins; i++) {
+  //    localN = f[i];
+  //    MPI_Allreduce(&localN, &totalN, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
+  //    f[i] = totalN;
+  //  }
+  //}
   return f;
 }
 

From 699f6c02b62092d387dc9e9a5dba70b527c9c9fd Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 15:02:08 +0100
Subject: [PATCH 109/118] issue#35: replaced MPI_Barrier with
 former_MPI_Barrier; MPI_Barrier is no longer no-op

---
 communication/ComInterpNodes3D.cpp |  8 +--
 communication/ComNodes3D.cpp       | 83 ++++++++++++++++++------------
 include/ipicdefs.h                 |  4 +-
 main/iPic3Dlib.cpp                 |  8 +--
 particles/Particles3D.cpp          |  8 +--
 performances/Timing.cpp            |  6 +--
 6 files changed, 66 insertions(+), 51 deletions(-)

diff --git a/communication/ComInterpNodes3D.cpp b/communication/ComInterpNodes3D.cpp
index d04e6765..91e3e0a7 100644
--- a/communication/ComInterpNodes3D.cpp
+++ b/communication/ComInterpNodes3D.cpp
@@ -48,19 +48,19 @@ void communicateInterp(int nx, int ny, int nz, int ns, double**** vector, int bc
 
   // communicate twice each direction
   // X-DIRECTION: Z -> X -> Y
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor_P(), vct->getXleft_neighbor_P(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZleftEdge, ghostXleftYsameZleftEdge);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor_P(), vct->getXleft_neighbor_P(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZrightEdge, ghostXleftYsameZrightEdge);
   // Y-DIRECTION: X -> Y -> Z
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor_P(), vct->getYleft_neighbor_P(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXleftYrightZsameEdge, ghostXleftYleftZsameEdge);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor_P(), vct->getYleft_neighbor_P(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYrightZsameEdge, ghostXrightYleftZsameEdge);
   // Z-DIRECTION: Y -> Z
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor_P(), vct->getZleft_neighbor_P(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYleftZrightEdge, ghostXsameYleftZleftEdge);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor_P(), vct->getZleft_neighbor_P(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYrightZrightEdge, ghostXsameYrightZleftEdge);
   // parse
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   addEdgeZ(nx, ny, nz, vector, ns, ghostXrightYrightZsameEdge, ghostXleftYleftZsameEdge, ghostXrightYleftZsameEdge, ghostXleftYrightZsameEdge, vct);
   addEdgeY(nx, ny, nz, vector, ns, ghostXrightYsameZrightEdge, ghostXleftYsameZleftEdge, ghostXleftYsameZrightEdge, ghostXrightYsameZleftEdge, vct);
   addEdgeX(nx, ny, nz, vector, ns, ghostXsameYrightZrightEdge, ghostXsameYleftZleftEdge, ghostXsameYleftZrightEdge, ghostXsameYrightZleftEdge, vct);
diff --git a/communication/ComNodes3D.cpp b/communication/ComNodes3D.cpp
index 6b0bb169..b3224c77 100644
--- a/communication/ComNodes3D.cpp
+++ b/communication/ComNodes3D.cpp
@@ -4,10 +4,13 @@
 #include "TimeTasks.h"
 #include "ipicdefs.h"
 #include "Alloc.h"
+#include "debug.h"
+#include "parallel.h"
 
 /** communicate ghost cells (FOR NODES) */
 void communicateNode(int nx, int ny, int nz, arr3_double _vector, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ***vector=_vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
@@ -54,19 +57,19 @@ void communicateNode(int nx, int ny, int nz, arr3_double _vector, VirtualTopolog
 
   // communicate twice each direction
   // X-DIRECTION: Z -> X
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZleftEdge, ghostXleftYsameZleftEdge);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZrightEdge, ghostXleftYsameZrightEdge);
   // Y-DIRECTION: X -> Y
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXleftYrightZsameEdge, ghostXleftYleftZsameEdge);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYrightZsameEdge, ghostXrightYleftZsameEdge);
   // Z-DIRECTION: Y -> Z
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYleftZrightEdge, ghostXsameYleftZleftEdge);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYrightZrightEdge, ghostXsameYrightZleftEdge);
   // parse
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
 
   parseEdgeZ(nx, ny, nz, vector, ghostXrightYrightZsameEdge, ghostXleftYleftZsameEdge, ghostXrightYleftZsameEdge, ghostXleftYrightZsameEdge);
   parseEdgeY(nx, ny, nz, vector, ghostXrightYsameZrightEdge, ghostXleftYsameZleftEdge, ghostXleftYsameZrightEdge, ghostXrightYsameZleftEdge);
@@ -110,6 +113,7 @@ void communicateNode(int nx, int ny, int nz, arr3_double _vector, VirtualTopolog
 /** communicate ghost cells (FOR NODES) */
 void communicateNodeBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ***vector = _vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -155,19 +159,19 @@ void communicateNodeBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXr
 
   // communicate twice each direction
   // X-DIRECTION: Z -> X
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZleftEdge, ghostXleftYsameZleftEdge);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZrightEdge, ghostXleftYsameZrightEdge);
   // Y-DIRECTION: X -> Y
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXleftYrightZsameEdge, ghostXleftYleftZsameEdge);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYrightZsameEdge, ghostXrightYleftZsameEdge);
   // Z-DIRECTION: Y -> Z
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYleftZrightEdge, ghostXsameYleftZleftEdge);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYrightZrightEdge, ghostXsameYrightZleftEdge);
   // parse
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
 
   parseEdgeZ(nx, ny, nz, vector, ghostXrightYrightZsameEdge, ghostXleftYleftZsameEdge, ghostXrightYleftZsameEdge, ghostXleftYrightZsameEdge);
   parseEdgeY(nx, ny, nz, vector, ghostXrightYsameZrightEdge, ghostXleftYsameZleftEdge, ghostXleftYsameZrightEdge, ghostXrightYsameZleftEdge);
@@ -214,6 +218,7 @@ void communicateNodeBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXr
 /** communicate ghost cells (FOR NODES) with particles BC*/
 void communicateNodeBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -259,19 +264,19 @@ void communicateNodeBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFace
 
   // communicate twice each direction
   // X-DIRECTION: Z -> X
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor_P(), vct->getXleft_neighbor_P(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZleftEdge, ghostXleftYsameZleftEdge);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor_P(), vct->getXleft_neighbor_P(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZrightEdge, ghostXleftYsameZrightEdge);
   // Y-DIRECTION: X -> Y
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor_P(), vct->getYleft_neighbor_P(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXleftYrightZsameEdge, ghostXleftYleftZsameEdge);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor_P(), vct->getYleft_neighbor_P(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYrightZsameEdge, ghostXrightYleftZsameEdge);
   // Z-DIRECTION: Y -> Z
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor_P(), vct->getZleft_neighbor_P(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYleftZrightEdge, ghostXsameYleftZleftEdge);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor_P(), vct->getZleft_neighbor_P(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYrightZrightEdge, ghostXsameYrightZleftEdge);
   // parse
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
 
   parseEdgeZ(nx, ny, nz, vector, ghostXrightYrightZsameEdge, ghostXleftYleftZsameEdge, ghostXrightYleftZsameEdge, ghostXleftYrightZsameEdge);
   parseEdgeY(nx, ny, nz, vector, ghostXrightYsameZrightEdge, ghostXleftYsameZleftEdge, ghostXleftYsameZrightEdge, ghostXrightYsameZleftEdge);
@@ -319,6 +324,7 @@ void communicateNodeBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFace
 /** SPECIES: communicate ghost cells */
 void communicateNode(int nx, int ny, int nz, arr4_double _vector, int ns, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ****vector = _vector.fetch_arr4();
 
   // allocate 6 ghost cell Faces
@@ -366,19 +372,19 @@ void communicateNode(int nx, int ny, int nz, arr4_double _vector, int ns, Virtua
 
   // communicate twice each direction
   // X-DIRECTION: Z -> X
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZleftEdge, ghostXleftYsameZleftEdge);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZrightEdge, ghostXleftYsameZrightEdge);
   // Y-DIRECTION: X -> Y
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXleftYrightZsameEdge, ghostXleftYleftZsameEdge);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYrightZsameEdge, ghostXrightYleftZsameEdge);
   // Z-DIRECTION: Y -> Z
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYleftZrightEdge, ghostXsameYleftZleftEdge);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYrightZrightEdge, ghostXsameYrightZleftEdge);
   // parse
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   parseEdgeZ(nx, ny, nz, vector, ns, ghostXrightYrightZsameEdge, ghostXleftYleftZsameEdge, ghostXrightYleftZsameEdge, ghostXleftYrightZsameEdge);
   parseEdgeY(nx, ny, nz, vector, ns, ghostXrightYsameZrightEdge, ghostXleftYsameZleftEdge, ghostXleftYsameZrightEdge, ghostXrightYsameZleftEdge);
   parseEdgeX(nx, ny, nz, vector, ns, ghostXsameYrightZrightEdge, ghostXsameYleftZleftEdge, ghostXsameYleftZrightEdge, ghostXsameYrightZleftEdge);
@@ -422,6 +428,7 @@ void communicateNode(int nx, int ny, int nz, arr4_double _vector, int ns, Virtua
 /** SPECIES: communicate ghost cells */
 void communicateNode_P(int nx, int ny, int nz, arr4_double _vector, int ns, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ****vector = _vector.fetch_arr4();
 
   // allocate 6 ghost cell Faces
@@ -469,19 +476,19 @@ void communicateNode_P(int nx, int ny, int nz, arr4_double _vector, int ns, Virt
 
   // communicate twice each direction
   // X-DIRECTION: Z -> X
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor_P(), vct->getXleft_neighbor_P(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZleftEdge, ghostXleftYsameZleftEdge);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor_P(), vct->getXleft_neighbor_P(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZrightEdge, ghostXleftYsameZrightEdge);
   // Y-DIRECTION: X -> Y
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor_P(), vct->getYleft_neighbor_P(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXleftYrightZsameEdge, ghostXleftYleftZsameEdge);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor_P(), vct->getYleft_neighbor_P(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYrightZsameEdge, ghostXrightYleftZsameEdge);
   // Z-DIRECTION: Y -> Z
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor_P(), vct->getZleft_neighbor_P(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYleftZrightEdge, ghostXsameYleftZleftEdge);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor_P(), vct->getZleft_neighbor_P(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYrightZrightEdge, ghostXsameYrightZleftEdge);
   // parse
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   parseEdgeZ(nx, ny, nz, vector, ns, ghostXrightYrightZsameEdge, ghostXleftYleftZsameEdge, ghostXrightYleftZsameEdge, ghostXleftYrightZsameEdge);
   parseEdgeY(nx, ny, nz, vector, ns, ghostXrightYsameZrightEdge, ghostXleftYsameZleftEdge, ghostXleftYsameZrightEdge, ghostXrightYsameZleftEdge);
   parseEdgeX(nx, ny, nz, vector, ns, ghostXsameYrightZrightEdge, ghostXsameYleftZleftEdge, ghostXsameYleftZrightEdge, ghostXsameYrightZleftEdge);
@@ -525,6 +532,7 @@ void communicateNode_P(int nx, int ny, int nz, arr4_double _vector, int ns, Virt
 /** communicate ghost cells (FOR CENTERS) */
 void communicateCenter(int nx, int ny, int nz, arr3_double _vector, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ***vector = _vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
@@ -571,19 +579,19 @@ void communicateCenter(int nx, int ny, int nz, arr3_double _vector, VirtualTopol
 
   // communicate twice each direction
   // X-DIRECTION: Z -> X
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZleftEdge, ghostXleftYsameZleftEdge);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZrightEdge, ghostXleftYsameZrightEdge);
   // Y-DIRECTION: X -> Y
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXleftYrightZsameEdge, ghostXleftYleftZsameEdge);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYrightZsameEdge, ghostXrightYleftZsameEdge);
   // Z-DIRECTION: Y -> Z
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYleftZrightEdge, ghostXsameYleftZleftEdge);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYrightZrightEdge, ghostXsameYrightZleftEdge);
   // parse
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   parseEdgeZ(nx, ny, nz, vector, ghostXrightYrightZsameEdge, ghostXleftYleftZsameEdge, ghostXrightYleftZsameEdge, ghostXleftYrightZsameEdge);
   parseEdgeY(nx, ny, nz, vector, ghostXrightYsameZrightEdge, ghostXleftYsameZleftEdge, ghostXleftYsameZrightEdge, ghostXrightYsameZleftEdge);
   parseEdgeX(nx, ny, nz, vector, ghostXsameYrightZrightEdge, ghostXsameYleftZleftEdge, ghostXsameYleftZrightEdge, ghostXsameYrightZleftEdge);
@@ -625,6 +633,7 @@ void communicateCenter(int nx, int ny, int nz, arr3_double _vector, VirtualTopol
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
 void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -657,6 +666,7 @@ void communicateCenterBoxStencilBC(int nx, int ny, int nz, arr3_double _vector,
 /** communicate ghost cells (FOR CENTERS) with BOX stencil*/
 void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -691,6 +701,7 @@ void communicateCenterBoxStencilBC_P(int nx, int ny, int nz, arr3_double _vector
 
 void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -722,6 +733,7 @@ void communicateNodeBoxStencilBC(int nx, int ny, int nz, arr3_double _vector, in
 
 void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ***vector=_vector.fetch_arr3();
   // allocate 6 ghost cell Faces
   double *ghostXrightFace = new double[(ny - 2) * (nz - 2)];
@@ -756,6 +768,7 @@ void communicateNodeBoxStencilBC_P(int nx, int ny, int nz, arr3_double _vector,
 /** SPECIES: communicate ghost cells */
 void communicateCenter(int nx, int ny, int nz, arr4_double _vector, int ns, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ****vector=_vector.fetch_arr4();
 
   // allocate 6 ghost cell Faces
@@ -801,19 +814,19 @@ void communicateCenter(int nx, int ny, int nz, arr4_double _vector, int ns, Virt
 
   // communicate twice each direction
   // X-DIRECTION: Z -> X
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZleftEdge, ghostXleftYsameZleftEdge);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZrightEdge, ghostXleftYsameZrightEdge);
   // Y-DIRECTION: X -> Y
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXleftYrightZsameEdge, ghostXleftYleftZsameEdge);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYrightZsameEdge, ghostXrightYleftZsameEdge);
   // Z-DIRECTION: Y -> Z
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYleftZrightEdge, ghostXsameYleftZleftEdge);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYrightZrightEdge, ghostXsameYrightZleftEdge);
   // parse
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   parseEdgeZ(nx, ny, nz, vector, ns, ghostXrightYrightZsameEdge, ghostXleftYleftZsameEdge, ghostXrightYleftZsameEdge, ghostXleftYrightZsameEdge);
   parseEdgeY(nx, ny, nz, vector, ns, ghostXrightYsameZrightEdge, ghostXleftYsameZleftEdge, ghostXleftYsameZrightEdge, ghostXrightYsameZleftEdge);
   parseEdgeX(nx, ny, nz, vector, ns, ghostXsameYrightZrightEdge, ghostXsameYleftZleftEdge, ghostXsameYleftZrightEdge, ghostXsameYrightZleftEdge);
@@ -855,6 +868,7 @@ void communicateCenter(int nx, int ny, int nz, arr4_double _vector, int ns, Virt
 // /////////// communication + BC ////////////////////////////
 void communicateCenterBC(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ***vector=_vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
@@ -900,18 +914,18 @@ void communicateCenterBC(int nx, int ny, int nz, arr3_double _vector, int bcFace
 
   // communicate twice each direction
   // X-DIRECTION: Z -> X
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZleftEdge, ghostXleftYsameZleftEdge);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor(), vct->getXleft_neighbor(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZrightEdge, ghostXleftYsameZrightEdge);
   // Y-DIRECTION: X -> Y
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXleftYrightZsameEdge, ghostXleftYleftZsameEdge);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor(), vct->getYleft_neighbor(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYrightZsameEdge, ghostXrightYleftZsameEdge);
   // Z-DIRECTION: Y -> Z
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYleftZrightEdge, ghostXsameYleftZleftEdge);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor(), vct->getZleft_neighbor(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYrightZrightEdge, ghostXsameYrightZleftEdge);
   // parse
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   parseEdgeZ(nx, ny, nz, vector, ghostXrightYrightZsameEdge, ghostXleftYleftZsameEdge, ghostXrightYleftZsameEdge, ghostXleftYrightZsameEdge);
   parseEdgeY(nx, ny, nz, vector, ghostXrightYsameZrightEdge, ghostXleftYsameZleftEdge, ghostXleftYsameZrightEdge, ghostXrightYsameZleftEdge);
   parseEdgeX(nx, ny, nz, vector, ghostXsameYrightZrightEdge, ghostXsameYleftZleftEdge, ghostXsameYleftZrightEdge, ghostXsameYrightZleftEdge);
@@ -957,6 +971,7 @@ void communicateCenterBC(int nx, int ny, int nz, arr3_double _vector, int bcFace
 // /////////// communication + BC ////////////////////////////
 void communicateCenterBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFaceXright, int bcFaceXleft, int bcFaceYright, int bcFaceYleft, int bcFaceZright, int bcFaceZleft, VirtualTopology3D * vct) {
   timeTasks_set_communicating();
+//  static int counter=0; if(is_output_thread()) { counter++; dprint(counter); }
   double ***vector=_vector.fetch_arr3();
 
   // allocate 6 ghost cell Faces
@@ -1002,18 +1017,18 @@ void communicateCenterBC_P(int nx, int ny, int nz, arr3_double _vector, int bcFa
 
   // communicate twice each direction
   // X-DIRECTION: Z -> X
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor_P(), vct->getXleft_neighbor_P(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZleftEdge, ghostXleftYsameZleftEdge);
   communicateGhostFace(ny - 2, vct->getCartesian_rank(), vct->getXright_neighbor_P(), vct->getXleft_neighbor_P(), 0, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYsameZrightEdge, ghostXleftYsameZrightEdge);
   // Y-DIRECTION: X -> Y
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor_P(), vct->getYleft_neighbor_P(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXleftYrightZsameEdge, ghostXleftYleftZsameEdge);
   communicateGhostFace(nz - 2, vct->getCartesian_rank(), vct->getYright_neighbor_P(), vct->getYleft_neighbor_P(), 1, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXrightYrightZsameEdge, ghostXrightYleftZsameEdge);
   // Z-DIRECTION: Y -> Z
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor_P(), vct->getZleft_neighbor_P(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYleftZrightEdge, ghostXsameYleftZleftEdge);
   communicateGhostFace(nx - 2, vct->getCartesian_rank(), vct->getZright_neighbor_P(), vct->getZleft_neighbor_P(), 2, vct->getXLEN(), vct->getYLEN(), vct->getZLEN(), ghostXsameYrightZrightEdge, ghostXsameYrightZleftEdge);
   // parse
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   parseEdgeZ(nx, ny, nz, vector, ghostXrightYrightZsameEdge, ghostXleftYleftZsameEdge, ghostXrightYleftZsameEdge, ghostXleftYrightZsameEdge);
   parseEdgeY(nx, ny, nz, vector, ghostXrightYsameZrightEdge, ghostXleftYsameZleftEdge, ghostXleftYsameZrightEdge, ghostXrightYsameZleftEdge);
   parseEdgeX(nx, ny, nz, vector, ghostXsameYrightZrightEdge, ghostXsameYleftZleftEdge, ghostXsameYleftZrightEdge, ghostXsameYrightZleftEdge);
diff --git a/include/ipicdefs.h b/include/ipicdefs.h
index 53b22cd8..765694d1 100644
--- a/include/ipicdefs.h
+++ b/include/ipicdefs.h
@@ -7,8 +7,8 @@
 // uncomment the following line to use parallel hdf5
 //#define USING_PARALLEL_HDF5
 
-// use precprocessor to remove MPI_Barrier() calls.
-#define MPI_Barrier(args...)
+// use precprocessor to remove former MPI_Barrier() calls.
+//#define MPI_Barrier(args...)
 #define former_MPI_Barrier(args...)
 
 #define ipicMPI_Allreduce(args...) \
diff --git a/main/iPic3Dlib.cpp b/main/iPic3Dlib.cpp
index 33f9a96c..401123bf 100644
--- a/main/iPic3Dlib.cpp
+++ b/main/iPic3Dlib.cpp
@@ -63,7 +63,7 @@ int c_Solver::Init(int argc, char **argv) {
     col->save();
   }
   // Create the local grid
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   grid = new Grid3DCU(col, vct);  // Create the local grid
   EMf = new EMfields3D(col, grid);  // Create Electromagnetic Fields Object
 
@@ -139,7 +139,7 @@ int c_Solver::Init(int argc, char **argv) {
     hdf5_agent.close();
   }
 
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   Eenergy, Benergy, TOTenergy = 0.0, TOTmomentum = 0.0;
   Ke = new double[ns];
   momentum = new double[ns];
@@ -242,11 +242,11 @@ void c_Solver::CalculateMoments() {
 
   EMf->ConstantChargeOpenBC(grid, vct);     // Set a constant charge in the OpenBC boundaries
 
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
 
   EMf->interpDensitiesN2C(vct, grid);       // calculate densities on centers from nodes
   EMf->calculateHatFunctions(grid, vct);    // calculate the hat quantities for the implicit method
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
 }
 
 //! MAXWELL SOLVER for Efield
diff --git a/particles/Particles3D.cpp b/particles/Particles3D.cpp
index 5186a4d7..4d9e50bf 100644
--- a/particles/Particles3D.cpp
+++ b/particles/Particles3D.cpp
@@ -1088,14 +1088,14 @@ int Particles3D::communicate_particles(VirtualTopology3D * vct)
   const int avail = communicate(vct);
   if (avail < 0)
     return (-1);
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   // communicate again if particles are not in the correct domain
   while (isMessagingDone(vct) > 0) {
     // COMMUNICATION
     const int avail = communicate(vct);
     if (avail < 0)
       return (-1);
-    MPI_Barrier(MPI_COMM_WORLD);
+    former_MPI_Barrier(MPI_COMM_WORLD);
   }
   return 0; // exit successfully
 }
@@ -1463,7 +1463,7 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
   avail = communicate(vct);
   if (avail < 0) return(-1);
 
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
 
   // communicate again if particles are not in the correct domain
   while(isMessagingDone(vct) >0){
@@ -1471,7 +1471,7 @@ int Particles3D::particle_repopulator(Grid* grid,VirtualTopology3D* vct, Field*
     avail = communicate(vct);
     if (avail < 0)
       return(-1);
-    MPI_Barrier(MPI_COMM_WORLD);
+    former_MPI_Barrier(MPI_COMM_WORLD);
   }
 
   return(0); // exit succcesfully (hopefully)
diff --git a/performances/Timing.cpp b/performances/Timing.cpp
index e639fd88..11a27364 100644
--- a/performances/Timing.cpp
+++ b/performances/Timing.cpp
@@ -35,7 +35,7 @@ Timing::Timing(int my_rank) {
   // MPE_Describe_state(event2a,event2b,"Field","blue"); // the mover is blue in the visualizer
   // MPE_Describe_state(event3a,event3b,"Interp P->G","yellow"); // the interpolation particle->Grid is yellow in the visualizer
   // }
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   // start the log
   // MPE_Start_log();
 
@@ -44,12 +44,12 @@ Timing::Timing(int my_rank) {
 /** start the timer */
 void Timing::startTiming() {
   ttick = MPI_Wtick();
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   tstart = MPI_Wtime();
 }
 /** stop the timer */
 void Timing::stopTiming() {
-  MPI_Barrier(MPI_COMM_WORLD);
+  former_MPI_Barrier(MPI_COMM_WORLD);
   tend = MPI_Wtime();
   texecution = tend - tstart;
   if (rank_id == 0) {

From 88ee39ef0c1f2d4a60fbaed48c0459fec793438e Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 17:05:01 +0100
Subject: [PATCH 110/118] issue #63: made invalid_value_error thread safe

---
 utility/errors.cpp | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/utility/errors.cpp b/utility/errors.cpp
index 9d5d66a9..4cca0143 100644
--- a/utility/errors.cpp
+++ b/utility/errors.cpp
@@ -6,8 +6,8 @@
 #include <cstdarg>
 #include <cstdio>
 #include <cstdlib>
+#include <sstream>
 #include "errors.h"
-//#include "MPIdata.h" // for rank
 
 /** implementation of declarations in errors.h **/
 
@@ -33,20 +33,24 @@ void fprintf_fileLine(FILE * fptr, const char *type, const char *func, const cha
 //  abort();
 //}
 
-// This needs to be fixed to be thread-safe like
-// eprintf_fileLine() below.  Write the message to a string and
-// then print it out as an atomic operation.
+// lazy implementation using streams class
 //
-#include <iostream>
 using namespace std;
 #define implement_invalid_value_error(t1) \
   void invalid_value_error_fileLine(const char* file, int line, const char* func, \
     const char* type, const char* expr, t1 val) \
   { \
-    std::cerr<< "ERROR in file " << file << ", line " << line  \
+    /* To be thread-safe, write the message to a string and \
+     * then print it out as an atomic operation. */ \
+    std::stringstream ss; \
+    ss << "(" << MPIdata::get_rank() << "." << omp_get_thread_num() << ") " \
+      << "ERROR in file " << file << ", line " << line  \
       << ", function " << func  \
       <<"\n\t" << type << " value: " << expr << " = " << val << endl; \
-      abort(); \
+    fflush(stdout); \
+      { fputs(ss.str().c_str(),stdout); } /* not fprintf: the text is data, not a format ('%' in expr) */ \
+    fflush(stdout); \
+    abort(); \
   }
 
 implement_invalid_value_error(double);

From 8a1729372f64b9aa09eb38e715db3ff8d9e3a9b3 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 17:25:20 +0100
Subject: [PATCH 111/118] warning messages should issue warning and proceed,
 not exit

---
 include/errors.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/errors.h b/include/errors.h
index ace6b5f2..88281428 100644
--- a/include/errors.h
+++ b/include/errors.h
@@ -15,7 +15,7 @@ void eprintf_fileLine(FILE * fptr, const char *type,
 //#define eprintf(args...) \
 //  eprintf_fileLine("ERROR",__func__, __FILE__, __LINE__, ## args);
 #define warning_printf(args...) \
-  eprintf_fileLine("WARNING",__func__, __FILE__, __LINE__, ## args);
+  fprintf_fileLine(stdout,"WARNING",__func__, __FILE__, __LINE__, ## args);
 #define declare_invalid_value_error(t1) \
   void invalid_value_error_fileLine(const char* file, int line, const char* func, \
     const char* type, const char* expr, t1 val);

From 0996301a024d66b09e582f9fb0d60714f3db3bcf Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 17:28:24 +0100
Subject: [PATCH 112/118] implemented MPIdata::exit(code) to exit after calling
 MPI_Finalize()

---
 include/MPIdata.h   | 4 +++-
 utility/MPIdata.cpp | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/MPIdata.h b/include/MPIdata.h
index 36d29bcf..ef4e7a30 100644
--- a/include/MPIdata.h
+++ b/include/MPIdata.h
@@ -40,7 +40,9 @@ class MPIdata {
   /** initialize MPI environment */
   static void init(int *, char ***);
   /** close MPI environment */
-  void finalize_mpi();
+  static void finalize_mpi();
+  /** finalize and exit with error code */
+  static void exit(int code);
   /** print MPI data structure */
   void Print(void);
   /** MPI status during the communication */
diff --git a/utility/MPIdata.cpp b/utility/MPIdata.cpp
index d70a2b50..d9eb1895 100644
--- a/utility/MPIdata.cpp
+++ b/utility/MPIdata.cpp
@@ -41,6 +41,11 @@ void MPIdata::init(int *argc, char ***argv) {
   MPIdata_is_initialized = true;
 }
 
+void MPIdata::exit(int code) { // shut MPI down, then end the process with status `code`
+  finalize_mpi(); // calls MPI_Finalize()
+  ::exit(code); // global-scope exit(); an unqualified exit would name this member
+}
+
 void MPIdata::finalize_mpi() {
   MPI_Finalize();
 }

From 507e0a613e3301a142d4481ba8099b9777ba988a Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Fri, 7 Mar 2014 17:29:19 +0100
Subject: [PATCH 113/118] issue #64: MPI should be initialized immediately

---
 iPic3D.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/iPic3D.cpp b/iPic3D.cpp
index 91129c7b..cff2f756 100644
--- a/iPic3D.cpp
+++ b/iPic3D.cpp
@@ -9,10 +9,10 @@ using namespace iPic3D;
 
 int main(int argc, char **argv) {
 
+  MPIdata::init(&argc, &argv);
   iPic3D::c_Solver KCode;
   bool b_err = false;
 
-  MPIdata::init(&argc, &argv);
   KCode.Init(argc, argv);
 
   for (int i = KCode.FirstCycle(); i < KCode.LastCycle(); i++) {

From 333cae31df253e0250fe9fd0a73d5629f5939bd2 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 10 Mar 2014 15:39:32 +0100
Subject: [PATCH 114/118] issue #66: moving includes out of Collective.h

---
 include/Collective.h       | 11 +----------
 include/input_array.h      |  2 +-
 inputoutput/Collective.cpp | 12 +++++++++++-
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/Collective.h b/include/Collective.h
index 382611b6..95e929dd 100644
--- a/include/Collective.h
+++ b/include/Collective.h
@@ -13,18 +13,9 @@
 #endif
 
 
-#include <math.h>
-//#include <iostream>
-//#include <fstream>
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "ConfigFile.h"
-#include "input_array.h"
-#include "hdf5.h"
 //#include "CollectiveIO.h"
+class ConfigFile;
 using namespace std;
-
 using std::cout;
 using std::endl;
 using std::ofstream;
diff --git a/include/input_array.h b/include/input_array.h
index 6f8bdd94..be25ec5d 100644
--- a/include/input_array.h
+++ b/include/input_array.h
@@ -5,7 +5,7 @@
 // Modified P. Henri 8 June 2011
 // corrected by Markidis
 
-#include <iostream>
+#include <iosfwd>
 
 struct array_int {
   int a, b, c, d, e, f;
diff --git a/inputoutput/Collective.cpp b/inputoutput/Collective.cpp
index 5f0b7bbe..1675dffe 100644
--- a/inputoutput/Collective.cpp
+++ b/inputoutput/Collective.cpp
@@ -1,8 +1,18 @@
 
 #include <mpi.h>
+#include <math.h>
+//#include <iostream>
+//#include <fstream>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "input_array.h"
+#include "hdf5.h"
 #include "Collective.h"
-#include "debug.h"
+#include "ConfigFile.h"
 #include "limits.h" // for INT_MAX
+#include "MPIdata.h"
+#include "errors.h"
 #include "asserts.h" // for assert_ge
 
 /*! Read the input file from text file and put the data in a collective wrapper: if it's a restart read from input file basic sim data and load particles and EM field from restart file */

From 965aeaf0d9d073838a335ecb2865d8f485552ed0 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 10 Mar 2014 15:48:14 +0100
Subject: [PATCH 115/118] iss #65, iss #67: defined options for second-order
 accuracy

---
 include/Collective.h       | 33 ++++++++++++++++++++-
 inputoutput/Collective.cpp | 59 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/include/Collective.h b/include/Collective.h
index 95e929dd..b4f753cd 100644
--- a/include/Collective.h
+++ b/include/Collective.h
@@ -26,6 +26,21 @@ class Collective
 : public InterfaceFluid
 #endif
 {
+  private:
+    enum Enum{
+      thedefault=0,
+      initial,
+      final,
+      // used by ImplSusceptMode
+      explPredict,
+      implPredict,
+      NUMBER_OF_ENUMS, // this must be last
+      INVALID_ENUM
+    };
+    int read_enum_parameter(const char* option_name, char* default_value,
+      const ConfigFile& config);
+  public:
+    static const char* get_name_of_enum(int in);
   public:
     /*! constructor: initialize physical parameters with values */
     Collective(int argc, char **argv);
@@ -65,6 +80,10 @@ class Collective
     double getC()const{ return (c); }
     double getDt()const{ return (dt); }
     double getTh()const{ return (th); }
+    double getPushWithBatTime()const{ return PushWithBatTime; }
+    double getPushWithEatTime()const{ return PushWithEatTime; }
+    double getImplSusceptTime()const{ return ImplSusceptTime; }
+    int getImplSusceptMode()const{ return ImplSusceptMode; }
     double getSmooth()const{ return (Smooth); }
     int getNcycles()const{ return (ncycles); }
     int getNs()const{ return (ns); }
@@ -145,8 +164,20 @@ class Collective
     double fourpi;
     /*! time step */
     double dt;
+    //
+    // parameters used to support second order accuracy in time 
+    //
     /*! decentering parameter */
-    double th;
+    double th; // second-order for th=1/2, stable for 1/2 <= th <= 1
+    /*! time of magnetic field used in particle push (0=initial, 1=final) */
+    double PushWithBatTime; // 0=initial (default), 1=final
+    /*! time of electric field used in particle push */
+    double PushWithEatTime; // 0=initial, 1=final (default)
+    /*! means of estimating time-advanced implicit susceptibility */
+    int ImplSusceptMode; // "initial" (default), "explPredict", "implPredict"
+    /*! time of implicit susceptibility used in field advance */
+    double ImplSusceptTime; // 0=initial (default), 1=final
+    //
     /*! Smoothing value */
     double Smooth;
     /*! number of time cycles */
diff --git a/inputoutput/Collective.cpp b/inputoutput/Collective.cpp
index 1675dffe..f5179c67 100644
--- a/inputoutput/Collective.cpp
+++ b/inputoutput/Collective.cpp
@@ -15,6 +15,49 @@
 #include "errors.h"
 #include "asserts.h" // for assert_ge
 
+// order must agree with Enum in Collective.h
+static const char *enumNames[] =
+{
+  "default",
+  "initial",
+  "final",
+  // used by ImplSusceptMode
+  "explPredict",
+  "implPredict",
+  // marker for last enumerated symbol of this class
+  "NUMBER_OF_ENUMS",
+  "INVALID_ENUM"
+};
+
+int Collective::read_enum_parameter(const char* option_name, char* default_value,
+  const ConfigFile& config)
+{
+  string enum_name = config.read < string >(option_name,default_value);
+  // linear search of enumNames[] for an exact match (could use std::map);
+  // presumably default_value is used when the option is absent - confirm in ConfigFile
+  for(int i=0;i<NUMBER_OF_ENUMS;i++)
+  {
+    if(!strcmp(enum_name.c_str(),enumNames[i]))
+      return i;
+  }
+  // could not find enum, so issue error and quit.
+  if(!MPIdata::get_rank()) // print the message only where get_rank() returns 0
+  {
+    eprintf("in input file %s there is an invalid option %s\n",
+      inputfile.c_str(), enum_name.c_str());
+  }
+  MPIdata::exit(1); // runs finalize_mpi() and then ::exit(1)
+  // not expected to be reached once MPIdata::exit() has ended the process
+  return INVALID_ENUM;
+}
+
+const char* Collective::get_name_of_enum(int in)
+{ // look up the name stored in enumNames[] for enum value `in`
+  assert_ge(in, 0); // negative values are not valid indices
+  assert_lt(in, NUMBER_OF_ENUMS); // NUMBER_OF_ENUMS and INVALID_ENUM themselves are rejected
+  return enumNames[in];
+}
+
 /*! Read the input file from text file and put the data in a collective wrapper: if it's a restart read from input file basic sim data and load particles and EM field from restart file */
 void Collective::ReadInput(string inputfile) {
   using namespace std;
@@ -41,6 +84,22 @@ void Collective::ReadInput(string inputfile) {
     ns = config.read < int >("ns");
     NpMaxNpRatio = config.read < double >("NpMaxNpRatio");
     assert_ge(NpMaxNpRatio, 1.);
+    // mode parameters for second order in time
+    PushWithBatTime = config.read < double >("PushWithBatTime",0);
+    PushWithEatTime = config.read < double >("PushWithEatTime",1);
+    ImplSusceptTime = config.read < double >("ImplSusceptTime",0);
+    ImplSusceptMode = read_enum_parameter("ImplSusceptMode", "initial",config);
+    switch(ImplSusceptMode)
+    {
+      // values not yet supported:
+      case explPredict:
+      case implPredict:
+      default:
+        unsupported_value_error(ImplSusceptMode);
+      // supported values:
+      case initial:
+        ;
+    }
     // GEM Challenge 
     B0x = config.read <double>("B0x");
     B0y = config.read <double>("B0y");

From 776da31af6c8b88505ca3852e4c99dd9da1bce45 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 10 Mar 2014 17:25:24 +0100
Subject: [PATCH 116/118] fixed compile errors on icpc probably introduced two
 commits ago

---
 fields/EMfields3D.cpp      |  1 +
 grids/Grid3DCU.cpp         | 22 +++++++++++++++-------
 include/Collective.h       |  6 +-----
 include/Grid3DCU.h         |  3 ---
 include/input_array.h      |  2 +-
 inputoutput/Collective.cpp |  2 +-
 6 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/fields/EMfields3D.cpp b/fields/EMfields3D.cpp
index 2bcc6fef..49747cc2 100644
--- a/fields/EMfields3D.cpp
+++ b/fields/EMfields3D.cpp
@@ -7,6 +7,7 @@
 #include "Parameters.h"
 #include "ompdefs.h"
 #include "debug.h"
+#include "string.h" // for memset
 
 /*! constructor */
 //
diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index f13d8aed..081a0855 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -87,13 +87,21 @@ Grid3DCU::~Grid3DCU() {
 
 /** print the local grid info */
 void Grid3DCU::print(VirtualTopology3D * ptVCT) {
-  cout << endl;
-  cout << "Subgrid (" << ptVCT->getCoordinates(0) << "," << ptVCT->getCoordinates(1) << "," << ptVCT->getCoordinates(2) << ")" << endl;
-  cout << "Number of cell: -X=" << nxc - 2 << " -Y=" << nyc - 2 << " -Z=" << nzc - 2 << endl;
-  cout << "Xin = " << node_xcoord[1] << "; Xfin = " << node_xcoord[nxn - 2] << endl;
-  cout << "Yin = " << node_ycoord[1] << "; Yfin = " << node_ycoord[nyn - 2] << endl;
-  cout << "Zin = " << node_zcoord[1] << "; Zfin = " << node_zcoord[nzn - 2] << endl;
-  cout << endl;
+  printf("\nSubgrid (%d,%d,%d)\n", // position of this subdomain in the process topology
+    ptVCT->getCoordinates(0),
+    ptVCT->getCoordinates(1),
+    ptVCT->getCoordinates(2));
+  printf("Number of cells: X:%d, Y:%d, Z:%d\n", // counts exclude two cells per direction
+    nxc - 2,
+    nyc - 2,
+    nzc - 2);
+  printf( // %g, not %d: node coordinates are floating-point (confirm type in Grid3DCU.h)
+    "Xin = %g; Xfin = %g\n"
+    "Yin = %g; Yfin = %g\n"
+    "Zin = %g; Zfin = %g\n\n",
+    node_xcoord[1], node_xcoord[nxn - 2],
+    node_ycoord[1], node_ycoord[nyn - 2],
+    node_zcoord[1], node_zcoord[nzn - 2]);
 }
 
 /** calculate gradient on nodes, given a scalar field defined on central points  */
diff --git a/include/Collective.h b/include/Collective.h
index b4f753cd..f98c3dd0 100644
--- a/include/Collective.h
+++ b/include/Collective.h
@@ -11,15 +11,11 @@
 #ifdef BATSRUS
 #include "InterfaceFluid.h"
 #endif
-
+#include <string>
 
 //#include "CollectiveIO.h"
 class ConfigFile;
 using namespace std;
-using std::cout;
-using std::endl;
-using std::ofstream;
-using namespace std;
 
 class Collective
 #ifdef BATSRUS
diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index 92651196..497bbf0c 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -14,9 +14,6 @@
 #include "VirtualTopology3D.h"
 #include "Alloc.h"
 
-using std::cout;
-using std::endl;
-
 /**
  * Uniform cartesian local grid 3D
  *
diff --git a/include/input_array.h b/include/input_array.h
index be25ec5d..6f8bdd94 100644
--- a/include/input_array.h
+++ b/include/input_array.h
@@ -5,7 +5,7 @@
 // Modified P. Henri 8 June 2011
 // corrected by Markidis
 
-#include <iosfwd>
+#include <iostream>
 
 struct array_int {
   int a, b, c, d, e, f;
diff --git a/inputoutput/Collective.cpp b/inputoutput/Collective.cpp
index f5179c67..f60fbda2 100644
--- a/inputoutput/Collective.cpp
+++ b/inputoutput/Collective.cpp
@@ -3,7 +3,6 @@
 #include <math.h>
 //#include <iostream>
 //#include <fstream>
-#include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "input_array.h"
@@ -14,6 +13,7 @@
 #include "MPIdata.h"
 #include "errors.h"
 #include "asserts.h" // for assert_ge
+#include "string.h"
 
 // order must agree with Enum in Collective.h
 static const char *enumNames[] =

From e5fa332846c270c6d6f74412b4f109b3026f1c54 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Mon, 10 Mar 2014 17:41:23 +0100
Subject: [PATCH 117/118] "ipic exec" now calls mpiexec, just as "ipic run"
 calls mpirun

---
 scripts/ipic.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/scripts/ipic.py b/scripts/ipic.py
index 3ae87d9e..6b2e2540 100755
--- a/scripts/ipic.py
+++ b/scripts/ipic.py
@@ -56,7 +56,7 @@ def issue_shell_command(command):
     print '+', command
     os.system(command)
 
-def construct_run_command(args):
+def construct_run_command(args,mpirun):
 
     # convert from deque to list for getopts
     args = list(args)
@@ -68,7 +68,6 @@ def construct_run_command(args):
     output = 'data'
     inputfile = 'src/inputfiles/GEM.inp'
     hostname = ''
-    mpirun = 'mpirun'
     global system
     if system == 'xeon' or system == 'mic':
       if system == 'xeon':
@@ -169,7 +168,11 @@ def construct_run_command(args):
     return command
 
 def ipic_run(args):
-    command = construct_run_command(args);
+    command = construct_run_command(args,'mpirun');
+    issue_command(command)
+
+def ipic_exec(args):
+    command = construct_run_command(args,'mpiexec');
     issue_command(command)
 
 def ipic_show_run(args):
@@ -497,6 +500,8 @@ def ipic_command(argv1):
         ipic_cmake(args)
     elif command == "run":
         ipic_run(args)
+    elif command == "exec":
+        ipic_exec(args)
     elif command == "findcpph":
         ipic_findcpph(args)
     else:

From 8c8d6577a0ebfe0e5dd3afc6bd8a622d7c71d5c6 Mon Sep 17 00:00:00 2001
From: eajohnson <e.alec.johnson@gmail.com>
Date: Wed, 26 Mar 2014 11:50:41 +0100
Subject: [PATCH 118/118] removed unnecessary header includes in Grid3DCU.h
 (iss #66)

---
 grids/Grid3DCU.cpp  |  4 ++++
 include/Grid3DCU.h  | 10 ++++------
 include/PSKOutput.h |  2 +-
 include/arraysfwd.h |  4 ++--
 include/ipicfwd.h   |  6 ++++++
 5 files changed, 17 insertions(+), 9 deletions(-)
 create mode 100644 include/ipicfwd.h

diff --git a/grids/Grid3DCU.cpp b/grids/Grid3DCU.cpp
index 081a0855..914ae4a9 100644
--- a/grids/Grid3DCU.cpp
+++ b/grids/Grid3DCU.cpp
@@ -2,6 +2,10 @@
 #include <mpi.h>
 #include "Grid3DCU.h"
 #include "MPIdata.h"
+#include "Alloc.h"
+#include "CollectiveIO.h"
+#include "ComNodes3D.h" // for communicateCenterBC
+#include "VirtualTopology3D.h"
 
 /*! constructor */
 Grid3DCU::Grid3DCU(CollectiveIO * col, VirtualTopology3D * vct) {
diff --git a/include/Grid3DCU.h b/include/Grid3DCU.h
index 497bbf0c..2b688557 100644
--- a/include/Grid3DCU.h
+++ b/include/Grid3DCU.h
@@ -7,13 +7,11 @@
 #ifndef GRID3DCU_H
 #define GRID3DCU_H
 
-#include "Grid.h"
-#include "CollectiveIO.h"
-#include "ComInterpNodes3D.h"
-#include "ComNodes3D.h"
-#include "VirtualTopology3D.h"
-#include "Alloc.h"
+#include "arraysfwd.h"
+#include "ipicfwd.h"
+#include "math.h" // for floor
 
+class VirtualTopology3D;
 /**
  * Uniform cartesian local grid 3D
  *
diff --git a/include/PSKOutput.h b/include/PSKOutput.h
index cdc5bbc5..2c461c64 100644
--- a/include/PSKOutput.h
+++ b/include/PSKOutput.h
@@ -14,10 +14,10 @@ developers: D. Burgess, June/July 2006
 #include <list>
 
 #include "errors.h"
+#include "Grid.h"
 #include "PSKException.h"
 #include "Particles3Dcomm.h"
 #include "Field.h"
-#include "Grid.h"
 #include "Collective.h"
 #include "VCtopology3D.h"
 #include "MPIdata.h"
diff --git a/include/arraysfwd.h b/include/arraysfwd.h
index 0afc2826..ca14498b 100644
--- a/include/arraysfwd.h
+++ b/include/arraysfwd.h
@@ -60,8 +60,8 @@ typedef iPic3D::array4<pfloat> array4_pfloat;
 // This directive should be consistent with the directives in Alloc.h
 #if defined(FLAT_ARRAYS) || defined(CHECK_BOUNDS)
 typedef iPic3D::array_fetch1<double> arr1_double_fetch;
-typedef iPic3D::array_get1<double> arr1_double_get;
-typedef iPic3D::array_get1<pfloat> arr1_pfloat_get;
+typedef iPic3D::const_array_get1<double> arr1_double_get;
+typedef iPic3D::const_array_get1<pfloat> arr1_pfloat_get;
 typedef iPic3D::array_fetch2<double> arr2_double_fetch;
 typedef iPic3D::array_fetch3<double> arr3_double_fetch;
 #else
diff --git a/include/ipicfwd.h b/include/ipicfwd.h
new file mode 100644
index 00000000..d546e984
--- /dev/null
+++ b/include/ipicfwd.h
@@ -0,0 +1,6 @@
+#ifndef ipicfwd_h
+#define ipicfwd_h
+// forward declarations for iPic3D: lets headers name these types without their full definitions
+class Collective;
+typedef Collective CollectiveIO; // CollectiveIO is another name for Collective
+#endif