From 9e4f3052022712567cb21af0891d2d39c5d97bd1 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Wed, 1 Feb 2023 15:24:14 +0200
Subject: [PATCH 01/26] Enabled z-mipmaps for fast converter mode

---
 src/MipMap.cc | 67 +++++++++++++++++++++++++++++++++++++++------------
 src/MipMap.h  | 15 ++++++------
 src/Util.cc   | 12 +++++----
 src/Util.h    |  2 +-
 4 files changed, 67 insertions(+), 29 deletions(-)

diff --git a/src/MipMap.cc b/src/MipMap.cc
index c451022..28f6e71 100644
--- a/src/MipMap.cc
+++ b/src/MipMap.cc
@@ -7,7 +7,7 @@
 
 // MipMap
 
-MipMap::MipMap(const std::vector<hsize_t>& datasetDims, int mip) : datasetDims(datasetDims), mip(mip) {}
+MipMap::MipMap(const std::vector<hsize_t>& datasetDims, int mipXY, int mipZ) : datasetDims(datasetDims), mipXY(mipXY), mipZ(mipZ) {}
 
 MipMap::~MipMap() {
     if (!bufferDims.empty()) {
@@ -21,7 +21,17 @@ void MipMap::createDataset(H5::Group group, const std::vector<hsize_t>& chunkDim
     floatType.setOrder(H5T_ORDER_LE);
     
     std::ostringstream mipMapName;
-    mipMapName << "MipMaps/DATA/DATA_XY_" << mip;
+    
+    mipMapName << "MipMaps/DATA/DATA_";
+    
+    if (mipXY == mipZ)
+        mipMapName << "XYZ_" << mipXY;
+    else if (mipXY < 2)
+        mipMapName << "Z_" << mipZ;
+    else if (mipZ < 2)
+        mipMapName << "XY_" << mipXY;
+    else
+        mipMapName << "XYZ_" << mipXY << "_" << mipXY << "_" << mipZ;
     
     if (useChunks(datasetDims)) {
         createHdf5Dataset(dataset, group, mipMapName.str(), floatType, datasetDims, chunkDims);
@@ -66,29 +76,54 @@ void MipMap::resetBuffers() {
 MipMaps::MipMaps(std::vector<hsize_t> standardDims, const std::vector<hsize_t>& chunkDims) : standardDims(standardDims), chunkDims(chunkDims) {
     auto dims = standardDims;
     int N = dims.size();
-    int mip = 1;
+    int mipXY = 1;
+    int mipZ = 1;
     
     // We keep going until we have a mipmap which fits entirely within the minimum size
-    while (dims[N - 1] > MIN_MIPMAP_SIZE || dims[N - 2] > MIN_MIPMAP_SIZE) {
-        mip *= 2;
-        dims = mipDims(dims, 2);
-        mipMaps.push_back(MipMap(dims, mip));
-    }
+    do {
+        if (N > 2) {
+            do {
+                if (mipXY > 1 || mipZ > 1)
+                    mipMaps.push_back(MipMap(dims, mipXY, mipZ));
+                mipZ *= 2;
+                dims = mipDims(dims, 1, 2);
+            } while (2 * dims[N - 3] > MIN_MIPMAP_SIZE);
+            mipZ = 1;
+            dims[N - 3] = standardDims[N - 3];
+        } else if (mipXY > 1)
+            mipMaps.push_back(MipMap(dims, mipXY, mipZ));
+        mipXY *= 2;
+        dims = mipDims(dims, 2, 1);
+    } while (2 * dims[N - 1] > MIN_MIPMAP_SIZE || 2 * dims[N - 2] > MIN_MIPMAP_SIZE);
+
 }
 
 hsize_t MipMaps::size(const std::vector<hsize_t>& standardDims, const std::vector<hsize_t>& standardBufferDims) {
     hsize_t size = 0;
-    int mip = 1;
+    int mipXY = 1;
+    int mipZ = 1;
     auto datasetDims = standardDims;
     auto bufferDims = standardBufferDims;
     int N = standardDims.size();
     
-    while (datasetDims[N - 1] > MIN_MIPMAP_SIZE || datasetDims[N - 2] > MIN_MIPMAP_SIZE) {
-        mip *= 2;
-        datasetDims = mipDims(datasetDims, 2);
-        bufferDims = mipDims(bufferDims, 2);
-        size += (sizeof(double) + sizeof(int)) * product(bufferDims);
-    }
+    do {
+        if (N > 2) {
+            do {
+                if (mipXY > 1 || mipZ > 1)
+                    size += (sizeof(double) + sizeof(int)) * product(bufferDims);
+                mipZ *= 2;
+                datasetDims = mipDims(datasetDims, 1, 2);
+                bufferDims = mipDims(bufferDims, 1, 2);
+            } while (2 * datasetDims[N - 3] > MIN_MIPMAP_SIZE);
+            mipZ = 1;
+            datasetDims[N - 3] = standardDims[N - 3];
+            bufferDims[N - 3] = standardBufferDims[N - 3];
+        } else if (mipXY > 1)
+            size += (sizeof(double) + sizeof(int)) * product(bufferDims);
+        mipXY *= 2;
+        datasetDims = mipDims(datasetDims, 2, 1);
+        bufferDims = mipDims(bufferDims, 2, 1);
+    } while (2 * datasetDims[N - 1] > MIN_MIPMAP_SIZE || 2 * datasetDims[N - 2] > MIN_MIPMAP_SIZE);
 
     return size;
 }
@@ -102,7 +137,7 @@ void MipMaps::createDatasets(H5::Group group) {
 
 void MipMaps::createBuffers(const std::vector<hsize_t>& standardBufferDims) {
     for (auto& mipMap : mipMaps) {
-        auto dims = mipDims(standardBufferDims, mipMap.mip);
+        auto dims = mipDims(standardBufferDims, mipMap.mipXY, mipMap.mipZ);
         mipMap.createBuffers(dims);
     }
 }
diff --git a/src/MipMap.h b/src/MipMap.h
index 4df365b..b6bc4d0 100644
--- a/src/MipMap.h
+++ b/src/MipMap.h
@@ -12,18 +12,18 @@
 // A single mipmap
 struct MipMap {
     MipMap() {};
-    MipMap(const std::vector<hsize_t>& datasetDims, int mip);
+    MipMap(const std::vector<hsize_t>& datasetDims, int mipXY, int mipZ);
     ~MipMap();
     
     void createDataset(H5::Group group, const std::vector<hsize_t>& chunkDims);
     void createBuffers(std::vector<hsize_t>& bufferDims);
     
-    void accumulate(double val, hsize_t x, hsize_t y, hsize_t totalChannelOffset) {
-        hsize_t mipIndex = totalChannelOffset * width * height + (y / mip) * width + (x / mip);
+    void accumulate(double val, hsize_t x, hsize_t y, hsize_t z) {
+        hsize_t mipIndex = (z / mipZ) * width * height + (y / mipXY) * width + (x / mipXY);
         vals[mipIndex] += val;
         count[mipIndex]++;
     }
-
+    
     void calculate() {
         for (hsize_t mipIndex = 0; mipIndex < bufferSize; mipIndex++) {
             if (count[mipIndex]) {
@@ -38,7 +38,8 @@ struct MipMap {
     void resetBuffers();
     
     std::vector<hsize_t> datasetDims;
-    int mip;
+    int mipXY;
+    int mipZ;
     
     H5::DataSet dataset;
     
@@ -65,9 +66,9 @@ struct MipMaps {
     void createDatasets(H5::Group group);
     void createBuffers(const std::vector<hsize_t>& standardBufferDims);
     
-    void accumulate(double val, hsize_t x, hsize_t y, hsize_t totalChannelOffset) {
+    void accumulate(double val, hsize_t x, hsize_t y, hsize_t z) {
         for (auto& mipMap : mipMaps) {
-            mipMap.accumulate(val, x, y, totalChannelOffset);
+            mipMap.accumulate(val, x, y, z);
         }
     }
 
diff --git a/src/Util.cc b/src/Util.cc
index 79f2590..e338b00 100644
--- a/src/Util.cc
+++ b/src/Util.cc
@@ -31,14 +31,16 @@ std::vector<hsize_t> extend(const std::vector<hsize_t>& left, const std::vector<
     return result;
 }
 
-std::vector<hsize_t> mipDims(const std::vector<hsize_t>& dims, int mip) {
+std::vector<hsize_t> mipDims(const std::vector<hsize_t>& dims, int mipXY, int mipZ) {
     int N = dims.size();
     auto mipDims = dims;
-    
-    for (auto i = std::max(0, N - 2); i < N; i++) {
-        mipDims[i] = std::ceil((float)mipDims[i] / mip);
+    if (mipXY > 1) {
+        for (auto i = std::max(0, N - 2); i < N; i++) {
+            mipDims[i] = std::ceil((float)mipDims[i] / mipXY);
+        }
     }
-    
+    if (mipZ > 1 && N > 2)
+        mipDims[N - 3] = std::ceil((float)mipDims[N - 3] / mipZ);
     return mipDims;
 }
 
diff --git a/src/Util.h b/src/Util.h
index 6518829..be9519e 100644
--- a/src/Util.h
+++ b/src/Util.h
@@ -11,7 +11,7 @@
 std::vector<std::string> split(const std::string &str, char separator);
 std::vector<hsize_t> trimAxes(const std::vector<hsize_t>& dims, int N);
 std::vector<hsize_t> extend(const std::vector<hsize_t>& left, const std::vector<hsize_t>& right);
-std::vector<hsize_t> mipDims(const std::vector<hsize_t>& dims, int mip);
+std::vector<hsize_t> mipDims(const std::vector<hsize_t>& dims, int mipXY, int mipZ);
 hsize_t product(const std::vector<hsize_t>& dims);
 
 template <typename T>

From 3ff9618452f5de86a505ac2304ef1520800bff86 Mon Sep 17 00:00:00 2001
From: CosmicElysium <alexandersivitilli@gmail.com>
Date: Mon, 7 Aug 2023 16:09:33 +0200
Subject: [PATCH 02/26] Added z-mipmap arguments and tests

---
 scripts/convertertest.py | 78 +++++++++++++++++++++++++++++++++-------
 src/Converter.cc         | 10 +++---
 src/Converter.h          |  8 ++---
 src/FastConverter.cc     |  2 +-
 src/MipMap.cc            |  4 +--
 src/MipMap.h             |  2 +-
 src/SlowConverter.cc     |  2 +-
 src/main.cc              | 17 +++++----
 8 files changed, 90 insertions(+), 33 deletions(-)

diff --git a/scripts/convertertest.py b/scripts/convertertest.py
index 0e5c699..6f630bf 100755
--- a/scripts/convertertest.py
+++ b/scripts/convertertest.py
@@ -19,7 +19,7 @@
 def pprint_sparse_diff(one, two, tolerance=1e-6):
     return "\n".join(["%r: %g" % (i, v) for i, v in np.ndenumerate(np.abs(one - two)) if v > tolerance])
 
-def compare_fits_hdf5(fitsname, hdf5name):
+def compare_fits_hdf5(fitsname, hdf5name, zmips=False):
 
     fitsfile = fits.open(fitsname)
     hdf5file = h5py.File(hdf5name)
@@ -119,21 +119,45 @@ def assert_close(stat, func, data=fitsdata):
     
     if "MipMaps" in hdf5file["0"]:
         for mname, mipmap in sorted(hdf5file["0/MipMaps"]["DATA"].items(), key=lambda x: x[1].size, reverse=True):
-            factor = int(re.match(r"DATA_XY_(\d+)", mname).group(1))
+            if (re.search("_XYZ_", mname)):
+                factors = re.findall(r"_(\d+)", mname)
+                if (len(factors) == 3):
+                    xyfactor = int(factors[0])
+                    zfactor = int(factors[2])
+                elif (len(factors) == 1):
+                    xyfactor = zfactor = int(factors[0])
+                else:
+                    assert False, "Wrong number of mipmap factors found: {}".format(mname)
+            elif (re.search("_XY_", mname)):
+                xyfactor = int(re.match(r"DATA_XY_(\d+)", mname).group(1))
+                zfactor = 1
+            elif (re.search("_Z_", mname)):
+                zfactor = int(re.match(r"DATA_Z_(\d+)", mname).group(1))
+                xyfactor = 1
+            else:
+                assert False, "Unknown mipmap found: {}".format(mname)
+
             mheight, mwidth = mipmap.shape[-2:]
-            
-            assert mwidth == np.ceil(width / factor) and mheight == np.ceil(height / factor), "Dimensions of mipmap %s are incorrect. Expected: %r Got: %r" % (mname, d.shape[:-2] + (mheight, mwidth), mipmap.shape)
-        
+
+            assert mwidth == np.ceil(width / xyfactor) and mheight == np.ceil(height / xyfactor) , "Dimensions of mipmap %s are incorrect. Expected: %r Got: %r" % (mname, d.shape[:-2] + (mheight, mwidth), mipmap.shape)
+
+            if (zmips and len(mipmap.shape) > 2):
+                mdepth = mipmap.shape[-3]
+                assert mdepth == np.ceil(depth / zfactor)
+
             # check mipmap contents
             
             def assert_mipmap_channel_equal(name, channel, d, got):
                 got = np.array(got)
-                expected = np.array([[np.nanmean(d[y*factor:(y+1)*factor, x*factor:(x+1)*factor]) for x in range(mwidth)] for y in range(mheight)])
+                if zmips and len(mipmap.shape) > 2:
+                    expected = np.array([[[np.nanmean(d[z*zfactor:(z+1)*zfactor, y*xyfactor:(y+1)*xyfactor, x*xyfactor:(x+1)*xyfactor]) for x in range(mwidth)] for y in range(mheight)] for z in range(mdepth)])
+                else:
+                    expected = np.array([[np.nanmean(d[y*xyfactor:(y+1)*xyfactor, x*xyfactor:(x+1)*xyfactor]) for x in range(mwidth)] for y in range(mheight)])
                 assert_allclose(expected, got, rtol=1e-5, atol=1e-7, equal_nan=True, err_msg = "Mipmap %s channel %r is incorrect. \nEXPECTED:\n%r\nGOT:\n%r\nDIFF:\n%s" % (name, channel, expected, got, pprint_sparse_diff(expected, got)))
             
             d = np.array(hdf5data)
-            rest = mipmap.shape[:-2]
-            
+            rest = mipmap.shape[:-3] if zmips else mipmap.shape[:-2]
+
             if rest:
                 for i in np.ndindex(rest):
                     assert_mipmap_channel_equal(mname, i, d[i], mipmap[i])
@@ -142,7 +166,9 @@ def assert_mipmap_channel_equal(name, channel, d, got):
 
         # check that the last mipmap is small enough
         assert mheight <= 128 and mwidth <= 128, "Smallest mipmap (%s) does not fit in 128x128 tile (dims: (%d, %d))" % (mname, mwidth, mheight)
-    
+        if (zmips and len(mipmap.shape) > 2):
+            assert mdepth <= 128
+
     fitsfile.close()
     hdf5file.close()
 
@@ -184,10 +210,12 @@ def make_image(outfile, *dims, **params):
     result = subprocess.run(cmd)
     assert result.returncode == 0, "Image generation failed."
     
-def convert(infile, outfile, executable, slow=False):
+def convert(infile, outfile, executable, slow=False, zMips=False):
     cmd = [executable]
     if slow:
         cmd.append("-s")
+    if zMips:
+        cmd.append("-z")
     cmd.extend(["-o", outfile, infile])
     
     print(*cmd)
@@ -280,6 +308,21 @@ def large_mipmap_image_set():
                 image_set.append((dims, params))
     return image_set
 
+def depth_mipmap_image_set():
+    image_set = []
+
+    # A few 3d images to test z mipmaps
+    for dims in ((130, 130), (130, 130, 130), (130, 130, 130, 2)):
+        for nans in (("pixel",),):
+            for nan_density in (50,):
+                params = {
+                    "--nans": nans,
+                    "--nan-density": nan_density
+                }
+
+                image_set.append((dims, params))
+    return image_set
+
 def large_timer_image_set(slow=False):
     if slow:
         return [
@@ -326,6 +369,7 @@ def dummy_image_set():
     "SMALL_NANS": small_nans_image_set(),
     "SMALL_DIMS": small_dims_image_set(),
     "LARGE_MIPMAP": large_mipmap_image_set(),
+    "DEPTH_MIPMAP": depth_mipmap_image_set(),
     "LARGE_TIMER_SQUARE_FAST": large_timer_image_set(slow=False),
     "LARGE_TIMER_SQUARE_SLOW": large_timer_image_set(slow=True),
     "WIDE_TIMER_SQUARE":  wide_timer_image_set(),
@@ -333,7 +377,7 @@ def dummy_image_set():
     "DUMMY":  dummy_image_set(),
 
 }
-            
+
 def test_speed(args, *image_sets):
     executables = [args.executable]
     if "compare" in args:
@@ -346,7 +390,7 @@ def test_speed(args, *image_sets):
             for image_set in image_sets:
                 for dims, _ in image_set: # we only care about the dimensions
                     make_image("test.fits", *dims)
-                    t = time("test.fits", new_converter, kwargs["slow"])
+                    t = time("test.fits", executable, args.slow)
                     if t is not None:
                         times[dims][executable].append(t)
     
@@ -384,6 +428,11 @@ def test_speed(args, *image_sets):
                     print('-', end='\t')
             print()
 
+def test_zmips_converter(infile, new_converter):
+    convert(infile, "ZMIPS.hdf5", new_converter, False, True)
+    compare_fits_hdf5(infile, "ZMIPS.hdf5", True)
+    subprocess.run(["rm", "test.fits", "ZMIPS.hdf5"])
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Test for the HDF5 converter")
     parser.add_argument('-c', '--compare', help='A path to another converter executable to use as a reference. If a reference converter is given, random files converted using the fast algorithm with both converters will be compared to each other. If none is given, the output of the converter at the default path will be checked for correctness (THIS IS SLOW), and its fast and slow algorithm outputs will be compared for consistency.')
@@ -391,18 +440,21 @@ def test_speed(args, *image_sets):
     parser.add_argument('-s', '--slow', action='store_true', help='Use the slow converter versions when comparing converters or timing converter(s).')
     parser.add_argument('-i', '--image-set', nargs="+", help="The image set(s) to use. Any combination of %s. By default a small dummy set is used." % ", ".join(repr(o) for o in IMAGE_SETS) , default=["DUMMY"])
     parser.add_argument('-r', '--repeat', type=int, help="The number of times to repeat timed conversions (default: 3).", default=3)
+    parser.add_argument('-z', '--zmips', action='store_true', help='Test if HDF5 conversion for depth MipMaps produces valid result.')
     parser.add_argument("executable", help="The path to the executable to test.")
     args = parser.parse_args()
     
     image_sets = (IMAGE_SETS[i] for i in args.image_set)
     
     if args.time:
-        test_speed(args, *image_sets)
+        test_speed(args, *image_sets, args.executable)
     else:
         for image_set in image_sets:
             for dims, params in image_set:
                 make_image("test.fits", *dims, **params)
                 if args.compare:
                     test_new_old_converter("test.fits", args.slow, args.compare, args.executable)
+                elif args.zmips:
+                    test_zmips_converter("test.fits", args.executable);
                 else:
                     test_converter_correctness("test.fits", args.executable)
diff --git a/src/Converter.cc b/src/Converter.cc
index a274645..85bf99a 100644
--- a/src/Converter.cc
+++ b/src/Converter.cc
@@ -5,7 +5,7 @@
 
 #include "Converter.h"
 
-Converter::Converter(std::string inputFileName, std::string outputFileName, bool progress) : timer(), progress(progress) {
+Converter::Converter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips) : timer(), progress(progress) {
     TIMER(timer.start("Setup"););
     
     openFitsFile(&inputFilePtr, inputFileName);
@@ -39,7 +39,7 @@ Converter::Converter(std::string inputFileName, std::string outputFileName, bool
     }
     
     // MIPMAPS
-    mipMaps = MipMaps(standardDims, tileDims);
+    mipMaps = MipMaps(standardDims, tileDims, zMips);
     
     // Prepare output file
     this->outputFileName = outputFileName;
@@ -52,11 +52,11 @@ Converter::~Converter() {
     closeFitsFile(inputFilePtr);
 }
 
-std::unique_ptr<Converter> Converter::getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool progress) {
+std::unique_ptr<Converter> Converter::getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool progress, bool zMips) {
     if (slow) {
-        return std::unique_ptr<Converter>(new SlowConverter(inputFileName, outputFileName, progress));
+        return std::unique_ptr<Converter>(new SlowConverter(inputFileName, outputFileName, progress, zMips));
     } else {
-        return std::unique_ptr<Converter>(new FastConverter(inputFileName, outputFileName, progress));
+        return std::unique_ptr<Converter>(new FastConverter(inputFileName, outputFileName, progress, zMips));
     }
 }
 
diff --git a/src/Converter.h b/src/Converter.h
index ffa5bbe..1f3817e 100644
--- a/src/Converter.h
+++ b/src/Converter.h
@@ -23,10 +23,10 @@ struct MemoryUsage {
 class Converter {
 public:
     Converter() {}
-    Converter(std::string inputFileName, std::string outputFileName, bool progress);
+    Converter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
     ~Converter();
     
-    static std::unique_ptr<Converter> getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool progress);
+    static std::unique_ptr<Converter> getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool progress, bool zMips);
     void convert();
     void reportMemoryUsage();
     virtual MemoryUsage calculateMemoryUsage() = 0;
@@ -74,7 +74,7 @@ class Converter {
 
 class FastConverter : public Converter {
 public:
-    FastConverter(std::string inputFileName, std::string outputFileName, bool progress);
+    FastConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
     MemoryUsage calculateMemoryUsage() override;
     
 protected:
@@ -84,7 +84,7 @@ class FastConverter : public Converter {
 
 class SlowConverter : public Converter {
 public:
-    SlowConverter(std::string inputFileName, std::string outputFileName, bool progress);
+    SlowConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
     MemoryUsage calculateMemoryUsage() override;
     
 protected:
diff --git a/src/FastConverter.cc b/src/FastConverter.cc
index 9b9fb6e..2a8cb3e 100644
--- a/src/FastConverter.cc
+++ b/src/FastConverter.cc
@@ -6,7 +6,7 @@
 #include "Converter.h"
 
 // TODO do we need these?
-FastConverter::FastConverter(std::string inputFileName, std::string outputFileName, bool progress) : Converter(inputFileName, outputFileName, progress) {}
+FastConverter::FastConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips) : Converter(inputFileName, outputFileName, progress, zMips) {}
 
 MemoryUsage FastConverter::calculateMemoryUsage() {
     MemoryUsage m;
diff --git a/src/MipMap.cc b/src/MipMap.cc
index 28f6e71..fedeea7 100644
--- a/src/MipMap.cc
+++ b/src/MipMap.cc
@@ -73,7 +73,7 @@ void MipMap::resetBuffers() {
 
 // MipMaps
 
-MipMaps::MipMaps(std::vector<hsize_t> standardDims, const std::vector<hsize_t>& chunkDims) : standardDims(standardDims), chunkDims(chunkDims) {
+MipMaps::MipMaps(std::vector<hsize_t> standardDims, const std::vector<hsize_t>& chunkDims, bool zMips) : standardDims(standardDims), chunkDims(chunkDims) {
     auto dims = standardDims;
     int N = dims.size();
     int mipXY = 1;
@@ -81,7 +81,7 @@ MipMaps::MipMaps(std::vector<hsize_t> standardDims, const std::vector<hsize_t>&
     
     // We keep going until we have a mipmap which fits entirely within the minimum size
     do {
-        if (N > 2) {
+        if (N > 2 && zMips) {
             do {
                 if (mipXY > 1 || mipZ > 1)
                     mipMaps.push_back(MipMap(dims, mipXY, mipZ));
diff --git a/src/MipMap.h b/src/MipMap.h
index b6bc4d0..9947d5d 100644
--- a/src/MipMap.h
+++ b/src/MipMap.h
@@ -58,7 +58,7 @@ struct MipMap {
 // A set of mipmaps
 struct MipMaps {
     MipMaps() {};
-    MipMaps(std::vector<hsize_t> standardDims, const std::vector<hsize_t>& chunkDims);
+    MipMaps(std::vector<hsize_t> standardDims, const std::vector<hsize_t>& chunkDims, bool zMips);
     
     // We need the dataset dimensions to work out how many mipmaps we have
     static hsize_t size(const std::vector<hsize_t>& standardDims, const std::vector<hsize_t>& standardBufferDims);
diff --git a/src/SlowConverter.cc b/src/SlowConverter.cc
index ef7c08a..00f5ece 100644
--- a/src/SlowConverter.cc
+++ b/src/SlowConverter.cc
@@ -5,7 +5,7 @@
 
 #include "Converter.h"
 
-SlowConverter::SlowConverter(std::string inputFileName, std::string outputFileName, bool progress) : Converter(inputFileName, outputFileName, progress) {}
+SlowConverter::SlowConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips) : Converter(inputFileName, outputFileName, progress, zMips) {}
 
 MemoryUsage SlowConverter::calculateMemoryUsage() {
     MemoryUsage m;
diff --git a/src/main.cc b/src/main.cc
index bf912aa..2e53570 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -9,7 +9,7 @@
 #include <sstream>
 #include "Converter.h"
 
-bool getOptions(int argc, char** argv, std::string& inputFileName, std::string& outputFileName, bool& slow, bool& progress, bool& onlyReportMemory) {
+bool getOptions(int argc, char** argv, std::string& inputFileName, std::string& outputFileName, bool& slow, bool& progress, bool& onlyReportMemory, bool& zMips) {
     extern int optind;
     extern char *optarg;
     
@@ -19,15 +19,16 @@ bool getOptions(int argc, char** argv, std::string& inputFileName, std::string&
     std::ostringstream usage;
     usage << "IDIA FITS to HDF5 converter version " << HDF5_CONVERTER_VERSION 
     << " using IDIA schema version " << SCHEMA_VERSION << std::endl
-    << "Usage: fits2idia [-o output_filename] [-s] [-p] [-m] input_filename" << std::endl << std::endl
+    << "Usage: fits2idia [-o output_filename] [-s] [-p] [-m] [-z] input_filename" << std::endl << std::endl
     << "Options:" << std::endl 
     << "-o\tOutput filename" << std::endl 
     << "-s\tUse slower but less memory-intensive method (enable if memory allocation fails)" << std::endl 
     << "-p\tPrint progress output (by default the program is silent)" << std::endl
     << "-m\tReport predicted memory usage and exit without performing the conversion" << std::endl
-    << "-q\tSuppress all non-error output. Deprecated; this is now the default." << std::endl;
+    << "-q\tSuppress all non-error output. Deprecated; this is now the default." << std::endl
+    << "-z\tCalculate mipmaps for depth" << std::endl;
     
-    while ((opt = getopt(argc, argv, ":o:spqm")) != -1) {
+    while ((opt = getopt(argc, argv, ":o:spqmz")) != -1) {
         switch (opt) {
             case 'o':
                 outputFileName.assign(optarg);
@@ -46,6 +47,9 @@ bool getOptions(int argc, char** argv, std::string& inputFileName, std::string&
                 // only print memory usage and exit
                 onlyReportMemory = true;
                 break;
+            case 'z':
+                zMips = true;
+                break;
             case ':':
                 err = true;
                 std::cerr << "Missing argument for option " << opt << "." << std::endl;
@@ -94,8 +98,9 @@ int main(int argc, char** argv) {
     bool slow(false);
     bool progress(false);
     bool onlyReportMemory(false);
+    bool zMips(false);
     
-    if (!getOptions(argc, argv, inputFileName, outputFileName, slow, progress, onlyReportMemory)) {
+    if (!getOptions(argc, argv, inputFileName, outputFileName, slow, progress, onlyReportMemory, zMips)) {
         return 1;
     }
     
@@ -121,7 +126,7 @@ int main(int argc, char** argv) {
     std::unique_ptr<Converter> converter;
         
     try {
-        converter = Converter::getConverter(inputFileName, outputFileName, slow, progress);
+        converter = Converter::getConverter(inputFileName, outputFileName, slow, progress, zMips);
         
         if (onlyReportMemory) {
             converter->reportMemoryUsage();

From 6bca7d97763b690e1965f9756661dbe3823e8c3e Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Tue, 8 Aug 2023 09:37:51 +0200
Subject: [PATCH 03/26] Update README.md with new -z option

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 13f56ee..f3788d6 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,7 @@ important are:
 -s      Use slower but less memory-intensive method (enable if memory allocation fails)
 -p      Print progress output (by default the program is silent)
 -m      Report predicted memory usage and exit without performing the conversion
+-z      Include calculation of mipmaps along 3rd axis of dataset
 ```
 
 ## Configuration

From 220ffe2454577903b8dcc5620d97693564d3b390 Mon Sep 17 00:00:00 2001
From: CosmicElysium <alexandersivitilli@gmail.com>
Date: Wed, 9 Aug 2023 15:38:03 +0200
Subject: [PATCH 04/26] Added error message if -s and -z are both used

---
 src/main.cc | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/main.cc b/src/main.cc
index 2e53570..2c03133 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -26,8 +26,8 @@ bool getOptions(int argc, char** argv, std::string& inputFileName, std::string&
     << "-p\tPrint progress output (by default the program is silent)" << std::endl
     << "-m\tReport predicted memory usage and exit without performing the conversion" << std::endl
     << "-q\tSuppress all non-error output. Deprecated; this is now the default." << std::endl
-    << "-z\tCalculate mipmaps for depth" << std::endl;
-    
+    << "-z\tInclude axis 3 in mipmap calculation (currently not compatible with -s mode)." << std::endl;
+
     while ((opt = getopt(argc, argv, ":o:spqmz")) != -1) {
         switch (opt) {
             case 'o':
@@ -103,7 +103,12 @@ int main(int argc, char** argv) {
     if (!getOptions(argc, argv, inputFileName, outputFileName, slow, progress, onlyReportMemory, zMips)) {
         return 1;
     }
-    
+
+    if (slow && zMips){
+        std::cerr << "Currently unable to include depth in mipmap calculation for -s mode." << std::endl;
+        return -1;
+    }
+
     hsize_t memoryLimit(0);
     
     std::ifstream rcFile("/etc/fits2idiarc");

From 28b43a207c4fb11a54589b4cc101cb2f359ee0ff Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 23 Oct 2023 10:25:11 +0200
Subject: [PATCH 05/26] Updated memory usage calculation for case of depth
 mipmap calculation

---
 src/Converter.cc     | 2 +-
 src/Converter.h      | 1 +
 src/FastConverter.cc | 2 +-
 src/MipMap.cc        | 4 ++--
 src/MipMap.h         | 3 ++-
 src/SlowConverter.cc | 2 +-
 6 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/Converter.cc b/src/Converter.cc
index 85bf99a..b011fc0 100644
--- a/src/Converter.cc
+++ b/src/Converter.cc
@@ -5,7 +5,7 @@
 
 #include "Converter.h"
 
-Converter::Converter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips) : timer(), progress(progress) {
+Converter::Converter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips) : timer(), progress(progress), zMips(zMips) {
     TIMER(timer.start("Setup"););
     
     openFitsFile(&inputFilePtr, inputFileName);
diff --git a/src/Converter.h b/src/Converter.h
index 1f3817e..60ca35a 100644
--- a/src/Converter.h
+++ b/src/Converter.h
@@ -36,6 +36,7 @@ class Converter {
     
     Timer timer;
     bool progress;
+    bool zMips;
     
     std::string tempOutputFileName;
     std::string outputFileName;
diff --git a/src/FastConverter.cc b/src/FastConverter.cc
index 2a8cb3e..325eeeb 100644
--- a/src/FastConverter.cc
+++ b/src/FastConverter.cc
@@ -12,7 +12,7 @@ MemoryUsage FastConverter::calculateMemoryUsage() {
     MemoryUsage m;
     
     m.sizes["Main dataset"] = depth * height * width * sizeof(float);
-    m.sizes["Mipmaps"] = MipMaps::size(standardDims, {depth, height, width});
+    m.sizes["Mipmaps"] = MipMaps::size(standardDims, {depth, height, width}, zMips);
     m.sizes["XY stats"] = Stats::size({depth}, numBins);
     
     if (depth > 1) {
diff --git a/src/MipMap.cc b/src/MipMap.cc
index fedeea7..c970a8c 100644
--- a/src/MipMap.cc
+++ b/src/MipMap.cc
@@ -98,7 +98,7 @@ MipMaps::MipMaps(std::vector<hsize_t> standardDims, const std::vector<hsize_t>&
 
 }
 
-hsize_t MipMaps::size(const std::vector<hsize_t>& standardDims, const std::vector<hsize_t>& standardBufferDims) {
+hsize_t MipMaps::size(const std::vector<hsize_t>& standardDims, const std::vector<hsize_t>& standardBufferDims, bool zMips) {
     hsize_t size = 0;
     int mipXY = 1;
     int mipZ = 1;
@@ -107,7 +107,7 @@ hsize_t MipMaps::size(const std::vector<hsize_t>& standardDims, const std::vecto
     int N = standardDims.size();
     
     do {
-        if (N > 2) {
+        if (N > 2 && zMips) {
             do {
                 if (mipXY > 1 || mipZ > 1)
                     size += (sizeof(double) + sizeof(int)) * product(bufferDims);
diff --git a/src/MipMap.h b/src/MipMap.h
index 9947d5d..c73f694 100644
--- a/src/MipMap.h
+++ b/src/MipMap.h
@@ -61,7 +61,8 @@ struct MipMaps {
     MipMaps(std::vector<hsize_t> standardDims, const std::vector<hsize_t>& chunkDims, bool zMips);
     
     // We need the dataset dimensions to work out how many mipmaps we have
-    static hsize_t size(const std::vector<hsize_t>& standardDims, const std::vector<hsize_t>& standardBufferDims);
+    static hsize_t size(const std::vector<hsize_t>& standardDims, const std::vector<hsize_t>& standardBufferDims, bool zMips);
+    
     
     void createDatasets(H5::Group group);
     void createBuffers(const std::vector<hsize_t>& standardBufferDims);
diff --git a/src/SlowConverter.cc b/src/SlowConverter.cc
index 00f5ece..5d2506a 100644
--- a/src/SlowConverter.cc
+++ b/src/SlowConverter.cc
@@ -11,7 +11,7 @@ MemoryUsage SlowConverter::calculateMemoryUsage() {
     MemoryUsage m;
 
     m.sizes["Main dataset"] = height * width * sizeof(float);
-    m.sizes["Mipmaps"] = MipMaps::size(standardDims, {1, height, width});
+    m.sizes["Mipmaps"] = MipMaps::size(standardDims, {1, height, width}, zMips);
     m.sizes["XY stats"] = Stats::size({depth}, numBins);
     
     if (depth > 1) {

From a67bef24a3bbcc7c2c18c675ef33ec78a24c190b Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 30 Oct 2023 15:04:37 +0200
Subject: [PATCH 06/26] Added SmartConverter that speeds up XY stats
 calculation with OpenMP

Still possible race conditions present. Need to do further testing.
---
 CMakeLists.txt        |   3 +-
 src/Converter.cc      |   4 +-
 src/Converter.h       |  11 +-
 src/SmartConverter.cc | 378 ++++++++++++++++++++++++++++++++++++++++++
 src/Stats.h           |   8 +
 src/main.cc           |  16 +-
 6 files changed, 412 insertions(+), 8 deletions(-)
 create mode 100644 src/SmartConverter.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 292a228..41b3d0b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,7 +52,8 @@ set(SOURCE_FILES
     src/Converter.cc
     src/FastConverter.cc
     src/SlowConverter.cc
-    src/Util.cc)
+    src/SmartConverter.cc
+    src/Util.cc )
 
 add_executable(fits2idia ${SOURCE_FILES})
 target_link_libraries(fits2idia ${LINK_LIBS})
diff --git a/src/Converter.cc b/src/Converter.cc
index b011fc0..1c1c06b 100644
--- a/src/Converter.cc
+++ b/src/Converter.cc
@@ -52,9 +52,11 @@ Converter::~Converter() {
     closeFitsFile(inputFilePtr);
 }
 
-std::unique_ptr<Converter> Converter::getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool progress, bool zMips) {
+std::unique_ptr<Converter> Converter::getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool smart, bool progress, bool zMips) {
     if (slow) {
         return std::unique_ptr<Converter>(new SlowConverter(inputFileName, outputFileName, progress, zMips));
+    } else if (smart) {
+        return std::unique_ptr<Converter>(new SmartConverter(inputFileName, outputFileName, progress, zMips));
     } else {
         return std::unique_ptr<Converter>(new FastConverter(inputFileName, outputFileName, progress, zMips));
     }
diff --git a/src/Converter.h b/src/Converter.h
index 60ca35a..9d2a62f 100644
--- a/src/Converter.h
+++ b/src/Converter.h
@@ -26,7 +26,7 @@ class Converter {
     Converter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
     ~Converter();
     
-    static std::unique_ptr<Converter> getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool progress, bool zMips);
+    static std::unique_ptr<Converter> getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool smart, bool progress, bool zMips);
     void convert();
     void reportMemoryUsage();
     virtual MemoryUsage calculateMemoryUsage() = 0;
@@ -83,6 +83,15 @@ class FastConverter : public Converter {
 };
 
 
+class SmartConverter : public Converter {
+public:
+    SmartConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
+    MemoryUsage calculateMemoryUsage() override;
+    
+protected:
+    void copyAndCalculate() override;
+};
+
 class SlowConverter : public Converter {
 public:
     SlowConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
new file mode 100644
index 0000000..147fb1f
--- /dev/null
+++ b/src/SmartConverter.cc
@@ -0,0 +1,378 @@
+/* This file is part of the FITS to IDIA file format converter: https://github.com/idia-astro/fits2idia
+   Copyright 2019, 2020, 2021 the Inter-University Institute for Data Intensive Astronomy (IDIA)
+   SPDX-License-Identifier: GPL-3.0-or-later
+*/
+
+#include "Converter.h"
+
+SmartConverter::SmartConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips) : Converter(inputFileName, outputFileName, progress, zMips) {}
+
+MemoryUsage SmartConverter::calculateMemoryUsage() {
+    MemoryUsage m;
+
+    m.sizes["Main dataset"] = height * width * sizeof(float);
+    m.sizes["Mipmaps"] = MipMaps::size(standardDims, {1, height, width}, zMips);
+    m.sizes["XY stats"] = Stats::size({depth}, numBins);
+    
+    if (depth > 1) {
+        m.sizes["Rotation"] = 2 * product(trimAxes({stokes, depth, TILE_SIZE, TILE_SIZE}, N)) * sizeof(float);
+        m.sizes["XYZ stats"] = Stats::size({}, numBins, depth);
+        m.sizes["Z stats"] = Stats::size({TILE_SIZE, TILE_SIZE});
+    }
+    
+    for (auto& kv : m.sizes) {
+        m.total += kv.second;
+    }
+    
+    if (depth > 1) {
+        m.total -= std::min(m.sizes["Main dataset"], m.sizes["Rotation"] + m.sizes["Z stats"]);
+        m.note = " (Main dataset and slices for rotation and Z statistics are not allocated at the same time.)";
+    }
+
+    return m;
+}
+
+void SmartConverter::copyAndCalculate() {
+    const hsize_t channelProgressStride = std::max((hsize_t)1, (hsize_t)(depth / 100));
+    hsize_t numTiles = std::ceil(width / TILE_SIZE) * std::ceil(height / TILE_SIZE);
+    const hsize_t tileProgressStride = std::max((hsize_t)1, (hsize_t)(numTiles / 100));
+    
+    // Allocate one channel at a time, and no swizzled data
+    hsize_t cubeSize = height * width;
+    TIMER(timer.start("Allocate"););
+    standardCube = new float[cubeSize];
+    
+    // Allocate one stokes of stats at a time
+    statsXY.createBuffers({depth});
+    
+    if (depth > 1) {
+        statsXYZ.createBuffers({}, depth);
+    }
+    
+    mipMaps.createBuffers({1, height, width});
+
+    std::vector<hsize_t> count = trimAxes({1, 1, height, width}, N);
+    std::vector<hsize_t> memDims = {height, width};
+    
+    std::string timerLabelStatsMipmaps = depth > 1 ? "XY and XYZ statistics and mipmaps" : "XY statistics and mipmaps";
+
+    
+    for (unsigned int s = 0; s < stokes; s++) {
+        DEBUG(std::cout << "Processing Stokes " << s << "... " << std::endl;);
+        PROGRESS("Stokes " << s << ":" << std::endl);
+        
+        PROGRESS("\tMain loop\t");
+        
+        StatsCounter counterXYZ;
+        
+        for (hsize_t c = 0; c < depth; c++) {
+            PROGRESS_DECIMATED(c, channelProgressStride, "|");
+            // read one channel
+            DEBUG(std::cout << "+ Processing channel " << c << "... " << std::flush;);
+            DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+            TIMER(timer.start("Read"););
+            readFitsData(inputFilePtr, c, s, cubeSize, standardCube);
+            
+            // Write the standard dataset
+            
+            DEBUG(std::cout << " Writing main dataset..." << std::flush;);
+            TIMER(timer.start("Write"););
+            
+            std::vector<hsize_t> start = trimAxes({s, c, 0, 0}, N);
+            writeHdf5Data(standardDataSet, standardCube, memDims, count, start);
+            
+            DEBUG(std::cout << " Accumulating XY stats and mipmaps..." << std::flush;);
+            TIMER(timer.start(timerLabelStatsMipmaps););
+
+            StatsCounter counterXY;
+            StatsCounter counterX;
+            auto indexXY = c;
+            std::function<void(float)> accumulate;
+            
+            //
+            
+            auto lazy_accumulate = [&] (float val) {
+                counterXY.accumulateFiniteLazy(val);
+            };
+            
+            auto first_accumulate = [&] (float val) {
+                counterXY.accumulateFiniteLazyFirst(val);
+                accumulate = lazy_accumulate;
+            };
+            
+            accumulate = first_accumulate;
+            
+            hsize_t  y,x;
+#pragma omp parallel for default(none) private (y, counterX) shared (counterXY, mipMaps)
+            for (y = 0; y < height; y++) {
+                counterX.reset();
+                for (hsize_t x = 0; x < width; x++) {
+                    auto pos = y * width + x; // relative to channel slice
+                    auto& val = standardCube[pos];
+
+
+                    if (std::isfinite(val)) {
+                        // XY statistics
+                        counterX.accumulateFinite(val);
+                        
+                        // Accumulate mipmaps
+                        mipMaps.accumulate(val, x, y, 0);   //possible race condiion here...
+                        
+                    } else {
+                        counterX.accumulateNonFinite();
+                    }
+                }
+#pragma omp critical
+                {
+                    counterXY.minVal = fmin(counterXY.minVal, counterX.minVal); //an "accumulatestatstocounter" would be better here like below
+                    counterXY.maxVal = fmax(counterXY.maxVal, counterX.maxVal);
+                    counterXY.sum += counterX.sum;
+                    counterXY.sumSq += counterX.sumSq;
+                    counterXY.nanCount += counterX.nanCount;
+                }
+
+            } // end of XY loop
+            
+            // Final correction of XY min and max
+            DEBUG(std::cout << " Final XY stats..." << std::flush;);
+            statsXY.copyStatsFromCounter(indexXY, height * width, counterXY);
+            
+            // Accumulate XYZ statistics
+            if (depth > 1) {
+                DEBUG(std::cout << " Accumulating XYZ stats..." << std::flush;);
+                statsXY.accumulateStatsToCounter(counterXYZ, indexXY);
+            }
+            
+            // Final mipmap calculation
+            DEBUG(std::cout << " Final mipmaps..." << std::flush;);
+            mipMaps.calculate();
+            
+            
+            // Write the mipmaps
+            DEBUG(std::cout << " Writing mipmaps..." << std::flush;);
+            TIMER(timer.start("Write"););
+            mipMaps.write(s, c);
+            
+            // Reset mipmaps before next channel
+            DEBUG(std::cout << " Resetting mipmap objects..." << std::endl;);
+            TIMER(timer.start(timerLabelStatsMipmaps););
+            mipMaps.resetBuffers();
+            
+        } // end of first channel loop
+        
+        PROGRESS(std::endl);
+        
+        if (depth > 1) {
+            // Final correction of XYZ min and max
+            DEBUG(std::cout << " Final XYZ stats..." << std::flush;);
+            PROGRESS("\tXYZ stats" << std::endl);
+            TIMER(timer.start(timerLabelStatsMipmaps););
+            statsXYZ.copyStatsFromCounter(0, depth * height * width, counterXYZ);
+        }
+        
+        // XY and XYZ histograms
+        // We need a second pass over all channels because we need cube min and max (and channel min and max per channel)
+        // We do the second pass backwards to take advantage of caching
+        DEBUG(std::cout << " Histograms..." << std::endl;);
+        PROGRESS("\tHistograms\t");
+        TIMER(timer.start("Histograms"););
+        
+        double cubeMin;
+        double cubeMax;
+        double cubeRange;
+        bool cubeHist(false);
+        
+        if (depth > 1) {
+            cubeMin = statsXYZ.minVals[0];
+            cubeMax = statsXYZ.maxVals[0];
+            cubeRange = cubeMax - cubeMin;
+            cubeHist = std::isfinite(cubeMin) && std::isfinite(cubeMax) && cubeRange > 0;
+        }
+        
+        statsXY.clearHistogramBuffers();
+        statsXYZ.clearHistogramBuffers();
+        
+        DEBUG(std::cout << "+ Will " << (cubeHist ? "" : "not ") << "calculate cube histogram." << std::endl;);
+        
+        for (hsize_t c = depth; c-- > 0; ) {
+            DEBUG(std::cout << "+ Processing channel " << c << "... " << std::flush;);
+            PROGRESS_DECIMATED(c, channelProgressStride, "|");
+            auto indexXY = c;
+                            
+            double chanMin = statsXY.minVals[indexXY];
+            double chanMax = statsXY.maxVals[indexXY];
+            double chanRange = chanMax - chanMin;
+            
+            bool chanHist(std::isfinite(chanMin) && std::isfinite(chanMax) && chanRange > 0);
+            DEBUG(std::cout << " Will " << (chanHist ? "" : "not ") << "calculate channel histogram." << std::flush;);
+            
+            if (!chanHist && !cubeHist) {
+                continue;
+            }
+            
+            auto doChannelHistogram = [&] (float val) {
+                // XY histogram
+                statsXY.accumulateHistogram(val, chanMin, chanRange, c);
+            };
+            
+            auto doCubeHistogram = [&] (float val) {
+                // XYZ histogram
+                statsXYZ.accumulateHistogram(val, cubeMin, cubeRange, 0);
+            };
+            
+            auto doNothing = [&] (float val) {
+                UNUSED(val);
+            };
+            
+            std::function<void(float)> channelHistogramFunc = doChannelHistogram;
+            std::function<void(float)> cubeHistogramFunc = doCubeHistogram;
+            
+            if (!chanHist) {
+                channelHistogramFunc = doNothing;
+            }
+            
+            if (!cubeHist) {
+                cubeHistogramFunc = doNothing;
+            }
+            
+            // read one channel
+            DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+            TIMER(timer.start("Read"););
+            
+            readFitsData(inputFilePtr, c, s, cubeSize, standardCube);
+
+            DEBUG(std::cout << " Calculating histogram(s)..." << std::endl;);
+            TIMER(timer.start("Histograms"););
+            
+            for (hsize_t p = 0; p < width * height; p++) {
+                auto& val = standardCube[p];
+                    if (std::isfinite(val)) {
+                        channelHistogramFunc(val);
+                        cubeHistogramFunc(val);
+                    }
+            } // end of XY loop
+        } // end of second channel loop (XY and XYZ histograms)
+        
+        PROGRESS(std::endl);
+        
+        // Write the statistics
+        TIMER(timer.start("Write"););
+        PROGRESS("\tWrite stats & mipmaps" << std::endl);
+                
+        statsXY.write({1, depth}, {s, 0});
+        
+        if (depth > 1) {
+            statsXYZ.write({1}, {s});
+        }
+    
+    } // end of stokes
+    
+    // Free memory
+    DEBUG(std::cout << "Freeing memory from main dataset... " << std::endl;);
+    TIMER(timer.start("Free"););
+    
+    delete[] standardCube;
+            
+    // Swizzle
+    if (depth > 1) {
+        DEBUG(std::cout << "Performing tiled rotation." << std::endl;);
+        PROGRESS("Tiled rotation & Z stats" << std::endl);
+        TIMER(timer.start("Allocate"););
+        
+        hsize_t sliceSize = product(trimAxes({stokes, depth, TILE_SIZE, TILE_SIZE}, N));
+        float* standardSlice = new float[sliceSize];
+        float* rotatedSlice = new float[sliceSize];
+        
+        statsZ.createBuffers({TILE_SIZE, TILE_SIZE});
+        
+        for (unsigned int s = 0; s < stokes; s++) {
+            DEBUG(std::cout << "Processing Stokes " << s << "..." << std::endl;);
+            PROGRESS("\tStokes " << s << "\t");
+            
+            hsize_t tileCount(0);
+            
+            for (hsize_t xOffset = 0; xOffset < width; xOffset += TILE_SIZE) {
+                for (hsize_t yOffset = 0; yOffset < height; yOffset += TILE_SIZE) {
+                    tileCount++;
+                    hsize_t xSize = std::min(TILE_SIZE, width - xOffset);
+                    hsize_t ySize = std::min(TILE_SIZE, height - yOffset);
+                    
+                    DEBUG(std::cout << "+ Processing tile slice at " << xOffset << ", " << yOffset << "..." << std::flush;);
+                    PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                    
+                    // read tile slice
+                    DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+                    TIMER(timer.start("Read"););
+                    
+                    auto standardMemDims = trimAxes({1, depth, ySize, xSize}, N);
+                    auto standardCount = trimAxes({1, depth, ySize, xSize}, N);
+                    auto standardStart = trimAxes({s, 0, yOffset, xOffset}, N);
+                    
+                    readHdf5Data(standardDataSet, standardSlice, standardMemDims, standardCount, standardStart);
+                    
+                    // rotate tile slice
+                    DEBUG(std::cout << " Calculating rotation..." << std::flush;);
+                    TIMER(timer.start("Rotation"););
+                    
+                    for (hsize_t i = 0; i < depth; i++) {
+                        for (hsize_t j = 0; j < ySize; j++) {
+                            for (hsize_t k = 0; k < xSize; k++) {
+                                auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                                auto& val = standardSlice[sourceIndex];
+                                
+                                // rotation
+                                auto destIndex = i + depth * j + (ySize * depth) * k;
+                                rotatedSlice[destIndex] = val;
+                            }
+                        }
+                    }
+                    
+                    // A separate pass over the same slice depth-last 
+                    DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
+                    TIMER(timer.start("Z statistics"););
+                    
+                    for (hsize_t j = 0; j < ySize; j++) {
+                        for (hsize_t k = 0; k < xSize; k++) {
+                            StatsCounter counterZ;
+                            auto indexZ = k + xSize * j;
+                            
+                            for (hsize_t i = 0; i < depth; i++) {
+                                auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                                auto& val = standardSlice[sourceIndex];
+                                
+                                if (std::isfinite(val)) {
+                                    // Not lazy; too much risk of encountering an ascending / descending sequence.
+                                    counterZ.accumulateFinite(val);
+                                } else {
+                                    counterZ.accumulateNonFinite();
+                                }
+                            }
+                            
+                            statsZ.copyStatsFromCounter(indexZ, depth, counterZ);
+                        }
+                    }
+                    
+                    // write tile slice
+                    DEBUG(std::cout << " Writing rotated dataset..." << std::endl;);
+                    TIMER(timer.start("Write"););
+                    
+                    auto swizzledMemDims = trimAxes({1, xSize, ySize, depth}, N);
+                    auto swizzledCount = trimAxes({1, xSize, ySize, depth}, N);
+                    auto swizzledStart = trimAxes({s, xOffset, yOffset, 0}, N);
+                    
+                    writeHdf5Data(swizzledDataSet, rotatedSlice, swizzledMemDims, swizzledCount, swizzledStart);
+                    
+                    DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
+                    // write Z statistics
+                    statsZ.write({ySize, xSize}, {1, ySize, xSize}, {s, yOffset, xOffset});
+                }
+            }
+            PROGRESS(std::endl);
+        }
+        
+        TIMER(timer.start("Free"););
+        DEBUG(std::cout << "Freeing memory from main and rotated dataset slices... " << std::endl;);
+        delete[] standardSlice;
+        delete[] rotatedSlice;
+    }
+}
diff --git a/src/Stats.h b/src/Stats.h
index 6b176d3..d2357a1 100644
--- a/src/Stats.h
+++ b/src/Stats.h
@@ -19,6 +19,14 @@ struct StatsCounter {
         sum += val;
         sumSq += val * val;
     }
+    
+    void reset() {
+        minVal = std::numeric_limits<float>::max();
+        maxVal = -std::numeric_limits<float>::max();
+        sum = 0;
+        sumSq = 0;
+        nanCount = 0;
+    }
 
     void accumulateFiniteLazy(float val) {
         if (val < minVal) {
diff --git a/src/main.cc b/src/main.cc
index 2c03133..0b57fab 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -9,7 +9,7 @@
 #include <sstream>
 #include "Converter.h"
 
-bool getOptions(int argc, char** argv, std::string& inputFileName, std::string& outputFileName, bool& slow, bool& progress, bool& onlyReportMemory, bool& zMips) {
+bool getOptions(int argc, char** argv, std::string& inputFileName, std::string& outputFileName, bool& slow, bool& smart, bool& progress, bool& onlyReportMemory, bool& zMips) {
     extern int optind;
     extern char *optarg;
     
@@ -22,13 +22,14 @@ bool getOptions(int argc, char** argv, std::string& inputFileName, std::string&
     << "Usage: fits2idia [-o output_filename] [-s] [-p] [-m] [-z] input_filename" << std::endl << std::endl
     << "Options:" << std::endl 
     << "-o\tOutput filename" << std::endl 
-    << "-s\tUse slower but less memory-intensive method (enable if memory allocation fails)" << std::endl 
+    << "-s\tUse slower but less memory-intensive method (enable if memory allocation fails)" << std::endl
+    << "-r\tUse smart method" << std::endl
     << "-p\tPrint progress output (by default the program is silent)" << std::endl
     << "-m\tReport predicted memory usage and exit without performing the conversion" << std::endl
     << "-q\tSuppress all non-error output. Deprecated; this is now the default." << std::endl
     << "-z\tInclude axis 3 in mipmap calculation (currently not compatible with -s mode)." << std::endl;
 
-    while ((opt = getopt(argc, argv, ":o:spqmz")) != -1) {
+    while ((opt = getopt(argc, argv, ":o:srpqmz")) != -1) {
         switch (opt) {
             case 'o':
                 outputFileName.assign(optarg);
@@ -37,6 +38,10 @@ bool getOptions(int argc, char** argv, std::string& inputFileName, std::string&
                 // use slower but less memory-intensive method
                 slow = true;
                 break;
+            case 'r':
+                // use smart method
+                smart = true;
+                break;
             case 'p':
                 progress = true;
                 break;
@@ -96,11 +101,12 @@ int main(int argc, char** argv) {
     std::string inputFileName;
     std::string outputFileName;
     bool slow(false);
+    bool smart(false);
     bool progress(false);
     bool onlyReportMemory(false);
     bool zMips(false);
     
-    if (!getOptions(argc, argv, inputFileName, outputFileName, slow, progress, onlyReportMemory, zMips)) {
+    if (!getOptions(argc, argv, inputFileName, outputFileName, slow, smart, progress, onlyReportMemory, zMips)) {
         return 1;
     }
 
@@ -131,7 +137,7 @@ int main(int argc, char** argv) {
     std::unique_ptr<Converter> converter;
         
     try {
-        converter = Converter::getConverter(inputFileName, outputFileName, slow, progress, zMips);
+        converter = Converter::getConverter(inputFileName, outputFileName, slow, smart, progress, zMips);
         
         if (onlyReportMemory) {
             converter->reportMemoryUsage();

From da061eff0bf53ae3b754e10cdc0b494f67f61cb7 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Tue, 26 Dec 2023 13:12:34 +0200
Subject: [PATCH 07/26] Simplify accumulating stats from sub-region to counter

---
 src/SmartConverter.cc | 42 +++++++++++++++++++++++++++++++++++++-----
 src/Stats.h           |  9 +++++++++
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index 147fb1f..4f866eb 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -33,6 +33,11 @@ MemoryUsage SmartConverter::calculateMemoryUsage() {
 }
 
 void SmartConverter::copyAndCalculate() {
+    
+    int NUM_STRIPES = omp_get_max_threads();
+    std::mutex* location_mutexes = new std::mutex[NUM_STRIPES];
+    std::cout<< "NUM_STRIPES: " << NUM_STRIPES << std::endl;
+    
     const hsize_t channelProgressStride = std::max((hsize_t)1, (hsize_t)(depth / 100));
     hsize_t numTiles = std::ceil(width / TILE_SIZE) * std::ceil(height / TILE_SIZE);
     const hsize_t tileProgressStride = std::max((hsize_t)1, (hsize_t)(numTiles / 100));
@@ -43,6 +48,7 @@ void SmartConverter::copyAndCalculate() {
     standardCube = new float[cubeSize];
     
     // Allocate one stokes of stats at a time
+    //statsXY.createBuffers({depth}, height);
     statsXY.createBuffers({depth});
     
     if (depth > 1) {
@@ -89,8 +95,6 @@ void SmartConverter::copyAndCalculate() {
             auto indexXY = c;
             std::function<void(float)> accumulate;
             
-            //
-            
             auto lazy_accumulate = [&] (float val) {
                 counterXY.accumulateFiniteLazy(val);
             };
@@ -102,8 +106,13 @@ void SmartConverter::copyAndCalculate() {
             
             accumulate = first_accumulate;
             
+            //hsize_t  y;
             hsize_t  y,x;
-#pragma omp parallel for default(none) private (y, counterX) shared (counterXY, mipMaps)
+            
+            // for each mipmap index in  highest resolution mipmap
+            
+            
+#pragma omp parallel for default(none) private (y, counterX) shared (counterXY, mipMaps, location_mutexes)
             for (y = 0; y < height; y++) {
                 counterX.reset();
                 for (hsize_t x = 0; x < width; x++) {
@@ -116,8 +125,9 @@ void SmartConverter::copyAndCalculate() {
                         counterX.accumulateFinite(val);
                         
                         // Accumulate mipmaps
-                        mipMaps.accumulate(val, x, y, 0);   //possible race condiion here...
-                        
+                        mipMaps.accumulateWithLock(val, x, y, 0, location_mutexes);   //possible race condiion here...should do by row? Or maybe for loop by mipindex
+                        //accumulate to mipmap row buffer
+                        //accumulate 2D mipmap
                     } else {
                         counterX.accumulateNonFinite();
                     }
@@ -212,11 +222,13 @@ void SmartConverter::copyAndCalculate() {
             
             auto doChannelHistogram = [&] (float val) {
                 // XY histogram
+                //statsXY.accumulatePartialHistogram(val, chanMin, chanRange, c);
                 statsXY.accumulateHistogram(val, chanMin, chanRange, c);
             };
             
             auto doCubeHistogram = [&] (float val) {
                 // XYZ histogram
+                //statsXYZ.accumulatePartialHistogram(val, cubeMin, cubeRange, 0);
                 statsXYZ.accumulateHistogram(val, cubeMin, cubeRange, 0);
             };
             
@@ -244,6 +256,9 @@ void SmartConverter::copyAndCalculate() {
             DEBUG(std::cout << " Calculating histogram(s)..." << std::endl;);
             TIMER(timer.start("Histograms"););
             
+            //need a partial histogram multiplier here... I think number of rows in the dataset. THen the partials can be done row by row
+            
+            
             for (hsize_t p = 0; p < width * height; p++) {
                 auto& val = standardCube[p];
                     if (std::isfinite(val)) {
@@ -251,6 +266,23 @@ void SmartConverter::copyAndCalculate() {
                         cubeHistogramFunc(val);
                     }
             } // end of XY loop
+            /*
+            hsize_t y;
+#pragma omp parallel for default(none) private (y) shared (standardCube, channelHistogramFunc, cubeHistogramFunc)
+            for (y = 0; y < height; y++) {
+                for (hsize_t x = 0; x < width; x++) {
+                    auto p = y * width + x;
+                    auto& val = standardCube[p];
+                    if (std::isfinite(val)) {
+                        channelHistogramFunc(val);
+                        cubeHistogramFunc(val);
+                    }
+                }
+#pragma omp critical
+                statsXY.consolidatePartialHistogram();
+            } // end of XY loop
+            statsXYZ.consolidatePartialHistogram();
+            */
         } // end of second channel loop (XY and XYZ histograms)
         
         PROGRESS(std::endl);
diff --git a/src/Stats.h b/src/Stats.h
index d2357a1..f6fa7d4 100644
--- a/src/Stats.h
+++ b/src/Stats.h
@@ -48,6 +48,15 @@ struct StatsCounter {
     void accumulateNonFinite() {
         nanCount++;
     }
+    
+    void accumulateFromCounter(StatsCounter otherCounter) {
+        minVal = fmin(otherCounter.minVal, otherCounter.minVal);
+        maxVal = fmax(otherCounter.maxVal, otherCounter.maxVal);
+        sum += otherCounter.sum;
+        sumSq += otherCounter.sumSq;
+        nanCount += otherCounter.nanCount;
+    }
+    
 
     float minVal;
     float maxVal;

From 0830d37f5f9ecdb3585c4adf677666dd179bf902 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Tue, 26 Dec 2023 14:27:21 +0200
Subject: [PATCH 08/26] Add back the implementation of the simplification

---
 src/SmartConverter.cc | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index 4f866eb..2fbc1de 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -133,14 +133,7 @@ void SmartConverter::copyAndCalculate() {
                     }
                 }
 #pragma omp critical
-                {
-                    counterXY.minVal = fmin(counterXY.minVal, counterX.minVal); //an "accumulatestatstocounter" would be better here like below
-                    counterXY.maxVal = fmax(counterXY.maxVal, counterX.maxVal);
-                    counterXY.sum += counterX.sum;
-                    counterXY.sumSq += counterX.sumSq;
-                    counterXY.nanCount += counterX.nanCount;
-                }
-
+                counterXY.accumulateFromCounter(counterX);      // Accumulate to slice XY stats from thread-local X stats
             } // end of XY loop
             
             // Final correction of XY min and max

From 120385616de242e1c94ee62eab9037bf6af7e8a7 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Tue, 26 Dec 2023 14:27:43 +0200
Subject: [PATCH 09/26] Revert "Simplify accumulating stats from sub-region to
 counter"

This reverts commit da061eff0bf53ae3b754e10cdc0b494f67f61cb7.
---
 src/SmartConverter.cc | 42 +++++-------------------------------------
 src/Stats.h           |  9 ---------
 2 files changed, 5 insertions(+), 46 deletions(-)

diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index 2fbc1de..6f308c8 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -33,11 +33,6 @@ MemoryUsage SmartConverter::calculateMemoryUsage() {
 }
 
 void SmartConverter::copyAndCalculate() {
-    
-    int NUM_STRIPES = omp_get_max_threads();
-    std::mutex* location_mutexes = new std::mutex[NUM_STRIPES];
-    std::cout<< "NUM_STRIPES: " << NUM_STRIPES << std::endl;
-    
     const hsize_t channelProgressStride = std::max((hsize_t)1, (hsize_t)(depth / 100));
     hsize_t numTiles = std::ceil(width / TILE_SIZE) * std::ceil(height / TILE_SIZE);
     const hsize_t tileProgressStride = std::max((hsize_t)1, (hsize_t)(numTiles / 100));
@@ -48,7 +43,6 @@ void SmartConverter::copyAndCalculate() {
     standardCube = new float[cubeSize];
     
     // Allocate one stokes of stats at a time
-    //statsXY.createBuffers({depth}, height);
     statsXY.createBuffers({depth});
     
     if (depth > 1) {
@@ -95,6 +89,8 @@ void SmartConverter::copyAndCalculate() {
             auto indexXY = c;
             std::function<void(float)> accumulate;
             
+            //
+            
             auto lazy_accumulate = [&] (float val) {
                 counterXY.accumulateFiniteLazy(val);
             };
@@ -106,13 +102,8 @@ void SmartConverter::copyAndCalculate() {
             
             accumulate = first_accumulate;
             
-            //hsize_t  y;
             hsize_t  y,x;
-            
-            // for each mipmap index in  highest resolution mipmap
-            
-            
-#pragma omp parallel for default(none) private (y, counterX) shared (counterXY, mipMaps, location_mutexes)
+#pragma omp parallel for default(none) private (y, counterX) shared (counterXY, mipMaps)
             for (y = 0; y < height; y++) {
                 counterX.reset();
                 for (hsize_t x = 0; x < width; x++) {
@@ -125,9 +116,8 @@ void SmartConverter::copyAndCalculate() {
                         counterX.accumulateFinite(val);
                         
                         // Accumulate mipmaps
-                        mipMaps.accumulateWithLock(val, x, y, 0, location_mutexes);   //possible race condiion here...should do by row? Or maybe for loop by mipindex
-                        //accumulate to mipmap row buffer
-                        //accumulate 2D mipmap
+                        mipMaps.accumulate(val, x, y, 0);   //possible race condiion here...
+                        
                     } else {
                         counterX.accumulateNonFinite();
                     }
@@ -215,13 +205,11 @@ void SmartConverter::copyAndCalculate() {
             
             auto doChannelHistogram = [&] (float val) {
                 // XY histogram
-                //statsXY.accumulatePartialHistogram(val, chanMin, chanRange, c);
                 statsXY.accumulateHistogram(val, chanMin, chanRange, c);
             };
             
             auto doCubeHistogram = [&] (float val) {
                 // XYZ histogram
-                //statsXYZ.accumulatePartialHistogram(val, cubeMin, cubeRange, 0);
                 statsXYZ.accumulateHistogram(val, cubeMin, cubeRange, 0);
             };
             
@@ -249,9 +237,6 @@ void SmartConverter::copyAndCalculate() {
             DEBUG(std::cout << " Calculating histogram(s)..." << std::endl;);
             TIMER(timer.start("Histograms"););
             
-            //need a partial histogram multiplier here... I think number of rows in the dataset. THen the partials can be done row by row
-            
-            
             for (hsize_t p = 0; p < width * height; p++) {
                 auto& val = standardCube[p];
                     if (std::isfinite(val)) {
@@ -259,23 +244,6 @@ void SmartConverter::copyAndCalculate() {
                         cubeHistogramFunc(val);
                     }
             } // end of XY loop
-            /*
-            hsize_t y;
-#pragma omp parallel for default(none) private (y) shared (standardCube, channelHistogramFunc, cubeHistogramFunc)
-            for (y = 0; y < height; y++) {
-                for (hsize_t x = 0; x < width; x++) {
-                    auto p = y * width + x;
-                    auto& val = standardCube[p];
-                    if (std::isfinite(val)) {
-                        channelHistogramFunc(val);
-                        cubeHistogramFunc(val);
-                    }
-                }
-#pragma omp critical
-                statsXY.consolidatePartialHistogram();
-            } // end of XY loop
-            statsXYZ.consolidatePartialHistogram();
-            */
         } // end of second channel loop (XY and XYZ histograms)
         
         PROGRESS(std::endl);
diff --git a/src/Stats.h b/src/Stats.h
index f6fa7d4..d2357a1 100644
--- a/src/Stats.h
+++ b/src/Stats.h
@@ -48,15 +48,6 @@ struct StatsCounter {
     void accumulateNonFinite() {
         nanCount++;
     }
-    
-    void accumulateFromCounter(StatsCounter otherCounter) {
-        minVal = fmin(otherCounter.minVal, otherCounter.minVal);
-        maxVal = fmax(otherCounter.maxVal, otherCounter.maxVal);
-        sum += otherCounter.sum;
-        sumSq += otherCounter.sumSq;
-        nanCount += otherCounter.nanCount;
-    }
-    
 
     float minVal;
     float maxVal;

From 06e07fff7cdbbfcbfe03d6e97de5310c654ad93a Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Tue, 26 Dec 2023 14:29:04 +0200
Subject: [PATCH 10/26] Put back function for sub-region counter

---
 src/Stats.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/Stats.h b/src/Stats.h
index d2357a1..30d9452 100644
--- a/src/Stats.h
+++ b/src/Stats.h
@@ -48,6 +48,14 @@ struct StatsCounter {
     void accumulateNonFinite() {
         nanCount++;
     }
+    
+    void accumulateFromCounter(StatsCounter otherCounter) {
+        minVal = fmin(otherCounter.minVal, otherCounter.minVal);
+        maxVal = fmax(otherCounter.maxVal, otherCounter.maxVal);
+        sum += otherCounter.sum;
+        sumSq += otherCounter.sumSq;
+        nanCount += otherCounter.nanCount;
+    }
 
     float minVal;
     float maxVal;

From 6ca10c5dbbb42680f9f45a1ef400385016382449 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Sat, 30 Dec 2023 17:39:52 +0200
Subject: [PATCH 11/26] Parallelize mipmap and stats calculation by region
 instead of row

---
 src/SmartConverter.cc | 50 ++++++++++++++++++++++++++-----------------
 src/Util.cc           |  9 ++++++++
 src/Util.h            |  3 +++
 3 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index 6f308c8..3049d6f 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -85,7 +85,7 @@ void SmartConverter::copyAndCalculate() {
             TIMER(timer.start(timerLabelStatsMipmaps););
 
             StatsCounter counterXY;
-            StatsCounter counterX;
+            StatsCounter counterRegion;
             auto indexXY = c;
             std::function<void(float)> accumulate;
             
@@ -102,28 +102,38 @@ void SmartConverter::copyAndCalculate() {
             
             accumulate = first_accumulate;
             
-            hsize_t  y,x;
-#pragma omp parallel for default(none) private (y, counterX) shared (counterXY, mipMaps)
-            for (y = 0; y < height; y++) {
-                counterX.reset();
-                for (hsize_t x = 0; x < width; x++) {
-                    auto pos = y * width + x; // relative to channel slice
-                    auto& val = standardCube[pos];
-
-
-                    if (std::isfinite(val)) {
-                        // XY statistics
-                        counterX.accumulateFinite(val);
-                        
-                        // Accumulate mipmaps
-                        mipMaps.accumulate(val, x, y, 0);   //possible race condiion here...
-                        
-                    } else {
-                        counterX.accumulateNonFinite();
+                
+            int mipIndex;
+            int REGION_MULTIPLIER = 32;
+            int adjustedStandardCubeSize = cubeSize / REGION_MULTIPLIER / REGION_MULTIPLIER;
+            
+#pragma omp parallel for default(none) private (mipIndex, counterRegion) shared (standardCube, adjustedStandardCubeSize, mipMaps, counterXY, cubeSize, REGION_MULTIPLIER)
+            for (mipIndex = 0; mipIndex < adjustedStandardCubeSize; mipIndex += 1 ) {
+                counterRegion.reset();
+                hsize_t x0,y0,z0;
+                MipIndexToXYZ(mipIndex, x0, y0, z0, width, height, REGION_MULTIPLIER, REGION_MULTIPLIER, 1); //use index of higher-order mipmap-space to keep lower-order mipmaps thread-safe
+                for (int y = y0; y < y0 + REGION_MULTIPLIER; y++) {
+                    for (int x = x0; x < x0 + REGION_MULTIPLIER; x++) {
+                        auto pos = y * width + x;
+                        if (x >= width || y >= height) {    //check if we are out of bounds
+                            continue;
+                        }
+                        //std::cout << "pos: " << pos << std::endl;
+                        auto& val = standardCube[pos];
+                        if (std::isfinite(val)) {
+                            // XY statistics
+                            counterRegion.accumulateFinite(val);
+                    
+                            // Accumulate mipmaps
+                            mipMaps.accumulate(val, x, y, 0); //This should not conflict with the other threads
+                            
+                        } else {
+                            counterRegion.accumulateNonFinite();
+                        }
                     }
                 }
 #pragma omp critical
-                counterXY.accumulateFromCounter(counterX);      // Accumulate to slice XY stats from thread-local X stats
+                counterXY.accumulateFromCounter(counterRegion);      // Accumulate to slice's XY stats from thread-local X stats
             } // end of XY loop
             
             // Final correction of XY min and max
diff --git a/src/Util.cc b/src/Util.cc
index e338b00..3b6f3aa 100644
--- a/src/Util.cc
+++ b/src/Util.cc
@@ -254,3 +254,12 @@ void readHdf5Data(H5::DataSet& dataset, float* data, const std::vector<hsize_t>&
     }
     dataset.read(data, H5::PredType::NATIVE_FLOAT, memSpace, fileSpace);
 }
+
+void MipIndexToXYZ(hsize_t mipIndex, hsize_t& x, hsize_t& y, hsize_t& z, hsize_t width, hsize_t height, hsize_t mipX, hsize_t mipY, hsize_t mipZ) {
+    int mipHeight = height / mipY;
+    int mipWidth = width / mipX;
+    x = (mipIndex % (mipWidth * mipHeight)) % mipWidth * mipX;
+    y = (mipIndex % (mipWidth * mipHeight)) / mipWidth * mipY;
+    z = mipIndex / (mipWidth * mipHeight) * mipZ;
+    
+}
\ No newline at end of file
diff --git a/src/Util.h b/src/Util.h
index be9519e..b6a4b01 100644
--- a/src/Util.h
+++ b/src/Util.h
@@ -48,4 +48,7 @@ void writeHdf5Data(H5::DataSet& dataset, int64_t* data, const std::vector<hsize_
 
 void readHdf5Data(H5::DataSet& dataset, float* data, const std::vector<hsize_t>& dims, const std::vector<hsize_t>& count = EMPTY_DIMS, const std::vector<hsize_t>& start = EMPTY_DIMS);
 
+void MipIndexToXYZ(hsize_t mipIndex, hsize_t& x, hsize_t& y, hsize_t& z, hsize_t width, hsize_t height, hsize_t mipX, hsize_t mipY, hsize_t mipZ);
+
+
 #endif

From 10345b37b883905560ad66a927edaea8826d08f8 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Sun, 31 Dec 2023 16:14:02 +0200
Subject: [PATCH 12/26] Parallelize XY histograms by row

---
 src/SmartConverter.cc | 32 ++++++++++++++++++++++----------
 src/Stats.h           | 10 ++++++++++
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index 3049d6f..1768a3b 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -43,10 +43,10 @@ void SmartConverter::copyAndCalculate() {
     standardCube = new float[cubeSize];
     
     // Allocate one stokes of stats at a time
-    statsXY.createBuffers({depth});
+    statsXY.createBuffers({depth}, height);
     
     if (depth > 1) {
-        statsXYZ.createBuffers({}, depth);
+        statsXYZ.createBuffers({});
     }
     
     mipMaps.createBuffers({1, height, width});
@@ -213,9 +213,9 @@ void SmartConverter::copyAndCalculate() {
                 continue;
             }
             
-            auto doChannelHistogram = [&] (float val) {
+            auto doChannelHistogram = [&] (float val, hsize_t offset) {
                 // XY histogram
-                statsXY.accumulateHistogram(val, chanMin, chanRange, c);
+                statsXY.accumulatePartialHistogram(val, chanMin, chanRange, offset);
             };
             
             auto doCubeHistogram = [&] (float val) {
@@ -227,11 +227,15 @@ void SmartConverter::copyAndCalculate() {
                 UNUSED(val);
             };
             
-            std::function<void(float)> channelHistogramFunc = doChannelHistogram;
+            auto doNothingOffset = [&] (float val, hsize_t offset) {
+                UNUSED(val);
+            };
+            
+            std::function<void(float,hsize_t)> channelHistogramFunc = doChannelHistogram;
             std::function<void(float)> cubeHistogramFunc = doCubeHistogram;
             
             if (!chanHist) {
-                channelHistogramFunc = doNothing;
+                channelHistogramFunc = doNothingOffset;
             }
             
             if (!cubeHist) {
@@ -246,14 +250,22 @@ void SmartConverter::copyAndCalculate() {
 
             DEBUG(std::cout << " Calculating histogram(s)..." << std::endl;);
             TIMER(timer.start("Histograms"););
-            
-            for (hsize_t p = 0; p < width * height; p++) {
-                auto& val = standardCube[p];
+
+            hsize_t y ;
+#pragma omp parallel for default(none) private(y) shared (standardCube, height, width, channelHistogramFunc, cubeHistogramFunc)
+            for (y = 0; y < height; y++) {
+                for (hsize_t x = 0; x < width; x++) {
+                    auto pos = y * width + x;
+                    auto& val = standardCube[pos];
                     if (std::isfinite(val)) {
-                        channelHistogramFunc(val);
+                        channelHistogramFunc(val, y);
                         cubeHistogramFunc(val);
                     }
+                }
             } // end of XY loop
+            
+            statsXY.consolidatePartialHistogram(c);
+            
         } // end of second channel loop (XY and XYZ histograms)
         
         PROGRESS(std::endl);
diff --git a/src/Stats.h b/src/Stats.h
index 30d9452..6e31b81 100644
--- a/src/Stats.h
+++ b/src/Stats.h
@@ -105,6 +105,7 @@ struct Stats {
     // Histograms
     
     void clearHistogramBuffers();
+    void clearPartialHistogramBuffer();
 
     void accumulateHistogram(float val, double min, double range, hsize_t offset) {
         int binIndex = std::min(numBins - 1, (hsize_t)(numBins * (val - min) / range));
@@ -124,6 +125,15 @@ struct Stats {
         }
     }
     
+    void consolidatePartialHistogram(hsize_t mainOffset) {
+        for (hsize_t offset = 0; offset < partialHistMultiplier; offset++) {
+            for (hsize_t binIndex = 0; binIndex < numBins; binIndex++) {
+                histograms[mainOffset * numBins + binIndex] += partialHistograms[offset * numBins + binIndex];
+                partialHistograms[offset * numBins + binIndex] = 0;     //reset partial histogram after consolidation
+            }
+        }
+    }
+    
     // Writing
     void write();
     void write(const std::vector<hsize_t>& count, const std::vector<hsize_t>& start);

From d3383265cbd73b3269a461ee2b96d8d097b31ea4 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 26 Feb 2024 13:14:58 +0200
Subject: [PATCH 13/26] Fixed bug with stats counter accumulation

---
 src/Stats.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Stats.h b/src/Stats.h
index 6e31b81..e38fc5c 100644
--- a/src/Stats.h
+++ b/src/Stats.h
@@ -50,8 +50,8 @@ struct StatsCounter {
     }
     
     void accumulateFromCounter(StatsCounter otherCounter) {
-        minVal = fmin(otherCounter.minVal, otherCounter.minVal);
-        maxVal = fmax(otherCounter.maxVal, otherCounter.maxVal);
+        minVal = fmin(minVal, otherCounter.minVal);
+        maxVal = fmax(maxVal, otherCounter.maxVal);
         sum += otherCounter.sum;
         sumSq += otherCounter.sumSq;
         nanCount += otherCounter.nanCount;

From 5a6d5375b2672715b90d12aab4c235d45162c1c6 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 26 Feb 2024 16:02:49 +0200
Subject: [PATCH 14/26] Added multithreading to cube rotation and statsZ

Still need to debug statsZ as the current statsZ region has race conditions as a shared variable in the parallel for loop
---
 src/SmartConverter.cc | 189 ++++++++++++++++++++++++------------------
 1 file changed, 110 insertions(+), 79 deletions(-)

diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index 1768a3b..639cd60 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -37,6 +37,8 @@ void SmartConverter::copyAndCalculate() {
     hsize_t numTiles = std::ceil(width / TILE_SIZE) * std::ceil(height / TILE_SIZE);
     const hsize_t tileProgressStride = std::max((hsize_t)1, (hsize_t)(numTiles / 100));
     
+    const hsize_t REGION_MULTIPLIER = 32;
+    
     // Allocate one channel at a time, and no swizzled data
     hsize_t cubeSize = height * width;
     TIMER(timer.start("Allocate"););
@@ -85,7 +87,7 @@ void SmartConverter::copyAndCalculate() {
             TIMER(timer.start(timerLabelStatsMipmaps););
 
             StatsCounter counterXY;
-            StatsCounter counterRegion;
+            
             auto indexXY = c;
             std::function<void(float)> accumulate;
             
@@ -104,8 +106,11 @@ void SmartConverter::copyAndCalculate() {
             
                 
             int mipIndex;
-            int REGION_MULTIPLIER = 32;
-            int adjustedStandardCubeSize = cubeSize / REGION_MULTIPLIER / REGION_MULTIPLIER;
+            StatsCounter counterRegion;
+            
+            int regionRows = std::ceil(height / (float)REGION_MULTIPLIER);
+            int regionCols = std::ceil(width / (float)REGION_MULTIPLIER);
+            int adjustedStandardCubeSize = regionRows * regionCols;
             
 #pragma omp parallel for default(none) private (mipIndex, counterRegion) shared (standardCube, adjustedStandardCubeSize, mipMaps, counterXY, cubeSize, REGION_MULTIPLIER)
             for (mipIndex = 0; mipIndex < adjustedStandardCubeSize; mipIndex += 1 ) {
@@ -121,11 +126,11 @@ void SmartConverter::copyAndCalculate() {
                         //std::cout << "pos: " << pos << std::endl;
                         auto& val = standardCube[pos];
                         if (std::isfinite(val)) {
-                            // XY statistics
+                            // region statistics
                             counterRegion.accumulateFinite(val);
                     
                             // Accumulate mipmaps
-                            mipMaps.accumulate(val, x, y, 0); //This should not conflict with the other threads
+                            mipMaps.accumulate(val, x, y, 0); //This will not conflict with the other threads as regions are separate
                             
                         } else {
                             counterRegion.accumulateNonFinite();
@@ -134,7 +139,7 @@ void SmartConverter::copyAndCalculate() {
                 }
 #pragma omp critical
                 counterXY.accumulateFromCounter(counterRegion);      // Accumulate to slice's XY stats from thread-local X stats
-            } // end of XY loop
+            } // end of region loop
             
             // Final correction of XY min and max
             DEBUG(std::cout << " Final XY stats..." << std::flush;);
@@ -294,11 +299,17 @@ void SmartConverter::copyAndCalculate() {
         PROGRESS("Tiled rotation & Z stats" << std::endl);
         TIMER(timer.start("Allocate"););
         
-        hsize_t sliceSize = product(trimAxes({stokes, depth, TILE_SIZE, TILE_SIZE}, N));
         float* standardSlice = new float[sliceSize];
-        float* rotatedSlice = new float[sliceSize];
         
-        statsZ.createBuffers({TILE_SIZE, TILE_SIZE});
+        int numThreads = omp_get_max_threads();
+        
+        // Create a vector of float pointers for each thread
+        std::vector<float*> rotatedSlices(numThreads);
+        for (int i = 0; i < numThreads; i++) {
+            rotatedSlices[i] = new float[sliceSize];
+        }
+        
+        statsZ.createBuffers({REGION_MULTIPLIER, REGION_MULTIPLIER});
         
         for (unsigned int s = 0; s < stokes; s++) {
             DEBUG(std::cout << "Processing Stokes " << s << "..." << std::endl;);
@@ -306,88 +317,108 @@ void SmartConverter::copyAndCalculate() {
             
             hsize_t tileCount(0);
             
-            for (hsize_t xOffset = 0; xOffset < width; xOffset += TILE_SIZE) {
-                for (hsize_t yOffset = 0; yOffset < height; yOffset += TILE_SIZE) {
-                    tileCount++;
-                    hsize_t xSize = std::min(TILE_SIZE, width - xOffset);
-                    hsize_t ySize = std::min(TILE_SIZE, height - yOffset);
-                    
-                    DEBUG(std::cout << "+ Processing tile slice at " << xOffset << ", " << yOffset << "..." << std::flush;);
-                    PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
-                    
-                    // read tile slice
-                    DEBUG(std::cout << " Reading main dataset..." << std::flush;);
-                    TIMER(timer.start("Read"););
-                    
-                    auto standardMemDims = trimAxes({1, depth, ySize, xSize}, N);
-                    auto standardCount = trimAxes({1, depth, ySize, xSize}, N);
-                    auto standardStart = trimAxes({s, 0, yOffset, xOffset}, N);
-                    
-                    readHdf5Data(standardDataSet, standardSlice, standardMemDims, standardCount, standardStart);
-                    
-                    // rotate tile slice
-                    DEBUG(std::cout << " Calculating rotation..." << std::flush;);
-                    TIMER(timer.start("Rotation"););
-                    
-                    for (hsize_t i = 0; i < depth; i++) {
-                        for (hsize_t j = 0; j < ySize; j++) {
-                            for (hsize_t k = 0; k < xSize; k++) {
-                                auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
-                                auto& val = standardSlice[sourceIndex];
-                                
-                                // rotation
-                                auto destIndex = i + depth * j + (ySize * depth) * k;
-                                rotatedSlice[destIndex] = val;
-                            }
-                        }
-                    }
-                    
-                    // A separate pass over the same slice depth-last 
-                    DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
-                    TIMER(timer.start("Z statistics"););
-                    
+            int mipIndex;
+            
+            int mipRows = std::ceil(height / (float)REGION_MULTIPLIER);
+            int mipCols = std::ceil(width / (float)REGION_MULTIPLIER);
+            int adjustedStandardCubeSize = mipRows * mipCols;
+            
+#pragma omp parallel for default(none) private (mipIndex) shared (s, statsZ, tileProgressStride, standardSlice, rotatedSlices, adjustedStandardCubeSize, mipMaps, tileCount, cubeSize, REGION_MULTIPLIER, std::cout)
+            for (mipIndex = 0; mipIndex < adjustedStandardCubeSize; mipIndex += 1 ) {
+                // Get the thread id
+                int threadId = omp_get_thread_num();
+                
+                // Use the thread's own rotatedSlice
+                float* rotatedSlice = rotatedSlices[threadId];
+                
+                hsize_t x0,y0,z0;
+                MipIndexToXYZ(mipIndex, x0, y0, z0, width, height, REGION_MULTIPLIER, REGION_MULTIPLIER, 1); //use index of higher-order mipmap-space to keep lower-order mipmaps thread-safe
+                hsize_t xOffset = x0;
+                hsize_t yOffset = y0;
+                tileCount++;
+                hsize_t xSize = std::min(REGION_MULTIPLIER, width - xOffset);
+                hsize_t ySize = std::min(REGION_MULTIPLIER, height - yOffset);
+                
+                DEBUG(std::cout << "+ Processing tile slice at " << xOffset << ", " << yOffset << "..." << std::flush;);
+                PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                
+                // read tile slice
+                DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+                TIMER(timer.start("Read"););
+                
+                auto standardMemDims = trimAxes({1, depth, ySize, xSize}, N);
+                auto standardCount = trimAxes({1, depth, ySize, xSize}, N);
+                auto standardStart = trimAxes({s, 0, yOffset, xOffset}, N);
+                
+                readHdf5Data(standardDataSet, standardSlice, standardMemDims, standardCount, standardStart);
+                
+                // rotate tile slice
+                DEBUG(std::cout << " Calculating rotation..." << std::flush;);
+                TIMER(timer.start("Rotation"););
+                
+                //depth is going all the way, but ysize and xsize are not!
+                for (hsize_t i = 0; i < depth; i++) {
                     for (hsize_t j = 0; j < ySize; j++) {
                         for (hsize_t k = 0; k < xSize; k++) {
-                            StatsCounter counterZ;
-                            auto indexZ = k + xSize * j;
+                            auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                            auto& val = standardSlice[sourceIndex];
                             
-                            for (hsize_t i = 0; i < depth; i++) {
-                                auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
-                                auto& val = standardSlice[sourceIndex];
-                                
-                                if (std::isfinite(val)) {
-                                    // Not lazy; too much risk of encountering an ascending / descending sequence.
-                                    counterZ.accumulateFinite(val);
-                                } else {
-                                    counterZ.accumulateNonFinite();
-                                }
-                            }
+                            // rotation
+                            auto destIndex = i + depth * j + (ySize * depth) * k;
+                            rotatedSlice[destIndex] = val;
+                        }
+                    }
+                }
+                
+                // A separate pass over the same slice depth-last
+                DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
+                TIMER(timer.start("Z statistics"););
+                StatsCounter counterZ;
+                for (hsize_t j = 0; j < ySize; j++) {
+                    for (hsize_t k = 0; k < xSize; k++) {
+                        counterZ.reset();
+                        auto indexZ = k + xSize * j;
+                        
+                        for (hsize_t i = 0; i < depth; i++) {
+                            auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                            auto& val = standardSlice[sourceIndex];
                             
-                            statsZ.copyStatsFromCounter(indexZ, depth, counterZ);
+                            if (std::isfinite(val)) {
+                                // Not lazy; too much risk of encountering an ascending / descending sequence.
+                                counterZ.accumulateFinite(val);
+                            } else {
+                                counterZ.accumulateNonFinite();
+                            }
                         }
+#pragma omp critical
+                            statsZ.copyStatsFromCounter(indexZ, depth, counterZ);
                     }
-                    
-                    // write tile slice
-                    DEBUG(std::cout << " Writing rotated dataset..." << std::endl;);
-                    TIMER(timer.start("Write"););
-                    
-                    auto swizzledMemDims = trimAxes({1, xSize, ySize, depth}, N);
-                    auto swizzledCount = trimAxes({1, xSize, ySize, depth}, N);
-                    auto swizzledStart = trimAxes({s, xOffset, yOffset, 0}, N);
-                    
-                    writeHdf5Data(swizzledDataSet, rotatedSlice, swizzledMemDims, swizzledCount, swizzledStart);
-                    
-                    DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
-                    // write Z statistics
-                    statsZ.write({ySize, xSize}, {1, ySize, xSize}, {s, yOffset, xOffset});
                 }
-            }
+                
+                // write tile slice
+                DEBUG(std::cout << " Writing rotated dataset..." << std::endl;);
+                TIMER(timer.start("Write"););
+                
+                auto swizzledMemDims = trimAxes({1, xSize, ySize, depth}, N);
+                auto swizzledCount = trimAxes({1, xSize, ySize, depth}, N);
+                auto swizzledStart = trimAxes({s, xOffset, yOffset, 0}, N);
+                
+                writeHdf5Data(swizzledDataSet, rotatedSlice, swizzledMemDims, swizzledCount, swizzledStart);
+                
+                DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
+                // write Z statistics
+                statsZ.write({ySize, xSize}, {1, ySize, xSize}, {s, yOffset, xOffset});
+                
+                
+                }
             PROGRESS(std::endl);
         }
         
         TIMER(timer.start("Free"););
         DEBUG(std::cout << "Freeing memory from main and rotated dataset slices... " << std::endl;);
         delete[] standardSlice;
-        delete[] rotatedSlice;
+        for (int i = 0; i < numThreads; i++) {
+            delete[] rotatedSlices[i];
+        }
     }
 }

From e2f2ddf824295c1979a1d766e001ada979e41481 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 26 Feb 2024 16:07:26 +0200
Subject: [PATCH 15/26] Added missing lines from last commit

---
 src/SmartConverter.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index 639cd60..e0eef16 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -104,7 +104,6 @@ void SmartConverter::copyAndCalculate() {
             
             accumulate = first_accumulate;
             
-                
             int mipIndex;
             StatsCounter counterRegion;
             
@@ -299,6 +298,7 @@ void SmartConverter::copyAndCalculate() {
         PROGRESS("Tiled rotation & Z stats" << std::endl);
         TIMER(timer.start("Allocate"););
         
+        hsize_t sliceSize = product(trimAxes({stokes, depth, REGION_MULTIPLIER, REGION_MULTIPLIER}, N));
         float* standardSlice = new float[sliceSize];
         
         int numThreads = omp_get_max_threads();

From b983e1637b5c21f259110a9e5a1a9367756be622 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 26 Feb 2024 16:55:17 +0200
Subject: [PATCH 16/26] Fixed statsZ calculation with multithreading

---
 src/SmartConverter.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index e0eef16..55bfdea 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -309,7 +309,7 @@ void SmartConverter::copyAndCalculate() {
             rotatedSlices[i] = new float[sliceSize];
         }
         
-        statsZ.createBuffers({REGION_MULTIPLIER, REGION_MULTIPLIER});
+        statsZ.createBuffers({height, width});
         
         for (unsigned int s = 0; s < stokes; s++) {
             DEBUG(std::cout << "Processing Stokes " << s << "..." << std::endl;);
@@ -390,8 +390,9 @@ void SmartConverter::copyAndCalculate() {
                                 counterZ.accumulateNonFinite();
                             }
                         }
+                        hsize_t index = (xOffset + k) + width * (yOffset + j);
 #pragma omp critical
-                            statsZ.copyStatsFromCounter(indexZ, depth, counterZ);
+                            statsZ.copyStatsFromCounter(index, depth, counterZ);
                     }
                 }
                 
@@ -406,11 +407,11 @@ void SmartConverter::copyAndCalculate() {
                 writeHdf5Data(swizzledDataSet, rotatedSlice, swizzledMemDims, swizzledCount, swizzledStart);
                 
                 DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
-                // write Z statistics
-                statsZ.write({ySize, xSize}, {1, ySize, xSize}, {s, yOffset, xOffset});
-                
+              
                 
                 }
+                // write Z statistics
+                statsZ.write();
             PROGRESS(std::endl);
         }
         

From 105bdb7ea876c23abacf7b9c2486872ecab833ef Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 25 Mar 2024 12:14:24 +0200
Subject: [PATCH 17/26] Bring back single-threaded zStats calculation

---
 src/SmartConverter.cc | 174 ++++++++++++++++++------------------------
 1 file changed, 73 insertions(+), 101 deletions(-)

diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index 55bfdea..b952267 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -298,18 +298,11 @@ void SmartConverter::copyAndCalculate() {
         PROGRESS("Tiled rotation & Z stats" << std::endl);
         TIMER(timer.start("Allocate"););
         
-        hsize_t sliceSize = product(trimAxes({stokes, depth, REGION_MULTIPLIER, REGION_MULTIPLIER}, N));
+        hsize_t sliceSize = product(trimAxes({stokes, depth, TILE_SIZE, TILE_SIZE}, N));
         float* standardSlice = new float[sliceSize];
+        float* rotatedSlice = new float[sliceSize];
         
-        int numThreads = omp_get_max_threads();
-        
-        // Create a vector of float pointers for each thread
-        std::vector<float*> rotatedSlices(numThreads);
-        for (int i = 0; i < numThreads; i++) {
-            rotatedSlices[i] = new float[sliceSize];
-        }
-        
-        statsZ.createBuffers({height, width});
+        statsZ.createBuffers({TILE_SIZE, TILE_SIZE});
         
         for (unsigned int s = 0; s < stokes; s++) {
             DEBUG(std::cout << "Processing Stokes " << s << "..." << std::endl;);
@@ -317,109 +310,88 @@ void SmartConverter::copyAndCalculate() {
             
             hsize_t tileCount(0);
             
-            int mipIndex;
-            
-            int mipRows = std::ceil(height / (float)REGION_MULTIPLIER);
-            int mipCols = std::ceil(width / (float)REGION_MULTIPLIER);
-            int adjustedStandardCubeSize = mipRows * mipCols;
-            
-#pragma omp parallel for default(none) private (mipIndex) shared (s, statsZ, tileProgressStride, standardSlice, rotatedSlices, adjustedStandardCubeSize, mipMaps, tileCount, cubeSize, REGION_MULTIPLIER, std::cout)
-            for (mipIndex = 0; mipIndex < adjustedStandardCubeSize; mipIndex += 1 ) {
-                // Get the thread id
-                int threadId = omp_get_thread_num();
-                
-                // Use the thread's own rotatedSlice
-                float* rotatedSlice = rotatedSlices[threadId];
-                
-                hsize_t x0,y0,z0;
-                MipIndexToXYZ(mipIndex, x0, y0, z0, width, height, REGION_MULTIPLIER, REGION_MULTIPLIER, 1); //use index of higher-order mipmap-space to keep lower-order mipmaps thread-safe
-                hsize_t xOffset = x0;
-                hsize_t yOffset = y0;
-                tileCount++;
-                hsize_t xSize = std::min(REGION_MULTIPLIER, width - xOffset);
-                hsize_t ySize = std::min(REGION_MULTIPLIER, height - yOffset);
-                
-                DEBUG(std::cout << "+ Processing tile slice at " << xOffset << ", " << yOffset << "..." << std::flush;);
-                PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
-                
-                // read tile slice
-                DEBUG(std::cout << " Reading main dataset..." << std::flush;);
-                TIMER(timer.start("Read"););
-                
-                auto standardMemDims = trimAxes({1, depth, ySize, xSize}, N);
-                auto standardCount = trimAxes({1, depth, ySize, xSize}, N);
-                auto standardStart = trimAxes({s, 0, yOffset, xOffset}, N);
-                
-                readHdf5Data(standardDataSet, standardSlice, standardMemDims, standardCount, standardStart);
-                
-                // rotate tile slice
-                DEBUG(std::cout << " Calculating rotation..." << std::flush;);
-                TIMER(timer.start("Rotation"););
-                
-                //depth is going all the way, but ysize and xsize are not!
-                for (hsize_t i = 0; i < depth; i++) {
-                    for (hsize_t j = 0; j < ySize; j++) {
-                        for (hsize_t k = 0; k < xSize; k++) {
-                            auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
-                            auto& val = standardSlice[sourceIndex];
-                            
-                            // rotation
-                            auto destIndex = i + depth * j + (ySize * depth) * k;
-                            rotatedSlice[destIndex] = val;
+            for (hsize_t xOffset = 0; xOffset < width; xOffset += TILE_SIZE) {
+                for (hsize_t yOffset = 0; yOffset < height; yOffset += TILE_SIZE) {
+                    tileCount++;
+                    hsize_t xSize = std::min(TILE_SIZE, width - xOffset);
+                    hsize_t ySize = std::min(TILE_SIZE, height - yOffset);
+                    
+                    DEBUG(std::cout << "+ Processing tile slice at " << xOffset << ", " << yOffset << "..." << std::flush;);
+                    PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                    
+                    // read tile slice
+                    DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+                    TIMER(timer.start("Read"););
+                    
+                    auto standardMemDims = trimAxes({1, depth, ySize, xSize}, N);
+                    auto standardCount = trimAxes({1, depth, ySize, xSize}, N);
+                    auto standardStart = trimAxes({s, 0, yOffset, xOffset}, N);
+                    
+                    readHdf5Data(standardDataSet, standardSlice, standardMemDims, standardCount, standardStart);
+                    
+                    // rotate tile slice
+                    DEBUG(std::cout << " Calculating rotation..." << std::flush;);
+                    TIMER(timer.start("Rotation"););
+                    
+                    for (hsize_t i = 0; i < depth; i++) {
+                        for (hsize_t j = 0; j < ySize; j++) {
+                            for (hsize_t k = 0; k < xSize; k++) {
+                                auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                                auto& val = standardSlice[sourceIndex];
+                                
+                                // rotation
+                                auto destIndex = i + depth * j + (ySize * depth) * k;
+                                rotatedSlice[destIndex] = val;
+                            }
                         }
                     }
-                }
-                
-                // A separate pass over the same slice depth-last
-                DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
-                TIMER(timer.start("Z statistics"););
-                StatsCounter counterZ;
-                for (hsize_t j = 0; j < ySize; j++) {
-                    for (hsize_t k = 0; k < xSize; k++) {
-                        counterZ.reset();
-                        auto indexZ = k + xSize * j;
-                        
-                        for (hsize_t i = 0; i < depth; i++) {
-                            auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
-                            auto& val = standardSlice[sourceIndex];
+                    
+                    // A separate pass over the same slice depth-last
+                    DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
+                    TIMER(timer.start("Z statistics"););
+                    
+                    for (hsize_t j = 0; j < ySize; j++) {
+                        for (hsize_t k = 0; k < xSize; k++) {
+                            StatsCounter counterZ;
+                            auto indexZ = k + xSize * j;
                             
-                            if (std::isfinite(val)) {
-                                // Not lazy; too much risk of encountering an ascending / descending sequence.
-                                counterZ.accumulateFinite(val);
-                            } else {
-                                counterZ.accumulateNonFinite();
+                            for (hsize_t i = 0; i < depth; i++) {
+                                auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                                auto& val = standardSlice[sourceIndex];
+                                
+                                if (std::isfinite(val)) {
+                                    // Not lazy; too much risk of encountering an ascending / descending sequence.
+                                    counterZ.accumulateFinite(val);
+                                } else {
+                                    counterZ.accumulateNonFinite();
+                                }
                             }
+                            
+                            statsZ.copyStatsFromCounter(indexZ, depth, counterZ);
                         }
-                        hsize_t index = (xOffset + k) + width * (yOffset + j);
-#pragma omp critical
-                            statsZ.copyStatsFromCounter(index, depth, counterZ);
                     }
+                    
+                    // write tile slice
+                    DEBUG(std::cout << " Writing rotated dataset..." << std::endl;);
+                    TIMER(timer.start("Write"););
+                    
+                    auto swizzledMemDims = trimAxes({1, xSize, ySize, depth}, N);
+                    auto swizzledCount = trimAxes({1, xSize, ySize, depth}, N);
+                    auto swizzledStart = trimAxes({s, xOffset, yOffset, 0}, N);
+                    
+                    writeHdf5Data(swizzledDataSet, rotatedSlice, swizzledMemDims, swizzledCount, swizzledStart);
+                    
+                    DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
+                    // write Z statistics
+                    statsZ.write({ySize, xSize}, {1, ySize, xSize}, {s, yOffset, xOffset});
                 }
-                
-                // write tile slice
-                DEBUG(std::cout << " Writing rotated dataset..." << std::endl;);
-                TIMER(timer.start("Write"););
-                
-                auto swizzledMemDims = trimAxes({1, xSize, ySize, depth}, N);
-                auto swizzledCount = trimAxes({1, xSize, ySize, depth}, N);
-                auto swizzledStart = trimAxes({s, xOffset, yOffset, 0}, N);
-                
-                writeHdf5Data(swizzledDataSet, rotatedSlice, swizzledMemDims, swizzledCount, swizzledStart);
-                
-                DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
-              
-                
-                }
-                // write Z statistics
-                statsZ.write();
+            }
             PROGRESS(std::endl);
         }
         
         TIMER(timer.start("Free"););
         DEBUG(std::cout << "Freeing memory from main and rotated dataset slices... " << std::endl;);
         delete[] standardSlice;
-        for (int i = 0; i < numThreads; i++) {
-            delete[] rotatedSlices[i];
-        }
+        delete[] rotatedSlice;
     }
 }

From a25e8eb75c956fd58c36d43d3c33ac744186eeed Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 8 Apr 2024 13:17:59 +0200
Subject: [PATCH 18/26] Fixed bugs in speed test of convertertest.py

---
 scripts/convertertest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/convertertest.py b/scripts/convertertest.py
index 6f630bf..7ad41f4 100755
--- a/scripts/convertertest.py
+++ b/scripts/convertertest.py
@@ -380,7 +380,7 @@ def dummy_image_set():
 
 def test_speed(args, *image_sets):
     executables = [args.executable]
-    if "compare" in args:
+    if args.compare is not None:
         executables.append(args.compare)
     
     times = defaultdict(lambda: defaultdict(list))
@@ -447,7 +447,7 @@ def test_zmips_converter(infile, new_converter):
     image_sets = (IMAGE_SETS[i] for i in args.image_set)
     
     if args.time:
-        test_speed(args, *image_sets, args.executable)
+        test_speed(args, *image_sets)
     else:
         for image_set in image_sets:
             for dims, params in image_set:

From 02f0fd1afc91efd4a895c6e47ba94c0033e5164d Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 8 Apr 2024 15:05:23 +0200
Subject: [PATCH 19/26] Add smart mode testing to python converter test

---
 scripts/convertertest.py | 140 +++++++++++++++++++++++----------------
 1 file changed, 82 insertions(+), 58 deletions(-)

diff --git a/scripts/convertertest.py b/scripts/convertertest.py
index 7ad41f4..83fe8a7 100755
--- a/scripts/convertertest.py
+++ b/scripts/convertertest.py
@@ -44,7 +44,7 @@ def compare_fits_hdf5(fitsname, hdf5name, zmips=False):
 
     axes = {v: k for k, v in enumerate(reversed(axis_names))}
     dims = {v: fitsdata.shape[k] for k, v in enumerate(reversed(axis_names))}
-    
+
     width, height, depth, stokes = dims["X"], dims["Y"], dims.get("Z", 1), dims.get("W", 1)
 
     # CHECK SWIZZLES
@@ -57,7 +57,7 @@ def compare_fits_hdf5(fitsname, hdf5name, zmips=False):
         swizzled_name = "ZYXW"
 
     if swizzled_name:
-        swizzled_shape = tuple(dims[a] for a in reversed(swizzled_name))    
+        swizzled_shape = tuple(dims[a] for a in reversed(swizzled_name))
         assert swizzled_name in hdf5file["0/SwizzledData"], "No swizzled dataset found."
         assert hdf5file["0/SwizzledData"][swizzled_name].shape == swizzled_shape, "Swizzled dataset has incorrect dimensions."
 
@@ -68,34 +68,34 @@ def compare_fits_hdf5(fitsname, hdf5name, zmips=False):
     if ndim >= 3 and dims["Z"] > 1:
         stats.append("Z")
         stats.append("XYZ")
-        
+
     for s in stats:
         assert s in hdf5file["0/Statistics"], "%s statistics missing." % s
-        
+
         sdata = hdf5file["0/Statistics"][s]
         stats_axis = tuple(axes[a] for a in s)
-        
+
         # CHECK BASIC STATS
-        
+
         def assert_close(stat, func, data=fitsdata):
             if stat in sdata:
                 assert_allclose(sdata[stat], func(data, axis=stats_axis), rtol=1e-5, err_msg = "%s/%s is incorrect. DIFF: %r" % (s, stat, func(data, axis=stats_axis) - sdata[stat]))
-        
+
         assert_close("SUM", np.nansum, fitsdata.astype(np.float64))
         assert_close("SUM_SQ", np.nansum, fitsdata.astype(np.float64)**2)
         assert_close("MEAN", np.nanmean)
         assert_close("MIN", np.nanmin)
         assert_close("MAX", np.nanmax)
-        
+
         assert (np.count_nonzero(np.isnan(fitsdata), axis=stats_axis) == sdata["NAN_COUNT"]).all(), "%s/NAN_COUNT is incorrect." % s
-        
+
         if s in ["XY", "XYZ"]:
             # CHECK HISTOGRAMS
-            
+
             hist = np.array(sdata["HISTOGRAM"])
             num_bins = int(max(np.sqrt(width * height), 2))
             d = np.array(hdf5data)
-            
+
             # Note: we produce a zero histogram for datasets with only one not-nan value. Numpy changes the bounds when calculating the histogram for a single value, so it always bins it somewhere.
             if ndim == len(s):
                 d = d[~np.isnan(d)]
@@ -106,17 +106,17 @@ def assert_close(stat, func, data=fitsdata):
                 rest_shape = tuple(dims[a] for a in reversed(axis_names) if a not in s)
                 rest_merged_shape = np.multiply.reduce(rest_shape)
                 hist_shape = rest_shape + (num_bins,)
-                                            
+
                 reference = np.apply_along_axis(lambda a: np.histogram(a[~np.isnan(a)].astype(np.float64), bins=num_bins)[0] if a[~np.isnan(a)].size > 1 else np.zeros(num_bins), 1, d.reshape(rest_merged_shape, stats_merged_shape)).reshape(hist_shape)
-            
+
             diff = reference - hist
-            
+
             assert hist.shape == reference.shape, "%s histogram shape %r does not match expected shape %r" % (s, hist.shape, reference.shape)
             # Note: we can't replicate the converter's binning exactly because of a precision issue, so there are occasional off-by-one bin increments.
             assert diff.sum() == 0 and diff[diff != 0].size / diff.size < 0.01, "Too many %s histogram values do not match expected values.\nSum of difference: %d\nDifference:\n%s" % (s, diff.sum(), pprint_sparse_diff(hist, reference))
-    
+
     # CHECK MIPMAPS
-    
+
     if "MipMaps" in hdf5file["0"]:
         for mname, mipmap in sorted(hdf5file["0/MipMaps"]["DATA"].items(), key=lambda x: x[1].size, reverse=True):
             if (re.search("_XYZ_", mname)):
@@ -146,7 +146,7 @@ def assert_close(stat, func, data=fitsdata):
                 assert mdepth == np.ceil(depth / zfactor)
 
             # check mipmap contents
-            
+
             def assert_mipmap_channel_equal(name, channel, d, got):
                 got = np.array(got)
                 if zmips and len(mipmap.shape) > 2:
@@ -154,7 +154,7 @@ def assert_mipmap_channel_equal(name, channel, d, got):
                 else:
                     expected = np.array([[np.nanmean(d[y*xyfactor:(y+1)*xyfactor, x*xyfactor:(x+1)*xyfactor]) for x in range(mwidth)] for y in range(mheight)])
                 assert_allclose(expected, got, rtol=1e-5, atol=1e-7, equal_nan=True, err_msg = "Mipmap %s channel %r is incorrect. \nEXPECTED:\n%r\nGOT:\n%r\nDIFF:\n%s" % (name, channel, expected, got, pprint_sparse_diff(expected, got)))
-            
+
             d = np.array(hdf5data)
             rest = mipmap.shape[:-3] if zmips else mipmap.shape[:-2]
 
@@ -176,17 +176,17 @@ def compare_hdf5_hdf5(file1, file2, fail_msg, ignore_converter_attrs=True):
     h5diff = subprocess.run(["h5diff", file1, file2], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     if h5diff.returncode == 0:
         return
-    
+
     if not ignore_converter_attrs:
         assert False, fail_msg
     else:
         ignore_attributes = ("SCHEMA_VERSION", "HDF5_CONVERTER", "HDF5_CONVERTER_VERSION")
-        
+
         output = h5diff.stdout.decode('ascii')
-        
+
         for attribute in ignore_attributes:
             output = re.sub(r"attribute: <%s of </0>> and <%s of </0>>\n\d+ differences found\n" % (attribute, attribute), "", output)
-            
+
         if output:
             print(output)
             print("(Permitted converter version attributes have been ignored.)")
@@ -204,68 +204,89 @@ def make_image(outfile, *dims, **params):
                 cmd.append(str(values))
     cmd.append("--")
     cmd.extend(str(d) for d in dims)
-    
+
     print(*cmd)
-    
+
     result = subprocess.run(cmd)
     assert result.returncode == 0, "Image generation failed."
-    
-def convert(infile, outfile, executable, slow=False, zMips=False):
+
+def convert(infile, outfile, executable, slow=False, smart=False, z_mips=False, mem_limit_in_mb=200):
     cmd = [executable]
     if slow:
+        if smart:
+            raise "Only slow or smart can be selected at once."
         cmd.append("-s")
-    if zMips:
+    if smart:
+        cmd.append("-r" + str(mem_limit_in_mb))
+    if z_mips:
         cmd.append("-z")
     cmd.extend(["-o", outfile, infile])
-    
+
     print(*cmd)
-    
+
     result = subprocess.run(cmd)
     assert result.returncode == 0, "Conversion failed."
-    
-def time(infile, executable, slow=False):
+
+def time(infile, executable, slow=False, smart=False, mem_limit_in_mb=4000):
     os.system("sudo sh -c 'sync && echo 3 > /proc/sys/vm/drop_caches'")
-    
+
     cmd = [executable]
     if slow:
         cmd.append("-s")
     cmd.extend(["-o", "TIMED.hdf5", infile])
-    
+
+    if smart:
+        cmd.append("-r" + str(mem_limit_in_mb))
+
     print(*cmd)
-    
+
     start = timer()
     result = subprocess.run(cmd)
     end = timer()
-    
+
     if result.returncode > 0:
         return None
 
     subprocess.run(["rm", "test.fits", "TIMED.hdf5"])
-        
+
     return end - start
 
 def test_converter_correctness(infile, new_converter):
     convert(infile, "FAST.hdf5", new_converter)
     convert(infile, "SLOW.hdf5", new_converter, True)
-    
+    convert(infile, "SMART.hdf5", new_converter, smart=True, mem_limit_in_mb=200)
+
     # Do this first so that we fail more quickly
     compare_hdf5_hdf5("FAST.hdf5", "SLOW.hdf5", "Fast and slow versions differ.", False)
-    
+
+    compare_hdf5_hdf5("FAST.hdf5", "SMART.hdf5", "Fast and smart versions differ.", False)
+
+    compare_hdf5_hdf5("SLOW.hdf5", "SMART.hdf5", "Slow and smart versions differ.", False)
+
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=RuntimeWarning)
         compare_fits_hdf5("test.fits", "FAST.hdf5")
-    
-    subprocess.run(["rm", "test.fits", "FAST.hdf5", "SLOW.hdf5"])
+
+    subprocess.run(["rm", "test.fits", "FAST.hdf5", "SLOW.hdf5", "SMART.hdf5"])
 
 def test_new_old_converter(infile, slow, old_converter, new_converter):
     convert(infile, "OLD.hdf5", old_converter, slow)
     convert(infile, "NEW.hdf5", new_converter, slow)
-    
+
     compare_hdf5_hdf5("OLD.hdf5", "NEW.hdf5", "Old and new versions differ.", True)
 
+# Test the smart converter for several levels of memory limits
+def test_smart_converter_levels(infile, executable):
+    for mem_limit_in_mb in [0, 5, 50, 100, 500]:
+        out_file = "SMART_" + str(mem_limit_in_mb) + "_.hdf5"
+        convert(infile, out_file, executable, smart=True, mem_limit_in_mb=mem_limit_in_mb)
+        if mem_limit_in_mb > 0:
+            compare_hdf5_hdf5("SMART_0_.hdf5", out_file, "Smart converter with memory limit %dMB differs from smart converter with no limit." % mem_limit_in_mb, True)
+    compare_fits_hdf5(infile, "SMART_0_.hdf5")
+
 def small_nans_image_set():
     image_set = []
-    
+
     for N in (2, 3, 4):
         for nans in itertools.chain((None, ("image",)), itertools.chain.from_iterable(itertools.combinations(("pixel", "row", "column", "channel", "stokes"), n) for n in range(1, 5+1))):
             for nan_density in (0, 33, 66, 100):
@@ -274,12 +295,12 @@ def small_nans_image_set():
                     dims.append(random.randint(50, 100))
                 if N > 3:
                     dims.append(random.randint(1, 4))
-                    
+
                 params = {
                     "--nans": nans,
                     "--nan-density": nan_density
                 }
-                
+
                 image_set.append((dims, params))
     return image_set
 
@@ -295,7 +316,7 @@ def small_dims_image_set():
 
 def large_mipmap_image_set():
     image_set = []
-    
+
     # A few bigger images to test multiple mipmaps
     for dims in ((5000, 200), (5000, 200, 10), (5000, 200, 10, 2)):
         for nans in (("pixel",),):
@@ -304,7 +325,7 @@ def large_mipmap_image_set():
                     "--nans": nans,
                     "--nan-density": nan_density
                 }
-                
+
                 image_set.append((dims, params))
     return image_set
 
@@ -382,25 +403,25 @@ def test_speed(args, *image_sets):
     executables = [args.executable]
     if args.compare is not None:
         executables.append(args.compare)
-    
+
     times = defaultdict(lambda: defaultdict(list))
-    
+
     for i in range(args.repeat):
         for executable in executables:
             for image_set in image_sets:
                 for dims, _ in image_set: # we only care about the dimensions
                     make_image("test.fits", *dims)
-                    t = time("test.fits", executable, args.slow)
+                    t = time("test.fits", executable, args.slow, args.smart)
                     if t is not None:
                         times[dims][executable].append(t)
-    
+
     winners = {}
     xs = set()
     zs = set()
-    
+
     print("Image dimensions", *executables, sep='\t')
     print()
-    
+
     for dims in sorted(times):
         best = [np.min(times[dims][e]) for e in executables]
         print(*dims, *best, sep='\t')
@@ -408,17 +429,17 @@ def test_speed(args, *image_sets):
         xs.add(x)
         zs.add(z)
         winners[(x, z)] = "A" if min(best) == best[0] else "B"
-    
+
     if len(executables) > 1:
         xs = sorted(xs)
         zs = sorted(zs)
-        
+
         for label, path in zip(["A", "B"], executables):
             print("%s: %s" % (label, path))
         print()
-        
+
         print('X \ Z', *zs, sep='\t')
-        
+
         for x in xs:
             print(x, end='\t')
             for z in zs:
@@ -438,14 +459,15 @@ def test_zmips_converter(infile, new_converter):
     parser.add_argument('-c', '--compare', help='A path to another converter executable to use as a reference. If a reference converter is given, random files converted using the fast algorithm with both converters will be compared to each other. If none is given, the output of the converter at the default path will be checked for correctness (THIS IS SLOW), and its fast and slow algorithm outputs will be compared for consistency.')
     parser.add_argument('-t', '--time', action='store_true', help='Time the converter(s) instead of checking the output.')
     parser.add_argument('-s', '--slow', action='store_true', help='Use the slow converter versions when comparing converters or timing converter(s).')
+    parser.add_argument('-m', '--smart', action='store_true', help='Use the smart converter versions when comparing converters or timing converter(s).')
     parser.add_argument('-i', '--image-set', nargs="+", help="The image set(s) to use. Any combination of %s. By default a small dummy set is used." % ", ".join(repr(o) for o in IMAGE_SETS) , default=["DUMMY"])
     parser.add_argument('-r', '--repeat', type=int, help="The number of times to repeat timed conversions (default: 3).", default=3)
     parser.add_argument('-z', '--zmips', action='store_true', help='Test if HDF5 conversion for depth MipMaps produces valid result.')
     parser.add_argument("executable", help="The path to the executable to test.")
     args = parser.parse_args()
-    
+
     image_sets = (IMAGE_SETS[i] for i in args.image_set)
-    
+
     if args.time:
         test_speed(args, *image_sets)
     else:
@@ -456,5 +478,7 @@ def test_zmips_converter(infile, new_converter):
                     test_new_old_converter("test.fits", args.slow, args.compare, args.executable)
                 elif args.zmips:
                     test_zmips_converter("test.fits", args.executable);
+                elif args.smart:
+                    test_smart_converter_levels("test.fits", args.executable)
                 else:
                     test_converter_correctness("test.fits", args.executable)

From 3d3ef264ce2c8334c0a1c42372ee613025939985 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Tue, 9 Apr 2024 14:40:45 +0200
Subject: [PATCH 20/26] Correct problems that were causing false results

---
 src/Converter.cc      |   4 +-
 src/Converter.h       |   7 ++
 src/FastConverter.cc  |  15 +++--
 src/MipMap.h          |  11 ++-
 src/SmartConverter.cc | 152 ++++++++++++++++++++++++++++--------------
 src/Stats.h           |   2 +-
 src/Util.cc           |  13 ++--
 src/Util.h            |   2 +-
 src/main.cc           |  18 +++--
 9 files changed, 149 insertions(+), 75 deletions(-)

diff --git a/src/Converter.cc b/src/Converter.cc
index 1c1c06b..9744af2 100644
--- a/src/Converter.cc
+++ b/src/Converter.cc
@@ -52,11 +52,11 @@ Converter::~Converter() {
     closeFitsFile(inputFilePtr);
 }
 
-std::unique_ptr<Converter> Converter::getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool smart, bool progress, bool zMips) {
+std::unique_ptr<Converter> Converter::getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool smart, bool progress, bool zMips, int memoryLimitInMb = 0) {
     if (slow) {
         return std::unique_ptr<Converter>(new SlowConverter(inputFileName, outputFileName, progress, zMips));
     } else if (smart) {
-        return std::unique_ptr<Converter>(new SmartConverter(inputFileName, outputFileName, progress, zMips));
+        return std::unique_ptr<Converter>(new SmartConverter(inputFileName, outputFileName, progress, zMips, memoryLimitInMb));
     } else {
         return std::unique_ptr<Converter>(new FastConverter(inputFileName, outputFileName, progress, zMips));
     }
diff --git a/src/Converter.h b/src/Converter.h
index 9d2a62f..0becf8f 100644
--- a/src/Converter.h
+++ b/src/Converter.h
@@ -11,6 +11,7 @@
 #include "MipMap.h"
 #include "Timer.h"
 #include "Util.h"
+#include "omp.h"
 
 struct MemoryUsage {
     MemoryUsage() : total(0) {}
@@ -24,9 +25,11 @@ class Converter {
 public:
     Converter() {}
     Converter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
+    Converter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips, int memoryLimitInMb);
     ~Converter();
     
     static std::unique_ptr<Converter> getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool smart, bool progress, bool zMips);
+    static std::unique_ptr<Converter> getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool smart, bool progress, bool zMips, int memoryLimitInMb);
     void convert();
     void reportMemoryUsage();
     virtual MemoryUsage calculateMemoryUsage() = 0;
@@ -86,8 +89,12 @@ class FastConverter : public Converter {
 class SmartConverter : public Converter {
 public:
     SmartConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
+    SmartConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMip, int memoryLimitInMb);
     MemoryUsage calculateMemoryUsage() override;
     
+private:
+    int memoryLimitInMb;
+    
 protected:
     void copyAndCalculate() override;
 };
diff --git a/src/FastConverter.cc b/src/FastConverter.cc
index 325eeeb..bf26508 100644
--- a/src/FastConverter.cc
+++ b/src/FastConverter.cc
@@ -75,10 +75,12 @@ void FastConverter::copyAndCalculate() {
 
         // First loop calculates stats for each XY slice and rotates the dataset
         
-#pragma omp parallel for
-        for (hsize_t i = 0; i < depth; i++) {
+        hsize_t i;
+        StatsCounter counterXY;
+#pragma omp parallel for default(none) private(i, counterXY) shared(standardCube, rotatedCube, statsXY, statsXYZ, statsZ, channelProgressStride, std::cout, pixelProgressStride, depth, height, width)
+        for (i = 0; i < depth; i++) {
             PROGRESS_DECIMATED(i, channelProgressStride, "|");
-            StatsCounter counterXY;
+            counterXY.reset();
             
             auto& indexXY = i;
             std::function<void(float)> accumulate;
@@ -114,6 +116,7 @@ void FastConverter::copyAndCalculate() {
             }
             
             // Final correction of XY min and max
+
             statsXY.copyStatsFromCounter(indexXY, height * width, counterXY);
         }
         
@@ -140,7 +143,7 @@ void FastConverter::copyAndCalculate() {
             PROGRESS("\tZ stats\t\t");
             TIMER(timer.start("Z statistics"););
 
-#pragma omp parallel for
+#pragma omp parallel for default(none) private(i, counterXY) shared(standardCube, rotatedCube, statsXY, statsXYZ, statsZ, channelProgressStride, std::cout, pixelProgressStride, depth, height, width)
             for (hsize_t j = 0; j < height; j++) {
                 for (hsize_t k = 0; k < width; k++) {
                     StatsCounter counterZ;
@@ -188,7 +191,7 @@ void FastConverter::copyAndCalculate() {
         statsXY.clearHistogramBuffers();
         statsXYZ.clearHistogramBuffers();
 
-#pragma omp parallel for
+#pragma omp parallel for default(none) private(i) shared(standardCube, statsXY, statsXYZ, channelProgressStride, std::cout, pixelProgressStride, depth, height, width, cubeHist, cubeMin, cubeRange)
         for (hsize_t i = 0; i < depth; i++) {
             PROGRESS_DECIMATED(i, channelProgressStride, "|");
             
@@ -277,7 +280,7 @@ void FastConverter::copyAndCalculate() {
         PROGRESS("\tMipmaps\t\t");
         TIMER(timer.start("Mipmaps"););
         
-#pragma omp parallel for
+#pragma omp parallel for default(none) private(mipMaps) shared(standardCube, channelProgressStride, std::cout, pixelProgressStride, depth, height, width)
         for (hsize_t c = 0; c < depth; c++) {
             PROGRESS_DECIMATED(c, channelProgressStride, "|");
             for (hsize_t y = 0; y < height; y++) {
diff --git a/src/MipMap.h b/src/MipMap.h
index c73f694..17a5d43 100644
--- a/src/MipMap.h
+++ b/src/MipMap.h
@@ -19,11 +19,16 @@ struct MipMap {
     void createBuffers(std::vector<hsize_t>& bufferDims);
     
     void accumulate(double val, hsize_t x, hsize_t y, hsize_t z) {
-        hsize_t mipIndex = (z / mipZ) * width * height + (y / mipXY) * width + (x / mipXY);
+        hsize_t mipIndex;
+        XYZToMipIndex(x, y, z, mipIndex);
         vals[mipIndex] += val;
         count[mipIndex]++;
     }
     
+    void XYZToMipIndex(hsize_t x, hsize_t y, hsize_t z, hsize_t& mipIndex) {
+        mipIndex = (z / mipZ) * width * height + (y / mipXY) * width + (x / mipXY);
+    }
+    
     void calculate() {
         for (hsize_t mipIndex = 0; mipIndex < bufferSize; mipIndex++) {
             if (count[mipIndex]) {
@@ -68,7 +73,9 @@ struct MipMaps {
     void createBuffers(const std::vector<hsize_t>& standardBufferDims);
     
     void accumulate(double val, hsize_t x, hsize_t y, hsize_t z) {
-        for (auto& mipMap : mipMaps) {
+        int n = mipMaps.size();
+        for (int i = 0; i < n; i++) {
+            auto& mipMap = mipMaps[i];
             mipMap.accumulate(val, x, y, z);
         }
     }
diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index b952267..d24a01a 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -5,7 +5,10 @@
 
 #include "Converter.h"
 
-SmartConverter::SmartConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips) : Converter(inputFileName, outputFileName, progress, zMips) {}
+SmartConverter::SmartConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMip, int memoryLimitInMb = 0) : Converter(inputFileName, outputFileName, progress, zMips)
+{
+    this->memoryLimitInMb = memoryLimitInMb;
+}
 
 MemoryUsage SmartConverter::calculateMemoryUsage() {
     MemoryUsage m;
@@ -37,10 +40,22 @@ void SmartConverter::copyAndCalculate() {
     hsize_t numTiles = std::ceil(width / TILE_SIZE) * std::ceil(height / TILE_SIZE);
     const hsize_t tileProgressStride = std::max((hsize_t)1, (hsize_t)(numTiles / 100));
     
+    // 32 has produced best results in testing
     const hsize_t REGION_MULTIPLIER = 32;
     
-    // Allocate one channel at a time, and no swizzled data
-    hsize_t cubeSize = height * width;
+    // Allocate one batch of slices at a time, and no swizzled data.
+    // Batch size in slices determined by memory limit.
+    // If limit not set, use 4GB.
+    
+    hsize_t memoryLimitInBytes = std::max(memoryLimitInMb, 4000) * 1024 * 1024;
+    hsize_t sliceSizeInPixels = height * width;
+    hsize_t batchLimitInPixels = memoryLimitInBytes / sizeof(float);
+    hsize_t batchLimitInSlices = std::ceil((float)batchLimitInPixels / (float)sliceSizeInPixels);
+    
+    // Increment over the full depth if the batch size is larger than the depth
+    hsize_t sliceIncrement = std::min(batchLimitInSlices, depth);
+    
+    hsize_t cubeSize = height * width * sliceIncrement;
     TIMER(timer.start("Allocate"););
     standardCube = new float[cubeSize];
     
@@ -48,13 +63,14 @@ void SmartConverter::copyAndCalculate() {
     statsXY.createBuffers({depth}, height);
     
     if (depth > 1) {
-        statsXYZ.createBuffers({});
+        statsXYZ.createBuffers({}, height);
     }
     
     mipMaps.createBuffers({1, height, width});
 
-    std::vector<hsize_t> count = trimAxes({1, 1, height, width}, N);
-    std::vector<hsize_t> memDims = {height, width};
+    std::vector<hsize_t> count = trimAxes({1, sliceIncrement, height, width}, N);
+    std::vector<hsize_t> memDims = {sliceIncrement, height, width};
+    
     
     std::string timerLabelStatsMipmaps = depth > 1 ? "XY and XYZ statistics and mipmaps" : "XY statistics and mipmaps";
 
@@ -67,13 +83,29 @@ void SmartConverter::copyAndCalculate() {
         
         StatsCounter counterXYZ;
         
-        for (hsize_t c = 0; c < depth; c++) {
+        // "working" versions of these variables are used to handle the last increment of slices that may be smaller than the batch size
+        hsize_t workingIncrement = sliceIncrement;
+        hsize_t workingCubeSize = cubeSize;
+        std::vector<hsize_t> workingCount = count;
+        std::vector<hsize_t> workingMemDims = memDims;
+        
+        for (hsize_t c = 0; c < depth; c = c + workingIncrement) {
+            hsize_t leftOverSlices = depth - c;
+            size_t actualIncrement = std::min(sliceIncrement, leftOverSlices);
+            
+            // If the last increment is smaller than the batch size, adjust the "working" variables
+            if (actualIncrement < sliceIncrement) {
+                workingIncrement = leftOverSlices;
+                workingCubeSize = height * width * leftOverSlices;
+                workingCount = trimAxes({1, leftOverSlices, height, width}, N);
+                workingMemDims = {leftOverSlices, height, width};
+            }
             PROGRESS_DECIMATED(c, channelProgressStride, "|");
             // read one channel
             DEBUG(std::cout << "+ Processing channel " << c << "... " << std::flush;);
             DEBUG(std::cout << " Reading main dataset..." << std::flush;);
             TIMER(timer.start("Read"););
-            readFitsData(inputFilePtr, c, s, cubeSize, standardCube);
+            readFitsData(inputFilePtr, c, s, workingCubeSize, standardCube);
             
             // Write the standard dataset
             
@@ -81,14 +113,16 @@ void SmartConverter::copyAndCalculate() {
             TIMER(timer.start("Write"););
             
             std::vector<hsize_t> start = trimAxes({s, c, 0, 0}, N);
-            writeHdf5Data(standardDataSet, standardCube, memDims, count, start);
+            writeHdf5Data(standardDataSet, standardCube, workingMemDims, workingCount, start);
             
             DEBUG(std::cout << " Accumulating XY stats and mipmaps..." << std::flush;);
             TIMER(timer.start(timerLabelStatsMipmaps););
 
+            for(int slice = 0; slice < actualIncrement; slice++) {
+            
             StatsCounter counterXY;
             
-            auto indexXY = c;
+            auto indexXY = c + slice;
             std::function<void(float)> accumulate;
             
             //
@@ -104,27 +138,28 @@ void SmartConverter::copyAndCalculate() {
             
             accumulate = first_accumulate;
             
-            int mipIndex;
+            int regionIndex;
             StatsCounter counterRegion;
             
-            int regionRows = std::ceil(height / (float)REGION_MULTIPLIER);
-            int regionCols = std::ceil(width / (float)REGION_MULTIPLIER);
-            int adjustedStandardCubeSize = regionRows * regionCols;
+            int regionRows = std::ceil((float)height / (float)REGION_MULTIPLIER);
+            int regionCols = std::ceil((float)width / (float)REGION_MULTIPLIER);
+            int cubeSizeInRegions = regionRows * regionCols;
             
-#pragma omp parallel for default(none) private (mipIndex, counterRegion) shared (standardCube, adjustedStandardCubeSize, mipMaps, counterXY, cubeSize, REGION_MULTIPLIER)
-            for (mipIndex = 0; mipIndex < adjustedStandardCubeSize; mipIndex += 1 ) {
+#pragma omp parallel for default(none) private (regionIndex, counterRegion) shared (slice, standardCube, cubeSizeInRegions, mipMaps, counterXY, cubeSize, REGION_MULTIPLIER)
+            for (regionIndex = 0; regionIndex < cubeSizeInRegions; regionIndex += 1 ) {
                 counterRegion.reset();
                 hsize_t x0,y0,z0;
-                MipIndexToXYZ(mipIndex, x0, y0, z0, width, height, REGION_MULTIPLIER, REGION_MULTIPLIER, 1); //use index of higher-order mipmap-space to keep lower-order mipmaps thread-safe
-                for (int y = y0; y < y0 + REGION_MULTIPLIER; y++) {
-                    for (int x = x0; x < x0 + REGION_MULTIPLIER; x++) {
-                        auto pos = y * width + x;
+                RegionIndexToXYZ(regionIndex, x0, y0, z0, width, height, REGION_MULTIPLIER, REGION_MULTIPLIER,
+                    1); //use index of higher-order mipmap-space to keep lower-order mipmaps thread-safe
+                for (hsize_t y = y0; y < y0 + REGION_MULTIPLIER; y++) {
+                    for (hsize_t x = x0; x < x0 + REGION_MULTIPLIER; x++) {
+                        auto pos = y * width + x + slice * width * height;
                         if (x >= width || y >= height) {    //check if we are out of bounds
                             continue;
                         }
-                        //std::cout << "pos: " << pos << std::endl;
                         auto& val = standardCube[pos];
                         if (std::isfinite(val)) {
+                            
                             // region statistics
                             counterRegion.accumulateFinite(val);
                     
@@ -158,13 +193,14 @@ void SmartConverter::copyAndCalculate() {
             // Write the mipmaps
             DEBUG(std::cout << " Writing mipmaps..." << std::flush;);
             TIMER(timer.start("Write"););
-            mipMaps.write(s, c);
+            mipMaps.write(s, c + slice);
             
             // Reset mipmaps before next channel
             DEBUG(std::cout << " Resetting mipmap objects..." << std::endl;);
             TIMER(timer.start(timerLabelStatsMipmaps););
             mipMaps.resetBuffers();
             
+        }
         } // end of first channel loop
         
         PROGRESS(std::endl);
@@ -201,11 +237,31 @@ void SmartConverter::copyAndCalculate() {
         
         DEBUG(std::cout << "+ Will " << (cubeHist ? "" : "not ") << "calculate cube histogram." << std::endl;);
         
-        for (hsize_t c = depth; c-- > 0; ) {
+        
+        for (hssize_t c = depth - workingIncrement; c > -1; c = c - sliceIncrement) {
+            
             DEBUG(std::cout << "+ Processing channel " << c << "... " << std::flush;);
             PROGRESS_DECIMATED(c, channelProgressStride, "|");
-            auto indexXY = c;
-                            
+            
+            
+            // read one batch of slices
+            DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+            TIMER(timer.start("Read"););
+            
+
+            readFitsData(inputFilePtr, c, s, workingCubeSize, standardCube);
+
+            DEBUG(std::cout << " Calculating histogram(s)..." << std::endl;);
+            TIMER(timer.start("Histograms"););
+
+           
+            
+        for (hsize_t slice = 0; slice < workingIncrement; slice++) {
+            
+            auto indexXY = c + slice;
+            
+            workingCubeSize = height * width * workingIncrement;
+            
             double chanMin = statsXY.minVals[indexXY];
             double chanMax = statsXY.maxVals[indexXY];
             double chanRange = chanMax - chanMin;
@@ -222,9 +278,9 @@ void SmartConverter::copyAndCalculate() {
                 statsXY.accumulatePartialHistogram(val, chanMin, chanRange, offset);
             };
             
-            auto doCubeHistogram = [&] (float val) {
+            auto doCubeHistogram = [&] (float val, hsize_t offset) {
                 // XYZ histogram
-                statsXYZ.accumulateHistogram(val, cubeMin, cubeRange, 0);
+                statsXYZ.accumulatePartialHistogram(val, cubeMin, cubeRange, offset);
             };
             
             auto doNothing = [&] (float val) {
@@ -236,40 +292,34 @@ void SmartConverter::copyAndCalculate() {
             };
             
             std::function<void(float,hsize_t)> channelHistogramFunc = doChannelHistogram;
-            std::function<void(float)> cubeHistogramFunc = doCubeHistogram;
+            std::function<void(float,hsize_t)> cubeHistogramFunc = doCubeHistogram;
             
             if (!chanHist) {
                 channelHistogramFunc = doNothingOffset;
             }
             
             if (!cubeHist) {
-                cubeHistogramFunc = doNothing;
+                cubeHistogramFunc = doNothingOffset;
             }
             
-            // read one channel
-            DEBUG(std::cout << " Reading main dataset..." << std::flush;);
-            TIMER(timer.start("Read"););
-            
-            readFitsData(inputFilePtr, c, s, cubeSize, standardCube);
-
-            DEBUG(std::cout << " Calculating histogram(s)..." << std::endl;);
-            TIMER(timer.start("Histograms"););
-
             hsize_t y ;
-#pragma omp parallel for default(none) private(y) shared (standardCube, height, width, channelHistogramFunc, cubeHistogramFunc)
-            for (y = 0; y < height; y++) {
-                for (hsize_t x = 0; x < width; x++) {
-                    auto pos = y * width + x;
-                    auto& val = standardCube[pos];
-                    if (std::isfinite(val)) {
-                        channelHistogramFunc(val, y);
-                        cubeHistogramFunc(val);
-                    }
-                }
-            } // end of XY loop
-            
-            statsXY.consolidatePartialHistogram(c);
             
+#pragma omp parallel for default(none) private(y) shared(slice, standardCube, height, width, channelHistogramFunc, cubeHistogramFunc)
+                for (y = 0; y < height; y++) {
+                    for (hsize_t x = 0; x < width; x++) {
+                        auto pos = y * width + x + slice * width * height;
+                        auto& val = standardCube[pos];
+                        if (std::isfinite(val)) {
+                            channelHistogramFunc(val, y);
+                            cubeHistogramFunc(val, y);
+                        }
+                    }
+                } // end of XY loop
+
+                statsXY.consolidateAndClearPartialHistogram(c + slice);
+                statsXYZ.consolidateAndClearPartialHistogram(0);
+            } //for loop of sliceincrement ends here
+            workingIncrement = sliceIncrement;
         } // end of second channel loop (XY and XYZ histograms)
         
         PROGRESS(std::endl);
diff --git a/src/Stats.h b/src/Stats.h
index e38fc5c..ca8449b 100644
--- a/src/Stats.h
+++ b/src/Stats.h
@@ -125,7 +125,7 @@ struct Stats {
         }
     }
     
-    void consolidatePartialHistogram(hsize_t mainOffset) {
+    void consolidateAndClearPartialHistogram(hsize_t mainOffset) {
         for (hsize_t offset = 0; offset < partialHistMultiplier; offset++) {
             for (hsize_t binIndex = 0; binIndex < numBins; binIndex++) {
                 histograms[mainOffset * numBins + binIndex] += partialHistograms[offset * numBins + binIndex];
diff --git a/src/Util.cc b/src/Util.cc
index 3b6f3aa..6f2ce6f 100644
--- a/src/Util.cc
+++ b/src/Util.cc
@@ -255,11 +255,10 @@ void readHdf5Data(H5::DataSet& dataset, float* data, const std::vector<hsize_t>&
     dataset.read(data, H5::PredType::NATIVE_FLOAT, memSpace, fileSpace);
 }
 
-void MipIndexToXYZ(hsize_t mipIndex, hsize_t& x, hsize_t& y, hsize_t& z, hsize_t width, hsize_t height, hsize_t mipX, hsize_t mipY, hsize_t mipZ) {
-    int mipHeight = height / mipY;
-    int mipWidth = width / mipX;
-    x = (mipIndex % (mipWidth * mipHeight)) % mipWidth * mipX;
-    y = (mipIndex % (mipWidth * mipHeight)) / mipWidth * mipY;
-    z = mipIndex / (mipWidth * mipHeight) * mipZ;
-    
+void RegionIndexToXYZ(hsize_t mipIndex, hsize_t& x, hsize_t& y, hsize_t& z, hsize_t width, hsize_t height, hsize_t factorX, hsize_t factorY, hsize_t factorZ) {
+    int mipHeight = (height + factorY - 1)/ factorY;      //round result up
+    int mipWidth = (width + factorX - 1) / factorX;       //round result up
+    x = (mipIndex % (mipWidth * mipHeight)) % mipWidth * factorX;
+    y = (mipIndex % (mipWidth * mipHeight)) / mipWidth * factorY;
+    z = mipIndex / (mipWidth * mipHeight) * factorZ;
 }
\ No newline at end of file
diff --git a/src/Util.h b/src/Util.h
index b6a4b01..3abec82 100644
--- a/src/Util.h
+++ b/src/Util.h
@@ -48,7 +48,7 @@ void writeHdf5Data(H5::DataSet& dataset, int64_t* data, const std::vector<hsize_
 
 void readHdf5Data(H5::DataSet& dataset, float* data, const std::vector<hsize_t>& dims, const std::vector<hsize_t>& count = EMPTY_DIMS, const std::vector<hsize_t>& start = EMPTY_DIMS);
 
-void MipIndexToXYZ(hsize_t mipIndex, hsize_t& x, hsize_t& y, hsize_t& z, hsize_t width, hsize_t height, hsize_t mipX, hsize_t mipY, hsize_t mipZ);
+void RegionIndexToXYZ(hsize_t mipIndex, hsize_t& x, hsize_t& y, hsize_t& z, hsize_t width, hsize_t height, hsize_t factorX, hsize_t factorY, hsize_t factorZ);
 
 
 #endif
diff --git a/src/main.cc b/src/main.cc
index 0b57fab..06f96ac 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -9,7 +9,7 @@
 #include <sstream>
 #include "Converter.h"
 
-bool getOptions(int argc, char** argv, std::string& inputFileName, std::string& outputFileName, bool& slow, bool& smart, bool& progress, bool& onlyReportMemory, bool& zMips) {
+bool getOptions(int argc, char** argv, std::string& inputFileName, std::string& outputFileName, bool& slow, bool& smart, bool& progress, bool& onlyReportMemory, bool& zMips, int& memoryLimitInMb) {
     extern int optind;
     extern char *optarg;
     
@@ -29,7 +29,7 @@ bool getOptions(int argc, char** argv, std::string& inputFileName, std::string&
     << "-q\tSuppress all non-error output. Deprecated; this is now the default." << std::endl
     << "-z\tInclude axis 3 in mipmap calculation (currently not compatible with -s mode)." << std::endl;
 
-    while ((opt = getopt(argc, argv, ":o:srpqmz")) != -1) {
+    while ((opt = getopt(argc, argv, ":o:sr::pqmz")) != -1) {
         switch (opt) {
             case 'o':
                 outputFileName.assign(optarg);
@@ -40,7 +40,14 @@ bool getOptions(int argc, char** argv, std::string& inputFileName, std::string&
                 break;
             case 'r':
                 // use smart method
-                smart = true;
+                if (optarg) { // If an argument is provided
+                    int arg = std::stoi(optarg); // Convert the argument to an integer
+                    if (arg < 1) {
+                        break;  // use fast mode if the limit is less than 1 mb
+                    }
+                    smart = true; // Enable smart mode
+                    memoryLimitInMb = arg; // Assign the argument to the memory limit for smart mode
+                } //else, fast mode will be used by default because memory limit is not concern for user
                 break;
             case 'p':
                 progress = true;
@@ -105,8 +112,9 @@ int main(int argc, char** argv) {
     bool progress(false);
     bool onlyReportMemory(false);
     bool zMips(false);
+    int memoryLimitInMb(0);
     
-    if (!getOptions(argc, argv, inputFileName, outputFileName, slow, smart, progress, onlyReportMemory, zMips)) {
+    if (!getOptions(argc, argv, inputFileName, outputFileName, slow, smart, progress, onlyReportMemory, zMips, memoryLimitInMb)) {
         return 1;
     }
 
@@ -137,7 +145,7 @@ int main(int argc, char** argv) {
     std::unique_ptr<Converter> converter;
         
     try {
-        converter = Converter::getConverter(inputFileName, outputFileName, slow, smart, progress, zMips);
+        converter = Converter::getConverter(inputFileName, outputFileName, slow, smart, progress, zMips, memoryLimitInMb);
         
         if (onlyReportMemory) {
             converter->reportMemoryUsage();

From 229faf49c3ec48b55fc65ff5d62850e35e0239c6 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Wed, 10 Apr 2024 10:30:34 +0200
Subject: [PATCH 21/26] Update README.md with smart mode flag

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index f3788d6..4732e42 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,7 @@ important are:
 ```
 -o      Output filename
 -s      Use slower but less memory-intensive method (enable if memory allocation fails)
+-r      Use smart mode with an optional indicated size of max memory allocation in MB (example: -r2000) 
 -p      Print progress output (by default the program is silent)
 -m      Report predicted memory usage and exit without performing the conversion
 -z      Include calculation of mipmaps along 3rd axis of dataset

From 07b5d19520d4724ebf7998bc288d631880c634b6 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Fri, 3 May 2024 13:06:41 +0200
Subject: [PATCH 22/26] Apply memory allocation limit to tile rotation

---
 src/Converter.h       |   2 +
 src/SmartConverter.cc | 269 +++++++++++++++++++++++++++---------------
 2 files changed, 178 insertions(+), 93 deletions(-)

diff --git a/src/Converter.h b/src/Converter.h
index 68f8731..c90b530 100644
--- a/src/Converter.h
+++ b/src/Converter.h
@@ -97,6 +97,8 @@ class SmartConverter : public Converter {
     
 protected:
     void copyAndCalculate() override;
+    void ReadRotateWrite(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart,
+        hsize_t xLimit, hsize_t yLimit, hsize_t xIncrement, hsize_t yIncrement);
 };
 
 class SlowConverter : public Converter {
diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index d24a01a..c2b51f1 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -45,15 +45,14 @@ void SmartConverter::copyAndCalculate() {
     
     // Allocate one batch of slices at a time, and no swizzled data.
     // Batch size in slices determined by memory limit.
-    // If limit not set, use 4GB.
-    
-    hsize_t memoryLimitInBytes = std::max(memoryLimitInMb, 4000) * 1024 * 1024;
+  
+    hsize_t memoryLimitInBytes = memoryLimitInMb * 1024 * 1024;
     hsize_t sliceSizeInPixels = height * width;
-    hsize_t batchLimitInPixels = memoryLimitInBytes / sizeof(float);
-    hsize_t batchLimitInSlices = std::ceil((float)batchLimitInPixels / (float)sliceSizeInPixels);
+    hsize_t memoryLimitInPixels = memoryLimitInBytes / sizeof(float);
+    hsize_t memoryLimitInSlices = std::ceil((float)memoryLimitInPixels / (float)sliceSizeInPixels);
     
-    // Increment over the full depth if the batch size is larger than the depth
-    hsize_t sliceIncrement = std::min(batchLimitInSlices, depth);
+    // Increment over the full depth if the memory size limit is larger than the depth
+    hsize_t sliceIncrement = std::min(memoryLimitInSlices, depth);
     
     hsize_t cubeSize = height * width * sliceIncrement;
     TIMER(timer.start("Allocate"););
@@ -237,25 +236,23 @@ void SmartConverter::copyAndCalculate() {
         
         DEBUG(std::cout << "+ Will " << (cubeHist ? "" : "not ") << "calculate cube histogram." << std::endl;);
         
-        
-        for (hssize_t c = depth - workingIncrement; c > -1; c = c - sliceIncrement) {
+        hsize_t startingSlice = depth - workingIncrement;
+        for (hssize_t c = startingSlice; c > -1; c = c - sliceIncrement) {
             
             DEBUG(std::cout << "+ Processing channel " << c << "... " << std::flush;);
             PROGRESS_DECIMATED(c, channelProgressStride, "|");
             
+            // skip first batch of slices to prevent unnecessary file read
+            if (c < startingSlice) {
+                DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+                TIMER(timer.start("Read"););
+                readFitsData(inputFilePtr, c, s, workingCubeSize, standardCube);
+            }
             
-            // read one batch of slices
-            DEBUG(std::cout << " Reading main dataset..." << std::flush;);
-            TIMER(timer.start("Read"););
-            
-
-            readFitsData(inputFilePtr, c, s, workingCubeSize, standardCube);
-
             DEBUG(std::cout << " Calculating histogram(s)..." << std::endl;);
             TIMER(timer.start("Histograms"););
 
-           
-            
+
         for (hsize_t slice = 0; slice < workingIncrement; slice++) {
             
             auto indexXY = c + slice;
@@ -319,7 +316,7 @@ void SmartConverter::copyAndCalculate() {
                 statsXY.consolidateAndClearPartialHistogram(c + slice);
                 statsXYZ.consolidateAndClearPartialHistogram(0);
             } //for loop of sliceincrement ends here
-            workingIncrement = sliceIncrement;
+            workingIncrement = sliceIncrement; //reset workingIncrement to sliceIncrement for remainder of depth
         } // end of second channel loop (XY and XYZ histograms)
         
         PROGRESS(std::endl);
@@ -341,17 +338,28 @@ void SmartConverter::copyAndCalculate() {
     TIMER(timer.start("Free"););
     
     delete[] standardCube;
-            
+    
     // Swizzle
     if (depth > 1) {
         DEBUG(std::cout << "Performing tiled rotation." << std::endl;);
         PROGRESS("Tiled rotation & Z stats" << std::endl);
         TIMER(timer.start("Allocate"););
         
-        hsize_t sliceSize = product(trimAxes({stokes, depth, TILE_SIZE, TILE_SIZE}, N));
-        float* standardSlice = new float[sliceSize];
-        float* rotatedSlice = new float[sliceSize];
+        hsize_t tileSize = product(trimAxes({stokes, depth, TILE_SIZE, TILE_SIZE}, N));
+        if (2 * tileSize > memoryLimitInPixels) {
+            hsize_t memoryRequiredInMb = 2 * tileSize * sizeof(float) / 1024 / 1024;
+            std::cerr << "Memory limit too low for tiled rotation. Minimum: " << std::to_string(memoryRequiredInMb) << " mb.";
+            std::exit(EXIT_FAILURE);
+        }
+        hsize_t memoryLimitInTiles = std::floor((float)memoryLimitInPixels / (float)tileSize / 2); //divide by 2 because we have standard and rotated slices
+        float widthInTiles = (float)width / (float)TILE_SIZE;
+        float heightInTiles = (float)height / (float)TILE_SIZE;
+        hsize_t TotalFullTiles = std::ceil(widthInTiles) * std::ceil(heightInTiles);
+        hsize_t maxTilesAtATime = std::min(TotalFullTiles, memoryLimitInTiles);
         
+        //Allocate memory for main and rotated dataset slices and Z statistics
+        float* standardSlice = new float[tileSize * maxTilesAtATime];
+        float* rotatedSlice = new float[tileSize * maxTilesAtATime];
         statsZ.createBuffers({TILE_SIZE, TILE_SIZE});
         
         for (unsigned int s = 0; s < stokes; s++) {
@@ -360,82 +368,78 @@ void SmartConverter::copyAndCalculate() {
             
             hsize_t tileCount(0);
             
-            for (hsize_t xOffset = 0; xOffset < width; xOffset += TILE_SIZE) {
-                for (hsize_t yOffset = 0; yOffset < height; yOffset += TILE_SIZE) {
-                    tileCount++;
-                    hsize_t xSize = std::min(TILE_SIZE, width - xOffset);
-                    hsize_t ySize = std::min(TILE_SIZE, height - yOffset);
-                    
-                    DEBUG(std::cout << "+ Processing tile slice at " << xOffset << ", " << yOffset << "..." << std::flush;);
-                    PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
-                    
-                    // read tile slice
-                    DEBUG(std::cout << " Reading main dataset..." << std::flush;);
-                    TIMER(timer.start("Read"););
-                    
-                    auto standardMemDims = trimAxes({1, depth, ySize, xSize}, N);
-                    auto standardCount = trimAxes({1, depth, ySize, xSize}, N);
-                    auto standardStart = trimAxes({s, 0, yOffset, xOffset}, N);
-                    
-                    readHdf5Data(standardDataSet, standardSlice, standardMemDims, standardCount, standardStart);
-                    
-                    // rotate tile slice
-                    DEBUG(std::cout << " Calculating rotation..." << std::flush;);
-                    TIMER(timer.start("Rotation"););
-                    
-                    for (hsize_t i = 0; i < depth; i++) {
-                        for (hsize_t j = 0; j < ySize; j++) {
-                            for (hsize_t k = 0; k < xSize; k++) {
-                                auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
-                                auto& val = standardSlice[sourceIndex];
-                                
-                                // rotation
-                                auto destIndex = i + depth * j + (ySize * depth) * k;
-                                rotatedSlice[destIndex] = val;
-                            }
+            // Calculate excess pixels that won't fit into full tiles
+            hsize_t rightStripeWidthInPixels = width % TILE_SIZE;
+            hsize_t bottomStripeHeightInPixels = height % TILE_SIZE;
+            
+            hsize_t mainBlockWidthInTiles = (width - rightStripeWidthInPixels) / TILE_SIZE;
+            hsize_t mainBlockHeightInTiles = (height - bottomStripeHeightInPixels) / TILE_SIZE;
+            
+            hsize_t xTileIncrement = 1;
+            hsize_t yTileIncrement = 1;
+            
+            // Main block of full tiles
+            if (!(mainBlockWidthInTiles == 0 && mainBlockHeightInTiles == 0)) {
+                if (maxTilesAtATime < mainBlockWidthInTiles) {
+                    hsize_t possibleIncrement = 1;
+                    while (possibleIncrement < maxTilesAtATime) {
+                        possibleIncrement++;
+                        if (mainBlockWidthInTiles % possibleIncrement == 0) {
+                            xTileIncrement = possibleIncrement;
                         }
                     }
-                    
-                    // A separate pass over the same slice depth-last
-                    DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
-                    TIMER(timer.start("Z statistics"););
-                    
-                    for (hsize_t j = 0; j < ySize; j++) {
-                        for (hsize_t k = 0; k < xSize; k++) {
-                            StatsCounter counterZ;
-                            auto indexZ = k + xSize * j;
-                            
-                            for (hsize_t i = 0; i < depth; i++) {
-                                auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
-                                auto& val = standardSlice[sourceIndex];
-                                
-                                if (std::isfinite(val)) {
-                                    // Not lazy; too much risk of encountering an ascending / descending sequence.
-                                    counterZ.accumulateFinite(val);
-                                } else {
-                                    counterZ.accumulateNonFinite();
-                                }
-                            }
-                            
-                            statsZ.copyStatsFromCounter(indexZ, depth, counterZ);
+                }
+                else {
+                    xTileIncrement = mainBlockWidthInTiles;
+                    hsize_t possibleIncrement = 1;
+                    while (possibleIncrement * xTileIncrement < maxTilesAtATime) {
+                        possibleIncrement++;
+                        if (mainBlockHeightInTiles % possibleIncrement == 0) {
+                            yTileIncrement = possibleIncrement;
                         }
                     }
-                    
-                    // write tile slice
-                    DEBUG(std::cout << " Writing rotated dataset..." << std::endl;);
-                    TIMER(timer.start("Write"););
-                    
-                    auto swizzledMemDims = trimAxes({1, xSize, ySize, depth}, N);
-                    auto swizzledCount = trimAxes({1, xSize, ySize, depth}, N);
-                    auto swizzledStart = trimAxes({s, xOffset, yOffset, 0}, N);
-                    
-                    writeHdf5Data(swizzledDataSet, rotatedSlice, swizzledMemDims, swizzledCount, swizzledStart);
-                    
-                    DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
-                    // write Z statistics
-                    statsZ.write({ySize, xSize}, {1, ySize, xSize}, {s, yOffset, xOffset});
                 }
+                DEBUG(std::cout << "+ Processing main block tiles at " << 0 << ", " << 0 << "..." << std::flush;);
+                PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                ReadRotateWrite(standardSlice, rotatedSlice, s, 0, 0, mainBlockWidthInTiles * TILE_SIZE, mainBlockHeightInTiles * TILE_SIZE,
+                    xTileIncrement * TILE_SIZE, yTileIncrement * TILE_SIZE);
+                tileCount += xTileIncrement * yTileIncrement;
+            }
+            
+            // Bottom stripe of partial tiles
+            if (bottomStripeHeightInPixels > 0 && mainBlockWidthInTiles > 0) {
+                DEBUG(std::cout << "+ Processing bottom stripe of tiles at " << 0 << ", " << mainBlockHeightInTiles * TILE_SIZE << "..." << std::flush;);
+                yTileIncrement = 1;
+                if (maxTilesAtATime > mainBlockWidthInTiles)
+                    xTileIncrement = mainBlockWidthInTiles;
+                else
+                    xTileIncrement = maxTilesAtATime;
+                ReadRotateWrite(standardSlice, rotatedSlice, s, 0, mainBlockHeightInTiles * TILE_SIZE, mainBlockWidthInTiles * TILE_SIZE,
+                    height, xTileIncrement * TILE_SIZE, bottomStripeHeightInPixels);
+                tileCount += mainBlockWidthInTiles;
+                PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+            }
+            
+            //Right-side stripe of partial tiles
+            if (rightStripeWidthInPixels > 0 && mainBlockHeightInTiles > 0) {
+                DEBUG(std::cout << "+ Processing right stripe of tiles at " << mainBlockWidthInTiles * TILE_SIZE << ", " << 0 << "..." << std::flush;);
+                xTileIncrement = 1;
+                if (maxTilesAtATime > mainBlockHeightInTiles)
+                    yTileIncrement = mainBlockHeightInTiles;
+                else
+                    yTileIncrement = maxTilesAtATime;
+                ReadRotateWrite(standardSlice, rotatedSlice, s, mainBlockWidthInTiles * TILE_SIZE, 0, width,
+                    mainBlockHeightInTiles * TILE_SIZE, rightStripeWidthInPixels, yTileIncrement * TILE_SIZE);
+                tileCount += mainBlockHeightInTiles;
+                PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
             }
+            
+            // Bottom-right partial tile that remains
+            DEBUG(std::cout << "+ Processing remainder partial tile in bottom right " << mainBlockWidthInTiles * TILE_SIZE << ", " << mainBlockHeightInTiles * TILE_SIZE << "..." << std::flush;);
+            ReadRotateWrite(standardSlice, rotatedSlice, s, mainBlockWidthInTiles * TILE_SIZE, mainBlockHeightInTiles * TILE_SIZE, width,
+                height, rightStripeWidthInPixels, bottomStripeHeightInPixels);
+            tileCount++;
+            PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
             PROGRESS(std::endl);
         }
         
@@ -445,3 +449,82 @@ void SmartConverter::copyAndCalculate() {
         delete[] rotatedSlice;
     }
 }
+void SmartConverter::ReadRotateWrite(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart,
+    hsize_t xLimit, hsize_t yLimit, hsize_t xIncrement, hsize_t yIncrement) {
+    
+    for (hsize_t yOffset = yStart; yOffset < yLimit; yOffset += yIncrement) {
+        // If remaining rows are less than the increment, adjust the size of the processed slice
+        hsize_t ySize = yOffset + yIncrement > yLimit ? yLimit - yOffset : yIncrement;
+        for (hsize_t xOffset = xStart; xOffset < xLimit; xOffset += xIncrement) {
+            // If remaining columns are less than the increment, adjust the size of the processed slice
+            hsize_t xSize = xOffset + xIncrement > xLimit ? xLimit - xOffset : xIncrement;
+            // read tile slice
+            DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+            TIMER(timer.start("Read"););
+            
+            auto standardMemDims = trimAxes({1, depth, ySize, xSize}, N);
+            auto standardCount = trimAxes({1, depth, ySize, xSize}, N);
+            auto standardStart = trimAxes({s, 0, yOffset, xOffset}, N);
+            
+            readHdf5Data(standardDataSet, standardSliceToRead, standardMemDims, standardCount, standardStart);
+            
+            // rotate tile slice
+            DEBUG(std::cout << " Calculating rotation..." << std::flush;);
+            TIMER(timer.start("Rotation"););
+            
+            hsize_t i;
+#pragma omp parallel for default(none) private (i) shared (depth, xSize, ySize, xStart, yStart, standardSliceToRead, rotatedSliceToWrite)
+            for (i = 0; i < depth; i++) {
+                for (hsize_t j = yStart; j < ySize; j++) {
+                    for (hsize_t k = xStart; k < xSize; k++) {
+                        auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                        auto& val = standardSliceToRead[sourceIndex];
+                
+                        // rotation
+                        auto destIndex = i + depth * j + (ySize * depth) * k;
+                        rotatedSliceToWrite[destIndex] = val;
+                    }
+                }
+            }
+            
+            // A separate pass over the same slice depth-last
+            DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
+            TIMER(timer.start("Z statistics"););
+            
+            for (hsize_t j = yStart; j < ySize; j++) {
+                for (hsize_t k = xStart; k < xSize; k++) {
+                    StatsCounter counterZ;
+                    auto indexZ = k + xSize * j;
+                    
+                    for (hsize_t i = 0; i < depth; i++) {
+                        auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                        auto& val = standardSliceToRead[sourceIndex];
+                        
+                        if (std::isfinite(val)) {
+                            // Not lazy; too much risk of encountering an ascending / descending sequence.
+                            counterZ.accumulateFinite(val);
+                        } else {
+                            counterZ.accumulateNonFinite();
+                        }
+                    }
+                    
+                    statsZ.copyStatsFromCounter(indexZ, depth, counterZ);
+                }
+            }
+
+            // write tile slice
+            DEBUG(std::cout << " Writing rotated dataset..." << std::endl;);
+            TIMER(timer.start("Write"););
+            
+            auto swizzledMemDims = trimAxes({1, xSize, ySize, depth}, N);
+            auto swizzledCount = trimAxes({1, xSize, ySize, depth}, N);
+            auto swizzledStart = trimAxes({s, xOffset, yOffset, 0}, N);
+            
+            writeHdf5Data(swizzledDataSet, rotatedSliceToWrite, swizzledMemDims, swizzledCount, swizzledStart);
+            
+            DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
+            // write Z statistics
+            statsZ.write({ySize, xSize}, {1, ySize, xSize}, {s, yOffset, xOffset});
+        }
+    }
+}

From b611940a823845a90e69f857d895a965e52d15fa Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 6 May 2024 16:28:26 +0200
Subject: [PATCH 23/26] Fixed bug with the rotation and zStats

---
 src/SmartConverter.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index c2b51f1..599074d 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -475,8 +475,8 @@ void SmartConverter::ReadRotateWrite(float* standardSliceToRead, float* rotatedS
             hsize_t i;
 #pragma omp parallel for default(none) private (i) shared (depth, xSize, ySize, xStart, yStart, standardSliceToRead, rotatedSliceToWrite)
             for (i = 0; i < depth; i++) {
-                for (hsize_t j = yStart; j < ySize; j++) {
-                    for (hsize_t k = xStart; k < xSize; k++) {
+                for (hsize_t j = 0; j < ySize; j++) {
+                    for (hsize_t k = 0; k < xSize; k++) {
                         auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
                         auto& val = standardSliceToRead[sourceIndex];
                 
@@ -491,8 +491,8 @@ void SmartConverter::ReadRotateWrite(float* standardSliceToRead, float* rotatedS
             DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
             TIMER(timer.start("Z statistics"););
             
-            for (hsize_t j = yStart; j < ySize; j++) {
-                for (hsize_t k = xStart; k < xSize; k++) {
+            for (hsize_t j = 0; j < ySize; j++) {
+                for (hsize_t k = 0; k < xSize; k++) {
                     StatsCounter counterZ;
                     auto indexZ = k + xSize * j;
                     

From c10c1ab4f0d5b68e58df0e87efc4e2ad26003b33 Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 20 May 2024 12:40:15 +0200
Subject: [PATCH 24/26] Fixed issues with FastConverter not producing correct
 mipmaps

---
 src/FastConverter.cc | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/FastConverter.cc b/src/FastConverter.cc
index bf26508..3401678 100644
--- a/src/FastConverter.cc
+++ b/src/FastConverter.cc
@@ -77,7 +77,7 @@ void FastConverter::copyAndCalculate() {
         
         hsize_t i;
         StatsCounter counterXY;
-#pragma omp parallel for default(none) private(i, counterXY) shared(standardCube, rotatedCube, statsXY, statsXYZ, statsZ, channelProgressStride, std::cout, pixelProgressStride, depth, height, width)
+#pragma omp parallel for default(none) private(i, counterXY) shared(channelProgressStride, std::cout)
         for (i = 0; i < depth; i++) {
             PROGRESS_DECIMATED(i, channelProgressStride, "|");
             counterXY.reset();
@@ -143,8 +143,9 @@ void FastConverter::copyAndCalculate() {
             PROGRESS("\tZ stats\t\t");
             TIMER(timer.start("Z statistics"););
 
-#pragma omp parallel for default(none) private(i, counterXY) shared(standardCube, rotatedCube, statsXY, statsXYZ, statsZ, channelProgressStride, std::cout, pixelProgressStride, depth, height, width)
-            for (hsize_t j = 0; j < height; j++) {
+            hsize_t j;
+#pragma omp parallel for default(none) private(j) shared(pixelProgressStride, std::cout)
+            for (j = 0; j < height; j++) {
                 for (hsize_t k = 0; k < width; k++) {
                     StatsCounter counterZ;
                     
@@ -191,8 +192,8 @@ void FastConverter::copyAndCalculate() {
         statsXY.clearHistogramBuffers();
         statsXYZ.clearHistogramBuffers();
 
-#pragma omp parallel for default(none) private(i) shared(standardCube, statsXY, statsXYZ, channelProgressStride, std::cout, pixelProgressStride, depth, height, width, cubeHist, cubeMin, cubeRange)
-        for (hsize_t i = 0; i < depth; i++) {
+#pragma omp parallel for default(none) private(i) shared(channelProgressStride, std::cout, cubeHist, cubeMin, cubeRange)
+        for (i = 0; i < depth; i++) {
             PROGRESS_DECIMATED(i, channelProgressStride, "|");
             
             auto& indexXY = i;
@@ -280,8 +281,9 @@ void FastConverter::copyAndCalculate() {
         PROGRESS("\tMipmaps\t\t");
         TIMER(timer.start("Mipmaps"););
         
-#pragma omp parallel for default(none) private(mipMaps) shared(standardCube, channelProgressStride, std::cout, pixelProgressStride, depth, height, width)
-        for (hsize_t c = 0; c < depth; c++) {
+        hsize_t c;
+#pragma omp parallel for default(none) private(c) shared(channelProgressStride, std::cout)
+        for (c = 0; c < depth; c++) {
             PROGRESS_DECIMATED(c, channelProgressStride, "|");
             for (hsize_t y = 0; y < height; y++) {
                 for (hsize_t x = 0; x < width; x++) {

From f058036dcb590c32c117ec283bfc1299b32782da Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 20 May 2024 12:42:01 +0200
Subject: [PATCH 25/26] Changed how 0 memory limit is handled

---
 src/Converter.cc | 2 +-
 src/main.cc      | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/Converter.cc b/src/Converter.cc
index 9744af2..02699a4 100644
--- a/src/Converter.cc
+++ b/src/Converter.cc
@@ -55,7 +55,7 @@ Converter::~Converter() {
 std::unique_ptr<Converter> Converter::getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool smart, bool progress, bool zMips, int memoryLimitInMb = 0) {
     if (slow) {
         return std::unique_ptr<Converter>(new SlowConverter(inputFileName, outputFileName, progress, zMips));
-    } else if (smart) {
+    } else if (smart && memoryLimitInMb > 0) {
         return std::unique_ptr<Converter>(new SmartConverter(inputFileName, outputFileName, progress, zMips, memoryLimitInMb));
     } else {
         return std::unique_ptr<Converter>(new FastConverter(inputFileName, outputFileName, progress, zMips));
diff --git a/src/main.cc b/src/main.cc
index 06f96ac..17b6f25 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -42,13 +42,10 @@ bool getOptions(int argc, char** argv, std::string& inputFileName, std::string&
                 // use smart method
                 if (optarg) { // If an argument is provided
                     int arg = std::stoi(optarg); // Convert the argument to an integer
-                    if (arg < 1) {
-                        break;  // use fast mode if the limit is less than 1 mb
-                    }
                     smart = true; // Enable smart mode
                     memoryLimitInMb = arg; // Assign the argument to the memory limit for smart mode
                 } //else, fast mode will be used by default because memory limit is not concern for user
-                break;
+                memoryLimitInMb = 0; // Assign 0 to the memory limit for smart mode because no argument is provided
             case 'p':
                 progress = true;
                 break;

From c5479f217635ee218510054c6f51d0a2f51d5e8d Mon Sep 17 00:00:00 2001
From: Alex Sivitilli <alexandersivitilli@gmail.com>
Date: Mon, 20 May 2024 12:43:36 +0200
Subject: [PATCH 26/26] Add in allowance for memory limitation smaller than
 tile size

---
 src/Converter.h       |   4 +-
 src/SmartConverter.cc | 284 +++++++++++++++++++++++++++++-------------
 src/Stats.h           |  10 ++
 3 files changed, 207 insertions(+), 91 deletions(-)

diff --git a/src/Converter.h b/src/Converter.h
index c90b530..a3abebc 100644
--- a/src/Converter.h
+++ b/src/Converter.h
@@ -97,8 +97,10 @@ class SmartConverter : public Converter {
     
 protected:
     void copyAndCalculate() override;
-    void ReadRotateWrite(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart,
+    void ReadRotateWriteFullDepth(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart,
         hsize_t xLimit, hsize_t yLimit, hsize_t xIncrement, hsize_t yIncrement);
+    void ReadRotateWritePartialDepth(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart,
+        hsize_t zStart, hsize_t xLimit, hsize_t yLimit, hsize_t zLimit, hsize_t xIncrement, hsize_t yIncrement, hsize_t zIncrement);
 };
 
 class SlowConverter : public Converter {
diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
index 599074d..064fa20 100644
--- a/src/SmartConverter.cc
+++ b/src/SmartConverter.cc
@@ -345,111 +345,132 @@ void SmartConverter::copyAndCalculate() {
         PROGRESS("Tiled rotation & Z stats" << std::endl);
         TIMER(timer.start("Allocate"););
         
-        hsize_t tileSize = product(trimAxes({stokes, depth, TILE_SIZE, TILE_SIZE}, N));
-        if (2 * tileSize > memoryLimitInPixels) {
-            hsize_t memoryRequiredInMb = 2 * tileSize * sizeof(float) / 1024 / 1024;
-            std::cerr << "Memory limit too low for tiled rotation. Minimum: " << std::to_string(memoryRequiredInMb) << " mb.";
-            std::exit(EXIT_FAILURE);
-        }
-        hsize_t memoryLimitInTiles = std::floor((float)memoryLimitInPixels / (float)tileSize / 2); //divide by 2 because we have standard and rotated slices
-        float widthInTiles = (float)width / (float)TILE_SIZE;
-        float heightInTiles = (float)height / (float)TILE_SIZE;
-        hsize_t TotalFullTiles = std::ceil(widthInTiles) * std::ceil(heightInTiles);
-        hsize_t maxTilesAtATime = std::min(TotalFullTiles, memoryLimitInTiles);
+        hsize_t tileSize = product(trimAxes({stokes, depth, std::min(width, TILE_SIZE) , std::min(height, TILE_SIZE)}, N));
         
-        //Allocate memory for main and rotated dataset slices and Z statistics
-        float* standardSlice = new float[tileSize * maxTilesAtATime];
-        float* rotatedSlice = new float[tileSize * maxTilesAtATime];
-        statsZ.createBuffers({TILE_SIZE, TILE_SIZE});
+        float* standardSlice;
+        float* rotatedSlice;
         
-        for (unsigned int s = 0; s < stokes; s++) {
-            DEBUG(std::cout << "Processing Stokes " << s << "..." << std::endl;);
-            PROGRESS("\tStokes " << s << "\t");
-            
-            hsize_t tileCount(0);
-            
-            // Calculate excess pixels that won't fit into full tiles
-            hsize_t rightStripeWidthInPixels = width % TILE_SIZE;
-            hsize_t bottomStripeHeightInPixels = height % TILE_SIZE;
-            
-            hsize_t mainBlockWidthInTiles = (width - rightStripeWidthInPixels) / TILE_SIZE;
-            hsize_t mainBlockHeightInTiles = (height - bottomStripeHeightInPixels) / TILE_SIZE;
-            
-            hsize_t xTileIncrement = 1;
-            hsize_t yTileIncrement = 1;
-            
-            // Main block of full tiles
-            if (!(mainBlockWidthInTiles == 0 && mainBlockHeightInTiles == 0)) {
-                if (maxTilesAtATime < mainBlockWidthInTiles) {
-                    hsize_t possibleIncrement = 1;
-                    while (possibleIncrement < maxTilesAtATime) {
-                        possibleIncrement++;
-                        if (mainBlockWidthInTiles % possibleIncrement == 0) {
-                            xTileIncrement = possibleIncrement;
+        
+        // if the memory limit is too low for a single tile at full-depth, handle partial tiles in z-direction
+        if (2 * tileSize > memoryLimitInPixels) {
+            hsize_t maxSlicesAtOnce = memoryLimitInPixels / (2 * std::min(width, TILE_SIZE) * std::min(height, TILE_SIZE));
+            // Allocate memory for main and rotated dataset slices and Z statistics
+            standardSlice = new float[std::min(width, TILE_SIZE) * std::min(height, TILE_SIZE) * maxSlicesAtOnce];
+            rotatedSlice = new float[std::min(width, TILE_SIZE) * std::min(height, TILE_SIZE) * maxSlicesAtOnce];
+            statsZ.createBuffers({width, height});
+            
+            for (unsigned int s = 0; s < stokes; s++) {
+                DEBUG(std::cout << "Processing Stokes " << s << "..." << std::endl;);
+                PROGRESS("\tStokes " << s << "\t");
+
+                hsize_t tileCount(0);
+                ReadRotateWritePartialDepth(standardSlice, rotatedSlice, s, 0, 0, 0, width, height, depth, std::min(width, TILE_SIZE), std::min(height, TILE_SIZE), maxSlicesAtOnce);
+                
+            }
+            // otherwise, handle full-depth groups of tiles at a time that fit within the memory limit
+        } else {
+            hsize_t memoryLimitInTiles =
+                std::floor((float)memoryLimitInPixels / (float)tileSize / 2); // divide by 2 because we have standard and rotated slices
+            float widthInTiles = (float)width / (float)TILE_SIZE;
+            float heightInTiles = (float)height / (float)TILE_SIZE;
+            hsize_t TotalFullTiles = std::ceil(widthInTiles) * std::ceil(heightInTiles);
+            hsize_t maxTilesAtATime = std::min(TotalFullTiles, memoryLimitInTiles);
+
+            // Allocate memory for main and rotated dataset slices and Z statistics
+            standardSlice = new float[tileSize * maxTilesAtATime];
+            rotatedSlice = new float[tileSize * maxTilesAtATime];
+            statsZ.createBuffers({TILE_SIZE, TILE_SIZE});
+
+            for (unsigned int s = 0; s < stokes; s++) {
+                DEBUG(std::cout << "Processing Stokes " << s << "..." << std::endl;);
+                PROGRESS("\tStokes " << s << "\t");
+
+                hsize_t tileCount(0);
+
+                // Calculate excess pixels that won't fit into full tiles
+                hsize_t rightStripeWidthInPixels = width % TILE_SIZE;
+                hsize_t bottomStripeHeightInPixels = height % TILE_SIZE;
+
+                hsize_t mainBlockWidthInTiles = (width - rightStripeWidthInPixels) / TILE_SIZE;
+                hsize_t mainBlockHeightInTiles = (height - bottomStripeHeightInPixels) / TILE_SIZE;
+
+                hsize_t xTileIncrement = 1;
+                hsize_t yTileIncrement = 1;
+
+                // Main block of full tiles
+                if (!(mainBlockWidthInTiles == 0 && mainBlockHeightInTiles == 0)) {
+                    if (maxTilesAtATime < mainBlockWidthInTiles) {
+                        hsize_t possibleIncrement = 1;
+                        while (possibleIncrement < maxTilesAtATime) {
+                            possibleIncrement++;
+                            if (mainBlockWidthInTiles % possibleIncrement == 0) {
+                                xTileIncrement = possibleIncrement;
+                            }
                         }
-                    }
-                }
-                else {
-                    xTileIncrement = mainBlockWidthInTiles;
-                    hsize_t possibleIncrement = 1;
-                    while (possibleIncrement * xTileIncrement < maxTilesAtATime) {
-                        possibleIncrement++;
-                        if (mainBlockHeightInTiles % possibleIncrement == 0) {
-                            yTileIncrement = possibleIncrement;
+                    } else {
+                        xTileIncrement = mainBlockWidthInTiles;
+                        hsize_t possibleIncrement = 1;
+                        while (possibleIncrement * xTileIncrement < maxTilesAtATime) {
+                            possibleIncrement++;
+                            if (mainBlockHeightInTiles % possibleIncrement == 0) {
+                                yTileIncrement = possibleIncrement;
+                            }
                         }
                     }
+                    DEBUG(std::cout << "+ Processing main block tiles at " << 0 << ", " << 0 << "..." << std::flush;);
+                    PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                    ReadRotateWriteFullDepth(standardSlice, rotatedSlice, s, 0, 0, mainBlockWidthInTiles * TILE_SIZE,
+                        mainBlockHeightInTiles * TILE_SIZE, xTileIncrement * TILE_SIZE, yTileIncrement * TILE_SIZE);
+                    tileCount += xTileIncrement * yTileIncrement;
                 }
-                DEBUG(std::cout << "+ Processing main block tiles at " << 0 << ", " << 0 << "..." << std::flush;);
-                PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
-                ReadRotateWrite(standardSlice, rotatedSlice, s, 0, 0, mainBlockWidthInTiles * TILE_SIZE, mainBlockHeightInTiles * TILE_SIZE,
-                    xTileIncrement * TILE_SIZE, yTileIncrement * TILE_SIZE);
-                tileCount += xTileIncrement * yTileIncrement;
-            }
-            
-            // Bottom stripe of partial tiles
-            if (bottomStripeHeightInPixels > 0 && mainBlockWidthInTiles > 0) {
-                DEBUG(std::cout << "+ Processing bottom stripe of tiles at " << 0 << ", " << mainBlockHeightInTiles * TILE_SIZE << "..." << std::flush;);
-                yTileIncrement = 1;
-                if (maxTilesAtATime > mainBlockWidthInTiles)
-                    xTileIncrement = mainBlockWidthInTiles;
-                else
-                    xTileIncrement = maxTilesAtATime;
-                ReadRotateWrite(standardSlice, rotatedSlice, s, 0, mainBlockHeightInTiles * TILE_SIZE, mainBlockWidthInTiles * TILE_SIZE,
-                    height, xTileIncrement * TILE_SIZE, bottomStripeHeightInPixels);
-                tileCount += mainBlockWidthInTiles;
-                PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
-            }
-            
-            //Right-side stripe of partial tiles
-            if (rightStripeWidthInPixels > 0 && mainBlockHeightInTiles > 0) {
-                DEBUG(std::cout << "+ Processing right stripe of tiles at " << mainBlockWidthInTiles * TILE_SIZE << ", " << 0 << "..." << std::flush;);
-                xTileIncrement = 1;
-                if (maxTilesAtATime > mainBlockHeightInTiles)
-                    yTileIncrement = mainBlockHeightInTiles;
-                else
-                    yTileIncrement = maxTilesAtATime;
-                ReadRotateWrite(standardSlice, rotatedSlice, s, mainBlockWidthInTiles * TILE_SIZE, 0, width,
-                    mainBlockHeightInTiles * TILE_SIZE, rightStripeWidthInPixels, yTileIncrement * TILE_SIZE);
-                tileCount += mainBlockHeightInTiles;
+
+                // Bottom stripe of partial tiles
+                if (bottomStripeHeightInPixels > 0 && mainBlockWidthInTiles > 0) {
+                    DEBUG(std::cout << "+ Processing bottom stripe of tiles at " << 0 << ", " << mainBlockHeightInTiles * TILE_SIZE << "..."
+                                    << std::flush;);
+                    yTileIncrement = 1;
+                    if (maxTilesAtATime > mainBlockWidthInTiles)
+                        xTileIncrement = mainBlockWidthInTiles;
+                    else
+                        xTileIncrement = maxTilesAtATime;
+                    ReadRotateWriteFullDepth(standardSlice, rotatedSlice, s, 0, mainBlockHeightInTiles * TILE_SIZE,
+                        mainBlockWidthInTiles * TILE_SIZE, height, xTileIncrement * TILE_SIZE, bottomStripeHeightInPixels);
+                    tileCount += mainBlockWidthInTiles;
+                    PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                }
+
+                // Right-side stripe of partial tiles
+                if (rightStripeWidthInPixels > 0 && mainBlockHeightInTiles > 0) {
+                    DEBUG(std::cout << "+ Processing right stripe of tiles at " << mainBlockWidthInTiles * TILE_SIZE << ", " << 0 << "..."
+                                    << std::flush;);
+                    xTileIncrement = 1;
+                    if (maxTilesAtATime > mainBlockHeightInTiles)
+                        yTileIncrement = mainBlockHeightInTiles;
+                    else
+                        yTileIncrement = maxTilesAtATime;
+                    ReadRotateWriteFullDepth(standardSlice, rotatedSlice, s, mainBlockWidthInTiles * TILE_SIZE, 0, width,
+                        mainBlockHeightInTiles * TILE_SIZE, rightStripeWidthInPixels, yTileIncrement * TILE_SIZE);
+                    tileCount += mainBlockHeightInTiles;
+                    PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                }
+
+                // Bottom-right partial tile that remains
+                DEBUG(std::cout << "+ Processing remainder partial tile in bottom right " << mainBlockWidthInTiles * TILE_SIZE << ", "
+                                << mainBlockHeightInTiles * TILE_SIZE << "..." << std::flush;);
+                ReadRotateWriteFullDepth(standardSlice, rotatedSlice, s, mainBlockWidthInTiles * TILE_SIZE,
+                    mainBlockHeightInTiles * TILE_SIZE, width, height, rightStripeWidthInPixels, bottomStripeHeightInPixels);
+                tileCount++;
                 PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                PROGRESS(std::endl);
             }
-            
-            // Bottom-right partial tile that remains
-            DEBUG(std::cout << "+ Processing remainder partial tile in bottom right " << mainBlockWidthInTiles * TILE_SIZE << ", " << mainBlockHeightInTiles * TILE_SIZE << "..." << std::flush;);
-            ReadRotateWrite(standardSlice, rotatedSlice, s, mainBlockWidthInTiles * TILE_SIZE, mainBlockHeightInTiles * TILE_SIZE, width,
-                height, rightStripeWidthInPixels, bottomStripeHeightInPixels);
-            tileCount++;
-            PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
-            PROGRESS(std::endl);
         }
-        
         TIMER(timer.start("Free"););
         DEBUG(std::cout << "Freeing memory from main and rotated dataset slices... " << std::endl;);
         delete[] standardSlice;
         delete[] rotatedSlice;
     }
 }
-void SmartConverter::ReadRotateWrite(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart,
+void SmartConverter::ReadRotateWriteFullDepth(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart,
     hsize_t xLimit, hsize_t yLimit, hsize_t xIncrement, hsize_t yIncrement) {
     
     for (hsize_t yOffset = yStart; yOffset < yLimit; yOffset += yIncrement) {
@@ -528,3 +549,86 @@ void SmartConverter::ReadRotateWrite(float* standardSliceToRead, float* rotatedS
         }
     }
 }
+
+//TODO: could probably generalize this function with the full depth version
+void SmartConverter::ReadRotateWritePartialDepth(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart, hsize_t zStart,
+    hsize_t xLimit, hsize_t yLimit, hsize_t zLimit, hsize_t xIncrement, hsize_t yIncrement, hsize_t zIncrement) {
+    for (hsize_t zOffset = zStart; zOffset < zLimit; zOffset += zIncrement) {
+        // If remaining slices are less than the increment, adjust the size of the processed slice
+        hsize_t zSize = zOffset + zIncrement > zLimit ? zLimit - zOffset : zIncrement;
+        for (hsize_t yOffset = yStart; yOffset < yLimit; yOffset += yIncrement) {
+            // If remaining rows are less than the increment, adjust the size of the processed slice
+            hsize_t ySize = yOffset + yIncrement > yLimit ? yLimit - yOffset : yIncrement;
+            for (hsize_t xOffset = xStart; xOffset < xLimit; xOffset += xIncrement) {
+                // If remaining columns are less than the increment, adjust the size of the processed slice
+                hsize_t xSize = xOffset + xIncrement > xLimit ? xLimit - xOffset : xIncrement;
+                // read tile slice
+                DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+                TIMER(timer.start("Read"););
+
+                auto standardMemDims = trimAxes({1, zSize, ySize, xSize}, N);
+                auto standardCount = trimAxes({1, zSize, ySize, xSize}, N);
+                auto standardStart = trimAxes({s, zOffset, yOffset, xOffset}, N);
+
+                readHdf5Data(standardDataSet, standardSliceToRead, standardMemDims, standardCount, standardStart);
+
+                // rotate tile slice
+                DEBUG(std::cout << " Calculating rotation..." << std::flush;);
+                TIMER(timer.start("Rotation"););
+
+                for (hsize_t i = 0; i < zSize; i++) {
+                    for (hsize_t j = 0; j < ySize; j++) {
+                        for (hsize_t k = 0; k < xSize; k++) {
+                            auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                            auto& val = standardSliceToRead[sourceIndex];
+
+                            // rotation
+                            auto destIndex = i + zSize * j + (ySize * zSize) * k;
+                            rotatedSliceToWrite[destIndex] = val;
+                        }
+                    }
+
+                    // A separate pass over the same slice depth-last
+                    DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
+                    TIMER(timer.start("Z statistics"););
+
+                    for (hsize_t j = 0; j < ySize; j++) {
+                        for (hsize_t k = 0; k < xSize; k++) {
+                            StatsCounter counterZ;
+                            auto indexZ = xOffset + k + width * (yOffset + j);
+
+                            for (hsize_t i = 0; i < zSize; i++) {
+                                auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                                auto& val = standardSliceToRead[sourceIndex];
+
+                                if (std::isfinite(val)) {
+                                    // Not lazy; too much risk of encountering an ascending / descending sequence.
+                                    counterZ.accumulateFinite(val);
+                                } else {
+                                    counterZ.accumulateNonFinite();
+                                }
+                            }
+
+                            statsZ.accumulateStatsFromCounter(indexZ, counterZ);
+                        }
+                    }
+                }
+
+                // write tile slice
+                DEBUG(std::cout << " Writing rotated dataset..." << std::endl;);
+                TIMER(timer.start("Write"););
+
+                auto swizzledMemDims = trimAxes({1, xSize, ySize, zSize}, N);
+                auto swizzledCount = trimAxes({1, xSize, ySize, zSize}, N);
+                auto swizzledStart = trimAxes({s, xOffset, yOffset, zOffset}, N);
+
+                writeHdf5Data(swizzledDataSet, rotatedSliceToWrite, swizzledMemDims, swizzledCount, swizzledStart);
+
+
+            }
+        }
+    }
+    DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
+    // write Z statistics
+    statsZ.write({height, width}, {1, height, width}, {s, 0, 0});
+}
diff --git a/src/Stats.h b/src/Stats.h
index ca8449b..13f55b4 100644
--- a/src/Stats.h
+++ b/src/Stats.h
@@ -87,6 +87,16 @@ struct Stats {
         counter.nanCount += nanCounts[index];
     }
     
+    void accumulateStatsFromCounter(hsize_t index, const StatsCounter& counter) {
+        if (std::isfinite(counter.maxVal)) {
+            minVals[index] = fmin(counter.minVal, minVals[index]);
+            maxVals[index] = fmax(counter.maxVal, maxVals[index]);
+            sums[index] += counter.sum;
+            sumsSq[index] += counter.sumSq;
+        }
+        nanCounts[index] += counter.nanCount;
+    }
+    
     void copyStatsFromCounter(hsize_t index, hsize_t totalVals, const StatsCounter& counter) {
         if ((hsize_t)counter.nanCount == totalVals) {
             minVals[index] = NAN;