diff --git a/CMakeLists.txt b/CMakeLists.txt
index 292a228..41b3d0b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,7 +52,8 @@ set(SOURCE_FILES
     src/Converter.cc
     src/FastConverter.cc
     src/SlowConverter.cc
-    src/Util.cc)
+    src/SmartConverter.cc
+    src/Util.cc )
 
 add_executable(fits2idia ${SOURCE_FILES})
 target_link_libraries(fits2idia ${LINK_LIBS})
diff --git a/README.md b/README.md
index f3788d6..4732e42 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,7 @@ important are:
 ```
 -o      Output filename
 -s      Use slower but less memory-intensive method (enable if memory allocation fails)
+-r      Use smart mode with an optional indicated size of max memory allocation in MB (example: -r2000) 
 -p      Print progress output (by default the program is silent)
 -m      Report predicted memory usage and exit without performing the conversion
 -z      Include calculation of mipmaps along 3rd axis of dataset
diff --git a/scripts/convertertest.py b/scripts/convertertest.py
old mode 100755
new mode 100644
index 6f630bf..4983158
--- a/scripts/convertertest.py
+++ b/scripts/convertertest.py
@@ -44,7 +44,7 @@ def compare_fits_hdf5(fitsname, hdf5name, zmips=False):
 
     axes = {v: k for k, v in enumerate(reversed(axis_names))}
     dims = {v: fitsdata.shape[k] for k, v in enumerate(reversed(axis_names))}
-    
+
     width, height, depth, stokes = dims["X"], dims["Y"], dims.get("Z", 1), dims.get("W", 1)
 
     # CHECK SWIZZLES
@@ -57,7 +57,7 @@ def compare_fits_hdf5(fitsname, hdf5name, zmips=False):
         swizzled_name = "ZYXW"
 
     if swizzled_name:
-        swizzled_shape = tuple(dims[a] for a in reversed(swizzled_name))    
+        swizzled_shape = tuple(dims[a] for a in reversed(swizzled_name))
         assert swizzled_name in hdf5file["0/SwizzledData"], "No swizzled dataset found."
         assert hdf5file["0/SwizzledData"][swizzled_name].shape == swizzled_shape, "Swizzled dataset has incorrect dimensions."
 
@@ -68,34 +68,34 @@ def compare_fits_hdf5(fitsname, hdf5name, zmips=False):
     if ndim >= 3 and dims["Z"] > 1:
         stats.append("Z")
         stats.append("XYZ")
-        
+
     for s in stats:
         assert s in hdf5file["0/Statistics"], "%s statistics missing." % s
-        
+
         sdata = hdf5file["0/Statistics"][s]
         stats_axis = tuple(axes[a] for a in s)
-        
+
         # CHECK BASIC STATS
-        
+
         def assert_close(stat, func, data=fitsdata):
             if stat in sdata:
                 assert_allclose(sdata[stat], func(data, axis=stats_axis), rtol=1e-5, err_msg = "%s/%s is incorrect. DIFF: %r" % (s, stat, func(data, axis=stats_axis) - sdata[stat]))
-        
+
         assert_close("SUM", np.nansum, fitsdata.astype(np.float64))
         assert_close("SUM_SQ", np.nansum, fitsdata.astype(np.float64)**2)
         assert_close("MEAN", np.nanmean)
         assert_close("MIN", np.nanmin)
         assert_close("MAX", np.nanmax)
-        
+
         assert (np.count_nonzero(np.isnan(fitsdata), axis=stats_axis) == sdata["NAN_COUNT"]).all(), "%s/NAN_COUNT is incorrect." % s
-        
+
         if s in ["XY", "XYZ"]:
             # CHECK HISTOGRAMS
-            
+
             hist = np.array(sdata["HISTOGRAM"])
             num_bins = int(max(np.sqrt(width * height), 2))
             d = np.array(hdf5data)
-            
+
             # Note: we produce a zero histogram for datasets with only one not-nan value. Numpy changes the bounds when calculating the histogram for a single value, so it always bins it somewhere.
             if ndim == len(s):
                 d = d[~np.isnan(d)]
@@ -106,17 +106,17 @@ def assert_close(stat, func, data=fitsdata):
                 rest_shape = tuple(dims[a] for a in reversed(axis_names) if a not in s)
                 rest_merged_shape = np.multiply.reduce(rest_shape)
                 hist_shape = rest_shape + (num_bins,)
-                                            
+
                 reference = np.apply_along_axis(lambda a: np.histogram(a[~np.isnan(a)].astype(np.float64), bins=num_bins)[0] if a[~np.isnan(a)].size > 1 else np.zeros(num_bins), 1, d.reshape(rest_merged_shape, stats_merged_shape)).reshape(hist_shape)
-            
+
             diff = reference - hist
-            
+
             assert hist.shape == reference.shape, "%s histogram shape %r does not match expected shape %r" % (s, hist.shape, reference.shape)
             # Note: we can't replicate the converter's binning exactly because of a precision issue, so there are occasional off-by-one bin increments.
             assert diff.sum() == 0 and diff[diff != 0].size / diff.size < 0.01, "Too many %s histogram values do not match expected values.\nSum of difference: %d\nDifference:\n%s" % (s, diff.sum(), pprint_sparse_diff(hist, reference))
-    
+
     # CHECK MIPMAPS
-    
+
     if "MipMaps" in hdf5file["0"]:
         for mname, mipmap in sorted(hdf5file["0/MipMaps"]["DATA"].items(), key=lambda x: x[1].size, reverse=True):
             if (re.search("_XYZ_", mname)):
@@ -146,7 +146,7 @@ def assert_close(stat, func, data=fitsdata):
                 assert mdepth == np.ceil(depth / zfactor)
 
             # check mipmap contents
-            
+
             def assert_mipmap_channel_equal(name, channel, d, got):
                 got = np.array(got)
                 if zmips and len(mipmap.shape) > 2:
@@ -154,7 +154,7 @@ def assert_mipmap_channel_equal(name, channel, d, got):
                 else:
                     expected = np.array([[np.nanmean(d[y*xyfactor:(y+1)*xyfactor, x*xyfactor:(x+1)*xyfactor]) for x in range(mwidth)] for y in range(mheight)])
                 assert_allclose(expected, got, rtol=1e-5, atol=1e-7, equal_nan=True, err_msg = "Mipmap %s channel %r is incorrect. \nEXPECTED:\n%r\nGOT:\n%r\nDIFF:\n%s" % (name, channel, expected, got, pprint_sparse_diff(expected, got)))
-            
+
             d = np.array(hdf5data)
             rest = mipmap.shape[:-3] if zmips else mipmap.shape[:-2]
 
@@ -176,17 +176,17 @@ def compare_hdf5_hdf5(file1, file2, fail_msg, ignore_converter_attrs=True):
     h5diff = subprocess.run(["h5diff", file1, file2], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     if h5diff.returncode == 0:
         return
-    
+
     if not ignore_converter_attrs:
         assert False, fail_msg
     else:
         ignore_attributes = ("SCHEMA_VERSION", "HDF5_CONVERTER", "HDF5_CONVERTER_VERSION")
-        
+
         output = h5diff.stdout.decode('ascii')
-        
+
         for attribute in ignore_attributes:
             output = re.sub(r"attribute: <%s of </0>> and <%s of </0>>\n\d+ differences found\n" % (attribute, attribute), "", output)
-            
+
         if output:
             print(output)
             print("(Permitted converter version attributes have been ignored.)")
@@ -204,68 +204,89 @@ def make_image(outfile, *dims, **params):
                 cmd.append(str(values))
     cmd.append("--")
     cmd.extend(str(d) for d in dims)
-    
+
     print(*cmd)
-    
+
     result = subprocess.run(cmd)
     assert result.returncode == 0, "Image generation failed."
-    
-def convert(infile, outfile, executable, slow=False, zMips=False):
+
+def convert(infile, outfile, executable, slow=False, smart=False, z_mips=False, mem_limit_in_mb=200):
     cmd = [executable]
     if slow:
+        if smart:
+            raise "Only slow or smart can be selected at once."
         cmd.append("-s")
-    if zMips:
+    if smart:
+        cmd.append("-r" + str(mem_limit_in_mb))
+    if z_mips:
         cmd.append("-z")
     cmd.extend(["-o", outfile, infile])
-    
+
     print(*cmd)
-    
+
     result = subprocess.run(cmd)
     assert result.returncode == 0, "Conversion failed."
-    
-def time(infile, executable, slow=False):
+
+def time(infile, executable, slow=False, smart=False, mem_limit_in_mb=4000):
     os.system("sudo sh -c 'sync && echo 3 > /proc/sys/vm/drop_caches'")
-    
+
     cmd = [executable]
     if slow:
         cmd.append("-s")
     cmd.extend(["-o", "TIMED.hdf5", infile])
-    
+
+    if smart:
+        cmd.append("-r" + str(mem_limit_in_mb))
+
     print(*cmd)
-    
+
     start = timer()
     result = subprocess.run(cmd)
     end = timer()
-    
+
     if result.returncode > 0:
         return None
 
     subprocess.run(["rm", "test.fits", "TIMED.hdf5"])
-        
+
     return end - start
 
 def test_converter_correctness(infile, new_converter):
     convert(infile, "FAST.hdf5", new_converter)
     convert(infile, "SLOW.hdf5", new_converter, True)
-    
+    convert(infile, "SMART.hdf5", new_converter, smart=True, mem_limit_in_mb=200)
+
     # Do this first so that we fail more quickly
     compare_hdf5_hdf5("FAST.hdf5", "SLOW.hdf5", "Fast and slow versions differ.", False)
-    
+
+    compare_hdf5_hdf5("FAST.hdf5", "SMART.hdf5", "Fast and smart versions differ.", False)
+
+    compare_hdf5_hdf5("SLOW.hdf5", "SMART.hdf5", "Slow and smart versions differ.", False)
+
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=RuntimeWarning)
         compare_fits_hdf5("test.fits", "FAST.hdf5")
-    
-    subprocess.run(["rm", "test.fits", "FAST.hdf5", "SLOW.hdf5"])
+
+    subprocess.run(["rm", "test.fits", "FAST.hdf5", "SLOW.hdf5", "SMART.hdf5"])
 
 def test_new_old_converter(infile, slow, old_converter, new_converter):
     convert(infile, "OLD.hdf5", old_converter, slow)
     convert(infile, "NEW.hdf5", new_converter, slow)
-    
+
     compare_hdf5_hdf5("OLD.hdf5", "NEW.hdf5", "Old and new versions differ.", True)
 
+# Test the smart converter for several levels of memory limits
+def test_smart_converter_levels(infile, executable):
+    for mem_limit_in_mb in [0, 5, 50, 100, 500]:
+        out_file = "SMART_" + str(mem_limit_in_mb) + "_.hdf5"
+        convert(infile, out_file, executable, smart=True, mem_limit_in_mb=mem_limit_in_mb)
+        if mem_limit_in_mb > 0:
+            compare_hdf5_hdf5("SMART_0_.hdf5", out_file, "Smart converter with memory limit %dMB differs from smart converter with no limit." % mem_limit_in_mb, True)
+    compare_fits_hdf5(infile, "SMART_0_.hdf5")
+
 def small_nans_image_set():
     image_set = []
-    
+
     for N in (2, 3, 4):
         for nans in itertools.chain((None, ("image",)), itertools.chain.from_iterable(itertools.combinations(("pixel", "row", "column", "channel", "stokes"), n) for n in range(1, 5+1))):
             for nan_density in (0, 33, 66, 100):
@@ -274,12 +295,12 @@ def small_nans_image_set():
                     dims.append(random.randint(50, 100))
                 if N > 3:
                     dims.append(random.randint(1, 4))
-                    
+
                 params = {
                     "--nans": nans,
                     "--nan-density": nan_density
                 }
-                
+
                 image_set.append((dims, params))
     return image_set
 
@@ -295,7 +316,7 @@ def small_dims_image_set():
 
 def large_mipmap_image_set():
     image_set = []
-    
+
     # A few bigger images to test multiple mipmaps
     for dims in ((5000, 200), (5000, 200, 10), (5000, 200, 10, 2)):
         for nans in (("pixel",),):
@@ -304,7 +325,22 @@ def large_mipmap_image_set():
                     "--nans": nans,
                     "--nan-density": nan_density
                 }
-                
+
+                image_set.append((dims, params))
+    return image_set
+
+def depth_mipmap_image_set():
+    image_set = []
+
+    # A few 3d images to test z mipmaps
+    for dims in ((130, 130), (130, 130, 130), (130, 130, 130, 2)):
+        for nans in (("pixel",),):
+            for nan_density in (50,):
+                params = {
+                    "--nans": nans,
+                    "--nan-density": nan_density
+                }
+
                 image_set.append((dims, params))
     return image_set
 
@@ -380,27 +416,27 @@ def dummy_image_set():
 
 def test_speed(args, *image_sets):
     executables = [args.executable]
-    if "compare" in args:
+    if args.compare is not None:
         executables.append(args.compare)
-    
+
     times = defaultdict(lambda: defaultdict(list))
-    
+
     for i in range(args.repeat):
         for executable in executables:
             for image_set in image_sets:
                 for dims, _ in image_set: # we only care about the dimensions
                     make_image("test.fits", *dims)
-                    t = time("test.fits", executable, args.slow)
+                    t = time("test.fits", executable, args.slow, args.smart)
                     if t is not None:
                         times[dims][executable].append(t)
-    
+
     winners = {}
     xs = set()
     zs = set()
-    
+
     print("Image dimensions", *executables, sep='\t')
     print()
-    
+
     for dims in sorted(times):
         best = [np.min(times[dims][e]) for e in executables]
         print(*dims, *best, sep='\t')
@@ -408,17 +444,17 @@ def test_speed(args, *image_sets):
         xs.add(x)
         zs.add(z)
         winners[(x, z)] = "A" if min(best) == best[0] else "B"
-    
+
     if len(executables) > 1:
         xs = sorted(xs)
         zs = sorted(zs)
-        
+
         for label, path in zip(["A", "B"], executables):
             print("%s: %s" % (label, path))
         print()
-        
+
         print('X \ Z', *zs, sep='\t')
-        
+
         for x in xs:
             print(x, end='\t')
             for z in zs:
@@ -438,14 +474,15 @@ def test_zmips_converter(infile, new_converter):
     parser.add_argument('-c', '--compare', help='A path to another converter executable to use as a reference. If a reference converter is given, random files converted using the fast algorithm with both converters will be compared to each other. If none is given, the output of the converter at the default path will be checked for correctness (THIS IS SLOW), and its fast and slow algorithm outputs will be compared for consistency.')
     parser.add_argument('-t', '--time', action='store_true', help='Time the converter(s) instead of checking the output.')
     parser.add_argument('-s', '--slow', action='store_true', help='Use the slow converter versions when comparing converters or timing converter(s).')
+    parser.add_argument('-m', '--smart', action='store_true', help='Use the smart converter versions when comparing converters or timing converter(s).')
     parser.add_argument('-i', '--image-set', nargs="+", help="The image set(s) to use. Any combination of %s. By default a small dummy set is used." % ", ".join(repr(o) for o in IMAGE_SETS) , default=["DUMMY"])
     parser.add_argument('-r', '--repeat', type=int, help="The number of times to repeat timed conversions (default: 3).", default=3)
     parser.add_argument('-z', '--zmips', action='store_true', help='Test if HDF5 conversion for depth MipMaps produces valid result.')
     parser.add_argument("executable", help="The path to the executable to test.")
     args = parser.parse_args()
-    
+
     image_sets = (IMAGE_SETS[i] for i in args.image_set)
-    
+
     if args.time:
         test_speed(args, *image_sets, args.executable)
     else:
@@ -456,5 +493,7 @@ def test_zmips_converter(infile, new_converter):
                     test_new_old_converter("test.fits", args.slow, args.compare, args.executable)
                 elif args.zmips:
                     test_zmips_converter("test.fits", args.executable);
+                elif args.smart:
+                    test_smart_converter_levels("test.fits", args.executable)
                 else:
                     test_converter_correctness("test.fits", args.executable)
diff --git a/src/Converter.cc b/src/Converter.cc
index b011fc0..02699a4 100644
--- a/src/Converter.cc
+++ b/src/Converter.cc
@@ -52,9 +52,11 @@ Converter::~Converter() {
     closeFitsFile(inputFilePtr);
 }
 
-std::unique_ptr<Converter> Converter::getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool progress, bool zMips) {
+std::unique_ptr<Converter> Converter::getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool smart, bool progress, bool zMips, int memoryLimitInMb = 0) {
     if (slow) {
         return std::unique_ptr<Converter>(new SlowConverter(inputFileName, outputFileName, progress, zMips));
+    } else if (smart && memoryLimitInMb > 0) {
+        return std::unique_ptr<Converter>(new SmartConverter(inputFileName, outputFileName, progress, zMips, memoryLimitInMb));
     } else {
         return std::unique_ptr<Converter>(new FastConverter(inputFileName, outputFileName, progress, zMips));
     }
diff --git a/src/Converter.h b/src/Converter.h
index 78a45ad..a3abebc 100644
--- a/src/Converter.h
+++ b/src/Converter.h
@@ -11,6 +11,7 @@
 #include "MipMap.h"
 #include "Timer.h"
 #include "Util.h"
+#include "omp.h"
 
 struct MemoryUsage {
     MemoryUsage() : total(0) {}
@@ -24,9 +25,11 @@ class Converter {
 public:
     Converter() {}
     Converter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
+    Converter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips, int memoryLimitInMb);
     virtual ~Converter();
     
-    static std::unique_ptr<Converter> getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool progress, bool zMips);
+    static std::unique_ptr<Converter> getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool smart, bool progress, bool zMips);
+    static std::unique_ptr<Converter> getConverter(std::string inputFileName, std::string outputFileName, bool slow, bool smart, bool progress, bool zMips, int memoryLimitInMb);
     void convert();
     void reportMemoryUsage();
     virtual MemoryUsage calculateMemoryUsage() = 0;
@@ -83,6 +86,23 @@ class FastConverter : public Converter {
 };
 
 
+class SmartConverter : public Converter {
+public:
+    SmartConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
+    SmartConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMip, int memoryLimitInMb);
+    MemoryUsage calculateMemoryUsage() override;
+    
+private:
+    int memoryLimitInMb;
+    
+protected:
+    void copyAndCalculate() override;
+    void ReadRotateWriteFullDepth(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart,
+        hsize_t xLimit, hsize_t yLimit, hsize_t xIncrement, hsize_t yIncrement);
+    void ReadRotateWritePartialDepth(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart,
+        hsize_t zStart, hsize_t xLimit, hsize_t yLimit, hsize_t zLimit, hsize_t xIncrement, hsize_t yIncrement, hsize_t zIncrement);
+};
+
 class SlowConverter : public Converter {
 public:
     SlowConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMips);
diff --git a/src/FastConverter.cc b/src/FastConverter.cc
index 325eeeb..3401678 100644
--- a/src/FastConverter.cc
+++ b/src/FastConverter.cc
@@ -75,10 +75,12 @@ void FastConverter::copyAndCalculate() {
 
         // First loop calculates stats for each XY slice and rotates the dataset
         
-#pragma omp parallel for
-        for (hsize_t i = 0; i < depth; i++) {
+        hsize_t i;
+        StatsCounter counterXY;
+#pragma omp parallel for default(none) private(i, counterXY) shared(channelProgressStride, std::cout)
+        for (i = 0; i < depth; i++) {
             PROGRESS_DECIMATED(i, channelProgressStride, "|");
-            StatsCounter counterXY;
+            counterXY.reset();
             
             auto& indexXY = i;
             std::function<void(float)> accumulate;
@@ -114,6 +116,7 @@ void FastConverter::copyAndCalculate() {
             }
             
             // Final correction of XY min and max
+
             statsXY.copyStatsFromCounter(indexXY, height * width, counterXY);
         }
         
@@ -140,8 +143,9 @@ void FastConverter::copyAndCalculate() {
             PROGRESS("\tZ stats\t\t");
             TIMER(timer.start("Z statistics"););
 
-#pragma omp parallel for
-            for (hsize_t j = 0; j < height; j++) {
+            hsize_t j;
+#pragma omp parallel for default(none) private(j) shared(pixelProgressStride, std::cout)
+            for (j = 0; j < height; j++) {
                 for (hsize_t k = 0; k < width; k++) {
                     StatsCounter counterZ;
                     
@@ -188,8 +192,8 @@ void FastConverter::copyAndCalculate() {
         statsXY.clearHistogramBuffers();
         statsXYZ.clearHistogramBuffers();
 
-#pragma omp parallel for
-        for (hsize_t i = 0; i < depth; i++) {
+#pragma omp parallel for default(none) private(i) shared(channelProgressStride, std::cout, cubeHist, cubeMin, cubeRange)
+        for (i = 0; i < depth; i++) {
             PROGRESS_DECIMATED(i, channelProgressStride, "|");
             
             auto& indexXY = i;
@@ -277,8 +281,9 @@ void FastConverter::copyAndCalculate() {
         PROGRESS("\tMipmaps\t\t");
         TIMER(timer.start("Mipmaps"););
         
-#pragma omp parallel for
-        for (hsize_t c = 0; c < depth; c++) {
+        hsize_t c;
+#pragma omp parallel for default(none) private(c) shared(channelProgressStride, std::cout)
+        for (c = 0; c < depth; c++) {
             PROGRESS_DECIMATED(c, channelProgressStride, "|");
             for (hsize_t y = 0; y < height; y++) {
                 for (hsize_t x = 0; x < width; x++) {
diff --git a/src/MipMap.h b/src/MipMap.h
index c73f694..17a5d43 100644
--- a/src/MipMap.h
+++ b/src/MipMap.h
@@ -19,11 +19,16 @@ struct MipMap {
     void createBuffers(std::vector<hsize_t>& bufferDims);
     
     void accumulate(double val, hsize_t x, hsize_t y, hsize_t z) {
-        hsize_t mipIndex = (z / mipZ) * width * height + (y / mipXY) * width + (x / mipXY);
+        hsize_t mipIndex;
+        XYZToMipIndex(x, y, z, mipIndex);
         vals[mipIndex] += val;
         count[mipIndex]++;
     }
     
+    void XYZToMipIndex(hsize_t x, hsize_t y, hsize_t z, hsize_t& mipIndex) {
+        mipIndex = (z / mipZ) * width * height + (y / mipXY) * width + (x / mipXY);
+    }
+    
     void calculate() {
         for (hsize_t mipIndex = 0; mipIndex < bufferSize; mipIndex++) {
             if (count[mipIndex]) {
@@ -68,7 +73,9 @@ struct MipMaps {
     void createBuffers(const std::vector<hsize_t>& standardBufferDims);
     
     void accumulate(double val, hsize_t x, hsize_t y, hsize_t z) {
-        for (auto& mipMap : mipMaps) {
+        int n = mipMaps.size();
+        for (int i = 0; i < n; i++) {
+            auto& mipMap = mipMaps[i];
             mipMap.accumulate(val, x, y, z);
         }
     }
diff --git a/src/SmartConverter.cc b/src/SmartConverter.cc
new file mode 100644
index 0000000..064fa20
--- /dev/null
+++ b/src/SmartConverter.cc
@@ -0,0 +1,634 @@
+/* This file is part of the FITS to IDIA file format converter: https://github.com/idia-astro/fits2idia
+   Copyright 2019, 2020, 2021 the Inter-University Institute for Data Intensive Astronomy (IDIA)
+   SPDX-License-Identifier: GPL-3.0-or-later
+*/
+
+#include "Converter.h"
+
+SmartConverter::SmartConverter(std::string inputFileName, std::string outputFileName, bool progress, bool zMip, int memoryLimitInMb = 0) : Converter(inputFileName, outputFileName, progress, zMips)
+{
+    this->memoryLimitInMb = memoryLimitInMb;
+}
+
+MemoryUsage SmartConverter::calculateMemoryUsage() {
+    MemoryUsage m;
+
+    m.sizes["Main dataset"] = height * width * sizeof(float);
+    m.sizes["Mipmaps"] = MipMaps::size(standardDims, {1, height, width}, zMips);
+    m.sizes["XY stats"] = Stats::size({depth}, numBins);
+    
+    if (depth > 1) {
+        m.sizes["Rotation"] = 2 * product(trimAxes({stokes, depth, TILE_SIZE, TILE_SIZE}, N)) * sizeof(float);
+        m.sizes["XYZ stats"] = Stats::size({}, numBins, depth);
+        m.sizes["Z stats"] = Stats::size({TILE_SIZE, TILE_SIZE});
+    }
+    
+    for (auto& kv : m.sizes) {
+        m.total += kv.second;
+    }
+    
+    if (depth > 1) {
+        m.total -= std::min(m.sizes["Main dataset"], m.sizes["Rotation"] + m.sizes["Z stats"]);
+        m.note = " (Main dataset and slices for rotation and Z statistics are not allocated at the same time.)";
+    }
+
+    return m;
+}
+
+void SmartConverter::copyAndCalculate() {
+    const hsize_t channelProgressStride = std::max((hsize_t)1, (hsize_t)(depth / 100));
+    hsize_t numTiles = std::ceil(width / TILE_SIZE) * std::ceil(height / TILE_SIZE);
+    const hsize_t tileProgressStride = std::max((hsize_t)1, (hsize_t)(numTiles / 100));
+    
+    // 32 has produced best results in testing
+    const hsize_t REGION_MULTIPLIER = 32;
+    
+    // Allocate one batch of slices at a time, and no swizzled data.
+    // Batch size in slices determined by memory limit.
+  
+    hsize_t memoryLimitInBytes = memoryLimitInMb * 1024 * 1024;
+    hsize_t sliceSizeInPixels = height * width;
+    hsize_t memoryLimitInPixels = memoryLimitInBytes / sizeof(float);
+    hsize_t memoryLimitInSlices = std::ceil((float)memoryLimitInPixels / (float)sliceSizeInPixels);
+    
+    // Increment over the full depth if the memory size limit is larger than the depth
+    hsize_t sliceIncrement = std::min(memoryLimitInSlices, depth);
+    
+    hsize_t cubeSize = height * width * sliceIncrement;
+    TIMER(timer.start("Allocate"););
+    standardCube = new float[cubeSize];
+    
+    // Allocate one stokes of stats at a time
+    statsXY.createBuffers({depth}, height);
+    
+    if (depth > 1) {
+        statsXYZ.createBuffers({}, height);
+    }
+    
+    mipMaps.createBuffers({1, height, width});
+
+    std::vector<hsize_t> count = trimAxes({1, sliceIncrement, height, width}, N);
+    std::vector<hsize_t> memDims = {sliceIncrement, height, width};
+    
+    
+    std::string timerLabelStatsMipmaps = depth > 1 ? "XY and XYZ statistics and mipmaps" : "XY statistics and mipmaps";
+
+    
+    for (unsigned int s = 0; s < stokes; s++) {
+        DEBUG(std::cout << "Processing Stokes " << s << "... " << std::endl;);
+        PROGRESS("Stokes " << s << ":" << std::endl);
+        
+        PROGRESS("\tMain loop\t");
+        
+        StatsCounter counterXYZ;
+        
+        // "working" versions of these variables are used to handle the last increment of slices that may be smaller than the batch size
+        hsize_t workingIncrement = sliceIncrement;
+        hsize_t workingCubeSize = cubeSize;
+        std::vector<hsize_t> workingCount = count;
+        std::vector<hsize_t> workingMemDims = memDims;
+        
+        for (hsize_t c = 0; c < depth; c = c + workingIncrement) {
+            hsize_t leftOverSlices = depth - c;
+            size_t actualIncrement = std::min(sliceIncrement, leftOverSlices);
+            
+            // If the last increment is smaller than the batch size, adjust the "working" variables
+            if (actualIncrement < sliceIncrement) {
+                workingIncrement = leftOverSlices;
+                workingCubeSize = height * width * leftOverSlices;
+                workingCount = trimAxes({1, leftOverSlices, height, width}, N);
+                workingMemDims = {leftOverSlices, height, width};
+            }
+            PROGRESS_DECIMATED(c, channelProgressStride, "|");
+            // read one channel
+            DEBUG(std::cout << "+ Processing channel " << c << "... " << std::flush;);
+            DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+            TIMER(timer.start("Read"););
+            readFitsData(inputFilePtr, c, s, workingCubeSize, standardCube);
+            
+            // Write the standard dataset
+            
+            DEBUG(std::cout << " Writing main dataset..." << std::flush;);
+            TIMER(timer.start("Write"););
+            
+            std::vector<hsize_t> start = trimAxes({s, c, 0, 0}, N);
+            writeHdf5Data(standardDataSet, standardCube, workingMemDims, workingCount, start);
+            
+            DEBUG(std::cout << " Accumulating XY stats and mipmaps..." << std::flush;);
+            TIMER(timer.start(timerLabelStatsMipmaps););
+
+            for(int slice = 0; slice < actualIncrement; slice++) {
+            
+            StatsCounter counterXY;
+            
+            auto indexXY = c + slice;
+            std::function<void(float)> accumulate;
+            
+            //
+            
+            auto lazy_accumulate = [&] (float val) {
+                counterXY.accumulateFiniteLazy(val);
+            };
+            
+            auto first_accumulate = [&] (float val) {
+                counterXY.accumulateFiniteLazyFirst(val);
+                accumulate = lazy_accumulate;
+            };
+            
+            accumulate = first_accumulate;
+            
+            int regionIndex;
+            StatsCounter counterRegion;
+            
+            int regionRows = std::ceil((float)height / (float)REGION_MULTIPLIER);
+            int regionCols = std::ceil((float)width / (float)REGION_MULTIPLIER);
+            int cubeSizeInRegions = regionRows * regionCols;
+            
+#pragma omp parallel for default(none) private (regionIndex, counterRegion) shared (slice, standardCube, cubeSizeInRegions, mipMaps, counterXY, cubeSize, REGION_MULTIPLIER)
+            for (regionIndex = 0; regionIndex < cubeSizeInRegions; regionIndex += 1 ) {
+                counterRegion.reset();
+                hsize_t x0,y0,z0;
+                RegionIndexToXYZ(regionIndex, x0, y0, z0, width, height, REGION_MULTIPLIER, REGION_MULTIPLIER,
+                    1); //use index of higher-order mipmap-space to keep lower-order mipmaps thread-safe
+                for (hsize_t y = y0; y < y0 + REGION_MULTIPLIER; y++) {
+                    for (hsize_t x = x0; x < x0 + REGION_MULTIPLIER; x++) {
+                        auto pos = y * width + x + slice * width * height;
+                        if (x >= width || y >= height) {    //check if we are out of bounds
+                            continue;
+                        }
+                        auto& val = standardCube[pos];
+                        if (std::isfinite(val)) {
+                            
+                            // region statistics
+                            counterRegion.accumulateFinite(val);
+                    
+                            // Accumulate mipmaps
+                            mipMaps.accumulate(val, x, y, 0); //This will not conflict with the other threads as regions are separate
+                            
+                        } else {
+                            counterRegion.accumulateNonFinite();
+                        }
+                    }
+                }
+#pragma omp critical
+                counterXY.accumulateFromCounter(counterRegion);      // Accumulate to slice's XY stats from thread-local X stats
+            } // end of region loop
+            
+            // Final correction of XY min and max
+            DEBUG(std::cout << " Final XY stats..." << std::flush;);
+            statsXY.copyStatsFromCounter(indexXY, height * width, counterXY);
+            
+            // Accumulate XYZ statistics
+            if (depth > 1) {
+                DEBUG(std::cout << " Accumulating XYZ stats..." << std::flush;);
+                statsXY.accumulateStatsToCounter(counterXYZ, indexXY);
+            }
+            
+            // Final mipmap calculation
+            DEBUG(std::cout << " Final mipmaps..." << std::flush;);
+            mipMaps.calculate();
+            
+            
+            // Write the mipmaps
+            DEBUG(std::cout << " Writing mipmaps..." << std::flush;);
+            TIMER(timer.start("Write"););
+            mipMaps.write(s, c + slice);
+            
+            // Reset mipmaps before next channel
+            DEBUG(std::cout << " Resetting mipmap objects..." << std::endl;);
+            TIMER(timer.start(timerLabelStatsMipmaps););
+            mipMaps.resetBuffers();
+            
+        }
+        } // end of first channel loop
+        
+        PROGRESS(std::endl);
+        
+        if (depth > 1) {
+            // Final correction of XYZ min and max
+            DEBUG(std::cout << " Final XYZ stats..." << std::flush;);
+            PROGRESS("\tXYZ stats" << std::endl);
+            TIMER(timer.start(timerLabelStatsMipmaps););
+            statsXYZ.copyStatsFromCounter(0, depth * height * width, counterXYZ);
+        }
+        
+        // XY and XYZ histograms
+        // We need a second pass over all channels because we need cube min and max (and channel min and max per channel)
+        // We do the second pass backwards to take advantage of caching
+        DEBUG(std::cout << " Histograms..." << std::endl;);
+        PROGRESS("\tHistograms\t");
+        TIMER(timer.start("Histograms"););
+        
+        double cubeMin;
+        double cubeMax;
+        double cubeRange;
+        bool cubeHist(false);
+        
+        if (depth > 1) {
+            cubeMin = statsXYZ.minVals[0];
+            cubeMax = statsXYZ.maxVals[0];
+            cubeRange = cubeMax - cubeMin;
+            cubeHist = std::isfinite(cubeMin) && std::isfinite(cubeMax) && cubeRange > 0;
+        }
+        
+        statsXY.clearHistogramBuffers();
+        statsXYZ.clearHistogramBuffers();
+        
+        DEBUG(std::cout << "+ Will " << (cubeHist ? "" : "not ") << "calculate cube histogram." << std::endl;);
+        
+        hsize_t startingSlice = depth - workingIncrement;
+        for (hssize_t c = startingSlice; c > -1; c = c - sliceIncrement) {
+            
+            DEBUG(std::cout << "+ Processing channel " << c << "... " << std::flush;);
+            PROGRESS_DECIMATED(c, channelProgressStride, "|");
+            
+            // skip first batch of slices to prevent unnecessary file read
+            if (c < startingSlice) {
+                DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+                TIMER(timer.start("Read"););
+                readFitsData(inputFilePtr, c, s, workingCubeSize, standardCube);
+            }
+            
+            DEBUG(std::cout << " Calculating histogram(s)..." << std::endl;);
+            TIMER(timer.start("Histograms"););
+
+
+        for (hsize_t slice = 0; slice < workingIncrement; slice++) {
+            
+            auto indexXY = c + slice;
+            
+            workingCubeSize = height * width * workingIncrement;
+            
+            double chanMin = statsXY.minVals[indexXY];
+            double chanMax = statsXY.maxVals[indexXY];
+            double chanRange = chanMax - chanMin;
+            
+            bool chanHist(std::isfinite(chanMin) && std::isfinite(chanMax) && chanRange > 0);
+            DEBUG(std::cout << " Will " << (chanHist ? "" : "not ") << "calculate channel histogram." << std::flush;);
+            
+            if (!chanHist && !cubeHist) {
+                continue;
+            }
+            
+            auto doChannelHistogram = [&] (float val, hsize_t offset) {
+                // XY histogram
+                statsXY.accumulatePartialHistogram(val, chanMin, chanRange, offset);
+            };
+            
+            auto doCubeHistogram = [&] (float val, hsize_t offset) {
+                // XYZ histogram
+                statsXYZ.accumulatePartialHistogram(val, cubeMin, cubeRange, offset);
+            };
+            
+            auto doNothing = [&] (float val) {
+                UNUSED(val);
+            };
+            
+            auto doNothingOffset = [&] (float val, hsize_t offset) {
+                UNUSED(val);
+            };
+            
+            std::function<void(float,hsize_t)> channelHistogramFunc = doChannelHistogram;
+            std::function<void(float,hsize_t)> cubeHistogramFunc = doCubeHistogram;
+            
+            if (!chanHist) {
+                channelHistogramFunc = doNothingOffset;
+            }
+            
+            if (!cubeHist) {
+                cubeHistogramFunc = doNothingOffset;
+            }
+            
+            hsize_t y ;
+            
+#pragma omp parallel for default(none) private(y) shared(slice, standardCube, height, width, channelHistogramFunc, cubeHistogramFunc)
+                for (y = 0; y < height; y++) {
+                    for (hsize_t x = 0; x < width; x++) {
+                        auto pos = y * width + x + slice * width * height;
+                        auto& val = standardCube[pos];
+                        if (std::isfinite(val)) {
+                            channelHistogramFunc(val, y);
+                            cubeHistogramFunc(val, y);
+                        }
+                    }
+                } // end of XY loop
+
+                statsXY.consolidateAndClearPartialHistogram(c + slice);
+                statsXYZ.consolidateAndClearPartialHistogram(0);
+            } //for loop of sliceincrement ends here
+            workingIncrement = sliceIncrement; //reset workingIncrement to sliceIncrement for remainder of depth
+        } // end of second channel loop (XY and XYZ histograms)
+        
+        PROGRESS(std::endl);
+        
+        // Write the statistics
+        TIMER(timer.start("Write"););
+        PROGRESS("\tWrite stats & mipmaps" << std::endl);
+                
+        statsXY.write({1, depth}, {s, 0});
+        
+        if (depth > 1) {
+            statsXYZ.write({1}, {s});
+        }
+    
+    } // end of stokes
+    
+    // Free memory
+    DEBUG(std::cout << "Freeing memory from main dataset... " << std::endl;);
+    TIMER(timer.start("Free"););
+    
+    delete[] standardCube;
+    
+    // Swizzle
+    if (depth > 1) {
+        DEBUG(std::cout << "Performing tiled rotation." << std::endl;);
+        PROGRESS("Tiled rotation & Z stats" << std::endl);
+        TIMER(timer.start("Allocate"););
+        
+        hsize_t tileSize = product(trimAxes({stokes, depth, std::min(width, TILE_SIZE) , std::min(height, TILE_SIZE)}, N));
+        
+        float* standardSlice;
+        float* rotatedSlice;
+        
+        
+        // if the memory limit is too low for a single tile at full-depth, handle partial tiles in z-direction
+        if (2 * tileSize > memoryLimitInPixels) {
+            hsize_t maxSlicesAtOnce = memoryLimitInPixels / (2 * std::min(width, TILE_SIZE) * std::min(height, TILE_SIZE));
+            // Allocate memory for main and rotated dataset slices and Z statistics
+            standardSlice = new float[std::min(width, TILE_SIZE) * std::min(height, TILE_SIZE) * maxSlicesAtOnce];
+            rotatedSlice = new float[std::min(width, TILE_SIZE) * std::min(height, TILE_SIZE) * maxSlicesAtOnce];
+            statsZ.createBuffers({width, height});
+            
+            for (unsigned int s = 0; s < stokes; s++) {
+                DEBUG(std::cout << "Processing Stokes " << s << "..." << std::endl;);
+                PROGRESS("\tStokes " << s << "\t");
+
+                hsize_t tileCount(0);
+                ReadRotateWritePartialDepth(standardSlice, rotatedSlice, s, 0, 0, 0, width, height, depth, std::min(width, TILE_SIZE), std::min(height, TILE_SIZE), maxSlicesAtOnce);
+                
+            }
+            // otherwise, handle full-depth groups of tiles at a time that fit within the memory limit
+        } else {
+            hsize_t memoryLimitInTiles =
+                std::floor((float)memoryLimitInPixels / (float)tileSize / 2); // divide by 2 because we have standard and rotated slices
+            float widthInTiles = (float)width / (float)TILE_SIZE;
+            float heightInTiles = (float)height / (float)TILE_SIZE;
+            hsize_t TotalFullTiles = std::ceil(widthInTiles) * std::ceil(heightInTiles);
+            hsize_t maxTilesAtATime = std::min(TotalFullTiles, memoryLimitInTiles);
+
+            // Allocate memory for main and rotated dataset slices and Z statistics
+            standardSlice = new float[tileSize * maxTilesAtATime];
+            rotatedSlice = new float[tileSize * maxTilesAtATime];
+            statsZ.createBuffers({TILE_SIZE, TILE_SIZE});
+
+            for (unsigned int s = 0; s < stokes; s++) {
+                DEBUG(std::cout << "Processing Stokes " << s << "..." << std::endl;);
+                PROGRESS("\tStokes " << s << "\t");
+
+                hsize_t tileCount(0);
+
+                // Calculate excess pixels that won't fit into full tiles
+                hsize_t rightStripeWidthInPixels = width % TILE_SIZE;
+                hsize_t bottomStripeHeightInPixels = height % TILE_SIZE;
+
+                hsize_t mainBlockWidthInTiles = (width - rightStripeWidthInPixels) / TILE_SIZE;
+                hsize_t mainBlockHeightInTiles = (height - bottomStripeHeightInPixels) / TILE_SIZE;
+
+                hsize_t xTileIncrement = 1;
+                hsize_t yTileIncrement = 1;
+
+                // Main block of full tiles
+                if (!(mainBlockWidthInTiles == 0 && mainBlockHeightInTiles == 0)) {
+                    if (maxTilesAtATime < mainBlockWidthInTiles) {
+                        hsize_t possibleIncrement = 1;
+                        while (possibleIncrement < maxTilesAtATime) {
+                            possibleIncrement++;
+                            if (mainBlockWidthInTiles % possibleIncrement == 0) {
+                                xTileIncrement = possibleIncrement;
+                            }
+                        }
+                    } else {
+                        xTileIncrement = mainBlockWidthInTiles;
+                        hsize_t possibleIncrement = 1;
+                        while (possibleIncrement * xTileIncrement < maxTilesAtATime) {
+                            possibleIncrement++;
+                            if (mainBlockHeightInTiles % possibleIncrement == 0) {
+                                yTileIncrement = possibleIncrement;
+                            }
+                        }
+                    }
+                    DEBUG(std::cout << "+ Processing main block tiles at " << 0 << ", " << 0 << "..." << std::flush;);
+                    PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                    ReadRotateWriteFullDepth(standardSlice, rotatedSlice, s, 0, 0, mainBlockWidthInTiles * TILE_SIZE,
+                        mainBlockHeightInTiles * TILE_SIZE, xTileIncrement * TILE_SIZE, yTileIncrement * TILE_SIZE);
+                    tileCount += xTileIncrement * yTileIncrement;
+                }
+
+                // Bottom stripe of partial tiles
+                if (bottomStripeHeightInPixels > 0 && mainBlockWidthInTiles > 0) {
+                    DEBUG(std::cout << "+ Processing bottom stripe of tiles at " << 0 << ", " << mainBlockHeightInTiles * TILE_SIZE << "..."
+                                    << std::flush;);
+                    yTileIncrement = 1;
+                    if (maxTilesAtATime > mainBlockWidthInTiles)
+                        xTileIncrement = mainBlockWidthInTiles;
+                    else
+                        xTileIncrement = maxTilesAtATime;
+                    ReadRotateWriteFullDepth(standardSlice, rotatedSlice, s, 0, mainBlockHeightInTiles * TILE_SIZE,
+                        mainBlockWidthInTiles * TILE_SIZE, height, xTileIncrement * TILE_SIZE, bottomStripeHeightInPixels);
+                    tileCount += mainBlockWidthInTiles;
+                    PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                }
+
+                // Right-side stripe of partial tiles
+                if (rightStripeWidthInPixels > 0 && mainBlockHeightInTiles > 0) {
+                    DEBUG(std::cout << "+ Processing right stripe of tiles at " << mainBlockWidthInTiles * TILE_SIZE << ", " << 0 << "..."
+                                    << std::flush;);
+                    xTileIncrement = 1;
+                    if (maxTilesAtATime > mainBlockHeightInTiles)
+                        yTileIncrement = mainBlockHeightInTiles;
+                    else
+                        yTileIncrement = maxTilesAtATime;
+                    ReadRotateWriteFullDepth(standardSlice, rotatedSlice, s, mainBlockWidthInTiles * TILE_SIZE, 0, width,
+                        mainBlockHeightInTiles * TILE_SIZE, rightStripeWidthInPixels, yTileIncrement * TILE_SIZE);
+                    tileCount += mainBlockHeightInTiles;
+                    PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                }
+
+                // Bottom-right partial tile that remains
+                DEBUG(std::cout << "+ Processing remainder partial tile in bottom right " << mainBlockWidthInTiles * TILE_SIZE << ", "
+                                << mainBlockHeightInTiles * TILE_SIZE << "..." << std::flush;);
+                ReadRotateWriteFullDepth(standardSlice, rotatedSlice, s, mainBlockWidthInTiles * TILE_SIZE,
+                    mainBlockHeightInTiles * TILE_SIZE, width, height, rightStripeWidthInPixels, bottomStripeHeightInPixels);
+                tileCount++;
+                PROGRESS_DECIMATED(tileCount, tileProgressStride, "#");
+                PROGRESS(std::endl);
+            }
+        }
+        TIMER(timer.start("Free"););
+        DEBUG(std::cout << "Freeing memory from main and rotated dataset slices... " << std::endl;);
+        delete[] standardSlice;
+        delete[] rotatedSlice;
+    }
+}
+void SmartConverter::ReadRotateWriteFullDepth(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart,
+    hsize_t xLimit, hsize_t yLimit, hsize_t xIncrement, hsize_t yIncrement) {
+    
+    for (hsize_t yOffset = yStart; yOffset < yLimit; yOffset += yIncrement) {
+        // If remaining rows are less than the increment, adjust the size of the processed slice
+        hsize_t ySize = yOffset + yIncrement > yLimit ? yLimit - yOffset : yIncrement;
+        for (hsize_t xOffset = xStart; xOffset < xLimit; xOffset += xIncrement) {
+            // If remaining columns are less than the increment, adjust the size of the processed slice
+            hsize_t xSize = xOffset + xIncrement > xLimit ? xLimit - xOffset : xIncrement;
+            // read tile slice
+            DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+            TIMER(timer.start("Read"););
+            
+            auto standardMemDims = trimAxes({1, depth, ySize, xSize}, N);
+            auto standardCount = trimAxes({1, depth, ySize, xSize}, N);
+            auto standardStart = trimAxes({s, 0, yOffset, xOffset}, N);
+            
+            readHdf5Data(standardDataSet, standardSliceToRead, standardMemDims, standardCount, standardStart);
+            
+            // rotate tile slice
+            DEBUG(std::cout << " Calculating rotation..." << std::flush;);
+            TIMER(timer.start("Rotation"););
+            
+            hsize_t i;
+#pragma omp parallel for default(none) private (i) shared (depth, xSize, ySize, xStart, yStart, standardSliceToRead, rotatedSliceToWrite)
+            for (i = 0; i < depth; i++) {
+                for (hsize_t j = 0; j < ySize; j++) {
+                    for (hsize_t k = 0; k < xSize; k++) {
+                        auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                        auto& val = standardSliceToRead[sourceIndex];
+                
+                        // rotation
+                        auto destIndex = i + depth * j + (ySize * depth) * k;
+                        rotatedSliceToWrite[destIndex] = val;
+                    }
+                }
+            }
+            
+            // A separate pass over the same slice depth-last
+            DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
+            TIMER(timer.start("Z statistics"););
+            
+            for (hsize_t j = 0; j < ySize; j++) {
+                for (hsize_t k = 0; k < xSize; k++) {
+                    StatsCounter counterZ;
+                    auto indexZ = k + xSize * j;
+                    
+                    for (hsize_t i = 0; i < depth; i++) {
+                        auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                        auto& val = standardSliceToRead[sourceIndex];
+                        
+                        if (std::isfinite(val)) {
+                            // Not lazy; too much risk of encountering an ascending / descending sequence.
+                            counterZ.accumulateFinite(val);
+                        } else {
+                            counterZ.accumulateNonFinite();
+                        }
+                    }
+                    
+                    statsZ.copyStatsFromCounter(indexZ, depth, counterZ);
+                }
+            }
+
+            // write tile slice
+            DEBUG(std::cout << " Writing rotated dataset..." << std::endl;);
+            TIMER(timer.start("Write"););
+            
+            auto swizzledMemDims = trimAxes({1, xSize, ySize, depth}, N);
+            auto swizzledCount = trimAxes({1, xSize, ySize, depth}, N);
+            auto swizzledStart = trimAxes({s, xOffset, yOffset, 0}, N);
+            
+            writeHdf5Data(swizzledDataSet, rotatedSliceToWrite, swizzledMemDims, swizzledCount, swizzledStart);
+            
+            DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
+            // write Z statistics
+            statsZ.write({ySize, xSize}, {1, ySize, xSize}, {s, yOffset, xOffset});
+        }
+    }
+}
+
+//TODO: could probably generalize this function with the full depth version
+void SmartConverter::ReadRotateWritePartialDepth(float* standardSliceToRead, float* rotatedSliceToWrite, unsigned int s, hsize_t xStart, hsize_t yStart, hsize_t zStart,
+    hsize_t xLimit, hsize_t yLimit, hsize_t zLimit, hsize_t xIncrement, hsize_t yIncrement, hsize_t zIncrement) {
+    for (hsize_t zOffset = zStart; zOffset < zLimit; zOffset += zIncrement) {
+        // If remaining slices are less than the increment, adjust the size of the processed slice
+        hsize_t zSize = zOffset + zIncrement > zLimit ? zLimit - zOffset : zIncrement;
+        for (hsize_t yOffset = yStart; yOffset < yLimit; yOffset += yIncrement) {
+            // If remaining rows are less than the increment, adjust the size of the processed slice
+            hsize_t ySize = yOffset + yIncrement > yLimit ? yLimit - yOffset : yIncrement;
+            for (hsize_t xOffset = xStart; xOffset < xLimit; xOffset += xIncrement) {
+                // If remaining columns are less than the increment, adjust the size of the processed slice
+                hsize_t xSize = xOffset + xIncrement > xLimit ? xLimit - xOffset : xIncrement;
+                // read tile slice
+                DEBUG(std::cout << " Reading main dataset..." << std::flush;);
+                TIMER(timer.start("Read"););
+
+                auto standardMemDims = trimAxes({1, zSize, ySize, xSize}, N);
+                auto standardCount = trimAxes({1, zSize, ySize, xSize}, N);
+                auto standardStart = trimAxes({s, zOffset, yOffset, xOffset}, N);
+
+                readHdf5Data(standardDataSet, standardSliceToRead, standardMemDims, standardCount, standardStart);
+
+                // rotate tile slice
+                DEBUG(std::cout << " Calculating rotation..." << std::flush;);
+                TIMER(timer.start("Rotation"););
+
+                for (hsize_t i = 0; i < zSize; i++) {
+                    for (hsize_t j = 0; j < ySize; j++) {
+                        for (hsize_t k = 0; k < xSize; k++) {
+                            auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                            auto& val = standardSliceToRead[sourceIndex];
+
+                            // rotation
+                            auto destIndex = i + zSize * j + (ySize * zSize) * k;
+                            rotatedSliceToWrite[destIndex] = val;
+                        }
+                    }
+
+                    // A separate pass over the same slice depth-last
+                    DEBUG(std::cout << " Calculating Z statistics..." << std::flush;);
+                    TIMER(timer.start("Z statistics"););
+
+                    for (hsize_t j = 0; j < ySize; j++) {
+                        for (hsize_t k = 0; k < xSize; k++) {
+                            StatsCounter counterZ;
+                            auto indexZ = xOffset + k + width * (yOffset + j);
+
+                            for (hsize_t i = 0; i < zSize; i++) {
+                                auto sourceIndex = k + xSize * j + (ySize * xSize) * i;
+                                auto& val = standardSliceToRead[sourceIndex];
+
+                                if (std::isfinite(val)) {
+                                    // Not lazy; too much risk of encountering an ascending / descending sequence.
+                                    counterZ.accumulateFinite(val);
+                                } else {
+                                    counterZ.accumulateNonFinite();
+                                }
+                            }
+
+                            statsZ.accumulateStatsFromCounter(indexZ, counterZ);
+                        }
+                    }
+                }
+
+                // write tile slice
+                DEBUG(std::cout << " Writing rotated dataset..." << std::endl;);
+                TIMER(timer.start("Write"););
+
+                auto swizzledMemDims = trimAxes({1, xSize, ySize, zSize}, N);
+                auto swizzledCount = trimAxes({1, xSize, ySize, zSize}, N);
+                auto swizzledStart = trimAxes({s, xOffset, yOffset, zOffset}, N);
+
+                writeHdf5Data(swizzledDataSet, rotatedSliceToWrite, swizzledMemDims, swizzledCount, swizzledStart);
+
+
+            }
+        }
+    }
+    DEBUG(std::cout << " Writing Z statistics..." << std::endl;);
+    // write Z statistics
+    statsZ.write({height, width}, {1, height, width}, {s, 0, 0});
+}
diff --git a/src/Stats.h b/src/Stats.h
index 6b176d3..13f55b4 100644
--- a/src/Stats.h
+++ b/src/Stats.h
@@ -19,6 +19,14 @@ struct StatsCounter {
         sum += val;
         sumSq += val * val;
     }
+    
+    void reset() {
+        minVal = std::numeric_limits<float>::max();
+        maxVal = -std::numeric_limits<float>::max();
+        sum = 0;
+        sumSq = 0;
+        nanCount = 0;
+    }
 
     void accumulateFiniteLazy(float val) {
         if (val < minVal) {
@@ -40,6 +48,14 @@ struct StatsCounter {
     void accumulateNonFinite() {
         nanCount++;
     }
+    
+    void accumulateFromCounter(StatsCounter otherCounter) {
+        minVal = fmin(minVal, otherCounter.minVal);
+        maxVal = fmax(maxVal, otherCounter.maxVal);
+        sum += otherCounter.sum;
+        sumSq += otherCounter.sumSq;
+        nanCount += otherCounter.nanCount;
+    }
 
     float minVal;
     float maxVal;
@@ -71,6 +87,16 @@ struct Stats {
         counter.nanCount += nanCounts[index];
     }
     
+    void accumulateStatsFromCounter(hsize_t index, const StatsCounter& counter) {
+        if (std::isfinite(counter.maxVal)) {
+            minVals[index] = fmin(counter.minVal, minVals[index]);
+            maxVals[index] = fmax(counter.maxVal, maxVals[index]);
+            sums[index] += counter.sum;
+            sumsSq[index] += counter.sumSq;
+        }
+        nanCounts[index] += counter.nanCount;
+    }
+    
     void copyStatsFromCounter(hsize_t index, hsize_t totalVals, const StatsCounter& counter) {
         if ((hsize_t)counter.nanCount == totalVals) {
             minVals[index] = NAN;
@@ -89,6 +115,7 @@ struct Stats {
     // Histograms
     
     void clearHistogramBuffers();
+    void clearPartialHistogramBuffer();
 
     void accumulateHistogram(float val, double min, double range, hsize_t offset) {
         int binIndex = std::min(numBins - 1, (hsize_t)(numBins * (val - min) / range));
@@ -108,6 +135,15 @@ struct Stats {
         }
     }
     
+    void consolidateAndClearPartialHistogram(hsize_t mainOffset) {
+        for (hsize_t offset = 0; offset < partialHistMultiplier; offset++) {
+            for (hsize_t binIndex = 0; binIndex < numBins; binIndex++) {
+                histograms[mainOffset * numBins + binIndex] += partialHistograms[offset * numBins + binIndex];
+                partialHistograms[offset * numBins + binIndex] = 0;     //reset partial histogram after consolidation
+            }
+        }
+    }
+    
     // Writing
     void write();
     void write(const std::vector<hsize_t>& count, const std::vector<hsize_t>& start);
diff --git a/src/Util.cc b/src/Util.cc
index e338b00..6f2ce6f 100644
--- a/src/Util.cc
+++ b/src/Util.cc
@@ -254,3 +254,11 @@ void readHdf5Data(H5::DataSet& dataset, float* data, const std::vector<hsize_t>&
     }
     dataset.read(data, H5::PredType::NATIVE_FLOAT, memSpace, fileSpace);
 }
+
+void RegionIndexToXYZ(hsize_t mipIndex, hsize_t& x, hsize_t& y, hsize_t& z, hsize_t width, hsize_t height, hsize_t factorX, hsize_t factorY, hsize_t factorZ) {
+    int mipHeight = (height + factorY - 1)/ factorY;      //round result up
+    int mipWidth = (width + factorX - 1) / factorX;       //round result up
+    x = (mipIndex % (mipWidth * mipHeight)) % mipWidth * factorX;
+    y = (mipIndex % (mipWidth * mipHeight)) / mipWidth * factorY;
+    z = mipIndex / (mipWidth * mipHeight) * factorZ;
+}
\ No newline at end of file
diff --git a/src/Util.h b/src/Util.h
index be9519e..3abec82 100644
--- a/src/Util.h
+++ b/src/Util.h
@@ -48,4 +48,7 @@ void writeHdf5Data(H5::DataSet& dataset, int64_t* data, const std::vector<hsize_
 
 void readHdf5Data(H5::DataSet& dataset, float* data, const std::vector<hsize_t>& dims, const std::vector<hsize_t>& count = EMPTY_DIMS, const std::vector<hsize_t>& start = EMPTY_DIMS);
 
+void RegionIndexToXYZ(hsize_t mipIndex, hsize_t& x, hsize_t& y, hsize_t& z, hsize_t width, hsize_t height, hsize_t factorX, hsize_t factorY, hsize_t factorZ);
+
+
 #endif
diff --git a/src/main.cc b/src/main.cc
index 2c03133..17b6f25 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -9,7 +9,7 @@
 #include <sstream>
 #include "Converter.h"
 
-bool getOptions(int argc, char** argv, std::string& inputFileName, std::string& outputFileName, bool& slow, bool& progress, bool& onlyReportMemory, bool& zMips) {
+bool getOptions(int argc, char** argv, std::string& inputFileName, std::string& outputFileName, bool& slow, bool& smart, bool& progress, bool& onlyReportMemory, bool& zMips, int& memoryLimitInMb) {
     extern int optind;
     extern char *optarg;
     
@@ -22,13 +22,14 @@ bool getOptions(int argc, char** argv, std::string& inputFileName, std::string&
     << "Usage: fits2idia [-o output_filename] [-s] [-p] [-m] [-z] input_filename" << std::endl << std::endl
     << "Options:" << std::endl 
     << "-o\tOutput filename" << std::endl 
-    << "-s\tUse slower but less memory-intensive method (enable if memory allocation fails)" << std::endl 
+    << "-s\tUse slower but less memory-intensive method (enable if memory allocation fails)" << std::endl
+    << "-r\tUse smart method" << std::endl
     << "-p\tPrint progress output (by default the program is silent)" << std::endl
     << "-m\tReport predicted memory usage and exit without performing the conversion" << std::endl
     << "-q\tSuppress all non-error output. Deprecated; this is now the default." << std::endl
     << "-z\tInclude axis 3 in mipmap calculation (currently not compatible with -s mode)." << std::endl;
 
-    while ((opt = getopt(argc, argv, ":o:spqmz")) != -1) {
+    while ((opt = getopt(argc, argv, ":o:sr::pqmz")) != -1) {
         switch (opt) {
             case 'o':
                 outputFileName.assign(optarg);
@@ -37,6 +38,14 @@ bool getOptions(int argc, char** argv, std::string& inputFileName, std::string&
                 // use slower but less memory-intensive method
                 slow = true;
                 break;
+            case 'r':
+                // use smart method
+                if (optarg) { // If an argument is provided
+                    int arg = std::stoi(optarg); // Convert the argument to an integer
+                    smart = true; // Enable smart mode
+                    memoryLimitInMb = arg; // Assign the argument to the memory limit for smart mode
+                } //else, fast mode will be used by default because memory limit is not concern for user
+                memoryLimitInMb = 0; // Assign 0 to the memory limit for smart mode because no argument is provided
             case 'p':
                 progress = true;
                 break;
@@ -96,11 +105,13 @@ int main(int argc, char** argv) {
     std::string inputFileName;
     std::string outputFileName;
     bool slow(false);
+    bool smart(false);
     bool progress(false);
     bool onlyReportMemory(false);
     bool zMips(false);
+    int memoryLimitInMb(0);
     
-    if (!getOptions(argc, argv, inputFileName, outputFileName, slow, progress, onlyReportMemory, zMips)) {
+    if (!getOptions(argc, argv, inputFileName, outputFileName, slow, smart, progress, onlyReportMemory, zMips, memoryLimitInMb)) {
         return 1;
     }
 
@@ -131,7 +142,7 @@ int main(int argc, char** argv) {
     std::unique_ptr<Converter> converter;
         
     try {
-        converter = Converter::getConverter(inputFileName, outputFileName, slow, progress, zMips);
+        converter = Converter::getConverter(inputFileName, outputFileName, slow, smart, progress, zMips, memoryLimitInMb);
         
         if (onlyReportMemory) {
             converter->reportMemoryUsage();