From 2ad6db2dd6fc47a9e9489bf6d69811cc35655cca Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 23 Feb 2025 18:54:35 +0000 Subject: [PATCH 001/115] Import torch packages to handle distributed computing in models --- src/cdtools/models/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index b31061d7..aa4dfb5b 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -30,6 +30,8 @@ import torch as t from torch.utils import data as torchdata +from torch.utils.data.distributed import DistributedSampler +from torch.nn.parallel import DistributedDataParallel from matplotlib import pyplot as plt from matplotlib.widgets import Slider from matplotlib import ticker From c393f47bd4b747a295d8f6464f686cc758093e5d Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 23 Feb 2025 20:43:42 +0000 Subject: [PATCH 002/115] Duplicated fancy ptycho example for multi gpu demo with DistributedDataParallel --- examples/fancy_ptycho_multi_gpu_ddp.py | 44 ++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/fancy_ptycho_multi_gpu_ddp.py diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py new file mode 100644 index 00000000..1cf58550 --- /dev/null +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -0,0 +1,44 @@ +import cdtools +from matplotlib import pyplot as plt + +filename = 'example_data/lab_ptycho_data.cxi' +dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + +# FancyPtycho is the workhorse model +model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, # Use 3 incoherently mixing probe modes + oversampling=2, # Simulate the probe on a 2xlarger real-space array + probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix + propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm + units='mm', # Set the units for the live plots + obj_view_crop=-50, # Expands the field of 
view in the object plot by 50 pix +) + +device = 'cuda' +model.to(device=device) +dataset.get_as(device=device) + +# The learning rate parameter sets the alpha for Adam. +# The beta parameters are (0.9, 0.999) by default +# The batch size sets the minibatch size +for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=10): + print(model.report()) + # Plotting is expensive, so we only do it every tenth epoch + if model.epoch % 10 == 0: + model.inspect(dataset) + +# It's common to chain several different reconstruction loops. Here, we +# started with an aggressive refinement to find the probe, and now we +# polish the reconstruction with a lower learning rate and larger minibatch +for loss in model.Adam_optimize(50, dataset, lr=0.005, batch_size=50): + print(model.report()) + if model.epoch % 10 == 0: + model.inspect(dataset) + +# This orthogonalizes the recovered probe modes +model.tidy_probes() + +model.inspect(dataset) +model.compare(dataset) +plt.show() From fc40b77043b96e068122507ea40aa0e3c0331f8f Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 23 Feb 2025 20:48:54 +0000 Subject: [PATCH 003/115] Reworked fancy ptycho example for multi GPU testing with DDP --- examples/fancy_ptycho_multi_gpu_ddp.py | 169 +++++++++++++++++++------ 1 file changed, 128 insertions(+), 41 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 1cf58550..b689b1b8 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -1,44 +1,131 @@ import cdtools from matplotlib import pyplot as plt -filename = 'example_data/lab_ptycho_data.cxi' -dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) - -# FancyPtycho is the workhorse model -model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, # Use 3 incoherently mixing probe modes - oversampling=2, # Simulate the probe on a 2xlarger real-space array - probe_support_radius=120, # Force the probe to 0 outside a 
radius of 120 pix - propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm - units='mm', # Set the units for the live plots - obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix -) - -device = 'cuda' -model.to(device=device) -dataset.get_as(device=device) - -# The learning rate parameter sets the alpha for Adam. -# The beta parameters are (0.9, 0.999) by default -# The batch size sets the minibatch size -for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=10): - print(model.report()) - # Plotting is expensive, so we only do it every tenth epoch - if model.epoch % 10 == 0: - model.inspect(dataset) - -# It's common to chain several different reconstruction loops. Here, we -# started with an aggressive refinement to find the probe, and now we -# polish the reconstruction with a lower learning rate and larger minibatch -for loss in model.Adam_optimize(50, dataset, lr=0.005, batch_size=50): - print(model.report()) - if model.epoch % 10 == 0: - model.inspect(dataset) - -# This orthogonalizes the recovered probe modes -model.tidy_probes() - -model.inspect(dataset) -model.compare(dataset) -plt.show() +# To use multiple GPUs, we need to import a few additional packages +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.distributed import init_process_group, destroy_process_group, barrier +import torch.multiprocessing as mp + +# While not strictly necessary, it's super useful to have in the event +# the computation hangs by defining a timeout period. +import datetime +timeout = datetime.timedelta(seconds=60) # Terminate if things hang for 60s. + +# We will need to specify what multiprocessing backend we want to use. +# PyTorch supports a few backends (gloo, MPI, NCCL). We will use NCCL, or +# NVIDIA Collective Communications Library, as it's the fastest one. +# +# It's also the only one that works with the current multi-GPU implementation... 
+BACKEND = 'nccl' + +# We need to wrap the script inside a function in order to use "mp.spawn" +# which will help distribute the work to multiple GPUs +# +# In fancier terms, we will use mp.spawn to create several processes +# that will work on the model using N-number of GPUs, (a.k.a., 'WORLD_SIZE') +# Each process will be given to one GPU that's assigned a number called +# a RANK (which ranges from 0 to WORLD_SIZE-1). +def multi_gpu_reconstruct(rank: int, + world_size: int): + """Perform the reconstruction using several GPUs + Parameters: + rank: int + The rank of the GPU to be used. Value should be within + [0, world_size-1] + + world_size: int + The total number of GPUs to use + """ + # We need to initialize the distributed process group + # before calling any other method + init_process_group(backend=BACKEND, + rank=rank, + world_size=world_size, + timeout=timeout) + + filename = 'example_data/lab_ptycho_data.cxi' + dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + + model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + oversampling=2, + probe_support_radius=120, + propagation_distance=5e-3, + units='mm', + obj_view_crop=-50, + ) + + # We need to adjust the device string to also indicate which GPU this + # process is using + device = f'cuda:{rank}' + model.to(device=device) + dataset.get_as(device=device) + + # We now wrap the model with DistributedDataParallel (DDP), which allows + # data parallelism by synchronizing gradients across each copy of the + # model in the different GPUs. + model = DDP(model, + device_ids=[rank], # Tells DDP which GPU the model lives in + output_device=rank, # Tells DDP which GPU to output to + find_unused_parameters=True) # TODO: Understand what this is really doing... 
+ + # As a sanity check, we wait for all GPUs to catch up to barrier() before + # running optimization + barrier() + + + # Since our model is now wrapped in DDP, all CDTools methods have to be + # called using 'model.module' rather than just 'model'. + # + # We also need to pass the rank and world_size to Adam_optimize + for loss in model.module.Adam_optimize(50, + dataset, + lr=0.02, + batch_size=10, + rank=rank, + num_workers=world_size): + + # We can still perform model.inspect and model.report, but we want + # to only let 1 GPU handle plotting/printing rather than get N copies + # from all N GPUs. + if rank == 0: + print(model.module.report()) + + # We set up the model.inspect this way to only let GPU 0 plot and + # prevent the other GPUs from running far ahead of GPU 0, which + # seems to cause bugs (GPU processes dissapear from nvidia-smi) + if model.module..epoch % 10 == 0: + if rank == 0: + model.module.inspect(dataset) + barrier() + + + # We set up another barrier to make sure all GPUs catch up before + # starting another reconstruction loop + barrier() + + for loss in model.module.Adam_optimize(50, + dataset, + lr=0.005, + batch_size=50, + rank=rank, + num_workers=world_size): + if rank == 0: + print(model.module.report()) + + if model.epoch % 10 == 0: + if rank == 0: + model.module.inspect(dataset) + barrier() + + # Again, set up another barrier to let all GPUs catch up + barrier() + + model.module.tidy_probes() # TODO: Check how the multi-GPU implementation handles tidying probes. + + # Only let one GPU handle plotting stuff. 
+ if rank == 0: + model.module.inspect(dataset) + model.module.compare(dataset) + plt.show() \ No newline at end of file From e8ed0100c60ce34089f78e8bbd213f96230753b6 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 23 Feb 2025 21:03:37 +0000 Subject: [PATCH 004/115] Added a name-main block to the fancy ptycho multi gpu example --- examples/fancy_ptycho_multi_gpu_ddp.py | 33 +++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index b689b1b8..df51345d 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -5,6 +5,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP from torch.distributed import init_process_group, destroy_process_group, barrier import torch.multiprocessing as mp +import os # While not strictly necessary, it's super useful to have in the event # the computation hangs by defining a timeout period. @@ -128,4 +129,34 @@ def multi_gpu_reconstruct(rank: int, if rank == 0: model.module.inspect(dataset) model.module.compare(dataset) - plt.show() \ No newline at end of file + plt.show() + +# This will execute the multi_gpu_reconstruct upon running this file +if __name__ == '__main__': + # We need to add some stuff to the enviromnent + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '8888' + + # Define the number of GPUs to use. + world_size = 4 + + # Write a try/except statement to help the subprocesses (and GPUs) + # terminate gracefully. Otherwise, you may have stuff loaded on + # several GPU even after terminating. 
+ try: + # Spawn the processes + mp.spawn(multi_gpu_reconstruct, + args=(world_size,), + nprocs=world_size, + join=True) + + # Always destroy the process group when you're done + destroy_process_group() + + except Exception as e: + # If something breaks, we try to make sure that the + # process group is destroyed before the program fully + # terminates + print(e) + destroy_process_group() + From 94dcc3fedd7a47bd1f071606bad9ef4b37fd27d3 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 23 Feb 2025 21:28:57 +0000 Subject: [PATCH 005/115] Fixed a double-period typo --- examples/fancy_ptycho_multi_gpu_ddp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index df51345d..edc5f04d 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -96,7 +96,7 @@ def multi_gpu_reconstruct(rank: int, # We set up the model.inspect this way to only let GPU 0 plot and # prevent the other GPUs from running far ahead of GPU 0, which # seems to cause bugs (GPU processes dissapear from nvidia-smi) - if model.module..epoch % 10 == 0: + if model.module.epoch % 10 == 0: if rank == 0: model.module.inspect(dataset) barrier() From ad3d8166dd33cb3d9144a836d46f70c9f50dd732 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 23 Feb 2025 22:42:37 +0000 Subject: [PATCH 006/115] Added basic multi GPU support for AD_optimize and Adam_optimize in CDIModel --- src/cdtools/models/base.py | 49 +++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index aa4dfb5b..775c7330 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -359,6 +359,10 @@ def AD_optimize(self, iterations, data_loader, optimizer,\ The summed loss over the latest epoch, divided by the total diffraction pattern intensity """ + # Check if multi-GPU operations are being 
conducted (i.e., + # a process group is initialized) + is_multi_GPU = t.distributed.is_initialized() + def run_epoch(stop_event=None): """Runs one full epoch of the reconstruction.""" # First, initialize some tracking variables @@ -446,6 +450,10 @@ def closure(): yield float('nan') continue + # If we're using DistributedSampler (likely the case if + # you're using multiple GPUs), we need to tell it + # which epoch we're on before running an epoch + if is_multi_GPU: data_loader.sampler.set_epoch(self.epoch) yield run_epoch() @@ -472,6 +480,11 @@ def target(): yield float('nan') continue + # If we're using DistributedSampler, (likely the case if + # you're using multiple GPUs), we need to tell it which + # epoch we're on before running an epoch + if is_multi_GPU: data_loader.sampler.set_epoch(self.epoch) + calc = threading.Thread(target=target, name='calculator', daemon=True) try: calc.start() @@ -511,7 +524,9 @@ def Adam_optimize( subset=None, regularization_factor=None, thread=True, - calculation_width=10 + calculation_width=10, + num_workers=1, + rank=None ): """Runs a round of reconstruction using the Adam optimizer @@ -542,7 +557,11 @@ def Adam_optimize( Default True, whether to run the computation in a separate thread to allow interaction with plots during computation calculation_width : int Default 10, how many translations to pass through at once for each round of gradient accumulation. Does not affect the result, only the calculation speed - + num_workers: int + Default 1, how many GPUs to distribute calculations over + rank: int + Default None, the rank of the GPU to be used when performing multi-gpu operations. 
Value should be within [0, world_size-1] + """ self.training_history += ( @@ -558,10 +577,28 @@ def Adam_optimize( subset = [subset] dataset = torchdata.Subset(dataset, subset) - # Make a dataloader - data_loader = torchdata.DataLoader(dataset, - batch_size=batch_size, - shuffle=True) + # Make a dataloader suited for either single-GPU use or cases + # where a process group (i.e., multiple GPUs) has been initialized + if num_workers > 1: + # First, create a sampler to load subsets of dataset to the GPUs + # TODO: Test out drop_last to see how much that influences reconstructions + sampler = DistributedSampler(dataset, + num_replicas=num_workers, + rank=rank, + shuffle=True, + drop_last=False) + # Now create the dataloader + data_loader = torchdata.DataLoader(dataset, + batch_size=batch_size, # TODO: Recalculate the batch_size for multi-GPU operation + shuffle=False, # Shuffling is now handled by sampler + num_workers=0, # I'm not 100% sure what this does, but apparently making this >0 can cause bugs + drop_last=False, # TODO: Test out how this influences reconstructions + pin_memory=False,# I'm not 100% sure what this does, but apparently making this True can cause bugs + sampler=sampler) + else: + data_loader = torchdata.DataLoader(dataset, + batch_size=batch_size, + shuffle=True) # Define the optimizer optimizer = t.optim.Adam( From efe2ac1429236489d071939fe15b40cd79071dea Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 23 Feb 2025 22:50:14 +0000 Subject: [PATCH 007/115] Fixes for fancy_ptycho_multi_gpu_ddp.py --- examples/fancy_ptycho_multi_gpu_ddp.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index edc5f04d..24882503 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -10,10 +10,10 @@ # While not strictly necessary, it's super useful to have in the event # the computation hangs by 
defining a timeout period. import datetime -timeout = datetime.timedelta(seconds=60) # Terminate if things hang for 60s. +TIMEOUT = datetime.timedelta(seconds=60) # Terminate if things hang for 60s. # We will need to specify what multiprocessing backend we want to use. -# PyTorch supports a few backends (gloo, MPI, NCCL). We will use NCCL, or +# PyTorch supports a few backends (such as gloo, MPI, NCCL). We will use NCCL, or # NVIDIA Collective Communications Library, as it's the fastest one. # # It's also the only one that works with the current multi-GPU implementation... @@ -42,9 +42,9 @@ def multi_gpu_reconstruct(rank: int, init_process_group(backend=BACKEND, rank=rank, world_size=world_size, - timeout=timeout) + timeout=TIMEOUT) - filename = 'example_data/lab_ptycho_data.cxi' + filename = r'example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) model = cdtools.models.FancyPtycho.from_dataset( @@ -75,10 +75,8 @@ def multi_gpu_reconstruct(rank: int, # running optimization barrier() - # Since our model is now wrapped in DDP, all CDTools methods have to be # called using 'model.module' rather than just 'model'. 
- # # We also need to pass the rank and world_size to Adam_optimize for loss in model.module.Adam_optimize(50, dataset, @@ -115,7 +113,7 @@ def multi_gpu_reconstruct(rank: int, if rank == 0: print(model.module.report()) - if model.epoch % 10 == 0: + if model.module.epoch % 10 == 0: if rank == 0: model.module.inspect(dataset) barrier() @@ -130,6 +128,12 @@ def multi_gpu_reconstruct(rank: int, model.module.inspect(dataset) model.module.compare(dataset) plt.show() + + # Again, set up another barrier to let all GPUs catch up + barrier() + + # Always destroy the process group when you're done + destroy_process_group() # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': @@ -138,7 +142,7 @@ def multi_gpu_reconstruct(rank: int, os.environ['MASTER_PORT'] = '8888' # Define the number of GPUs to use. - world_size = 4 + world_size = 2 # Write a try/except statement to help the subprocesses (and GPUs) # terminate gracefully. Otherwise, you may have stuff loaded on @@ -149,9 +153,6 @@ def multi_gpu_reconstruct(rank: int, args=(world_size,), nprocs=world_size, join=True) - - # Always destroy the process group when you're done - destroy_process_group() except Exception as e: # If something breaks, we try to make sure that the From 1eb870465ce063e3f89ec71928bd5ae9106b4f9a Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 24 Feb 2025 04:28:07 +0000 Subject: [PATCH 008/115] Created fancy_ptycho_multi_gpu_ddp_speed_test.py to compare reconstruction speed and losses as a function of GPU counts --- .../fancy_ptycho_multi_gpu_ddp_speed_test.py | 215 ++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 examples/fancy_ptycho_multi_gpu_ddp_speed_test.py diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py new file mode 100644 index 00000000..905297c7 --- /dev/null +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -0,0 +1,215 @@ +'''This is a testing script 
to study how the reconstruction speed +and convergence rate scales with the number of GPUs utilized. + +The test is set up so that you can run n-trials for each number of GPUs +you want to study and plot statistics of loss-versus-time as a function +of GPU counts. + +This test is based on fancy_ptycho_multi_gpu_ddp.py and fancy_ptycho.py. + +''' + +import cdtools +from matplotlib import pyplot as plt +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.distributed import init_process_group, destroy_process_group, barrier +import torch.multiprocessing as mp +import os +import datetime +import time +import numpy as np + +TIMEOUT = datetime.timedelta(seconds=10) # Auto-terminate if things hang +BACKEND = 'nccl' + + +# Multi-GPU supported reconstruction +def multi_gpu_reconstruct(rank: int, + world_size: int, + conn, + schedule=False) -> tuple[np.array, np.array]: + """Perform the reconstruction using several GPUs + If only one GPU is used, we don't bother loading the the process group + or doing any of the fancy stuff associated with multi-GPU operation. + + Parameters: + rank: int + The rank of the GPU to be used. 
Value should be within + [0, world_size-1] + world_size: int + The total number of GPUs to use + conn: mp.Pipe + Connection to parent + schedule: bool + Toggles the use of the scheduler + + Returns: + time_history: np.array + Array of when each loss was measured + loss_history: np.array + The total history of the model + """ + # Create a list to keep track of when each module report was printed + t_list = [] + # Start counting time + t_start = time.time() + + # Load the dataset + filename = r'example_data/lab_ptycho_data.cxi' + dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + + if world_size > 1: + # We need to initialize the distributed process group + # before calling any other method for multi-GPU usage + init_process_group(backend=BACKEND, + rank=rank, + world_size=world_size, + timeout=TIMEOUT) + + # Create the model + model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + oversampling=2, + probe_support_radius=120, + propagation_distance=5e-3, + units='mm', + obj_view_crop=-50, + ) + + # Assign devices + device = f'cuda:{rank}' + model.to(device=device) + dataset.get_as(device=device) + + # Perform reconstructions on either single or multi-GPU workflows. + if world_size > 1: + # For multi-GPU workflows, we have to use this mess. + model = DDP(model, + device_ids=[rank], # Tells DDP which GPU the model lives in + output_device=rank, # Tells DDP which GPU to output to + find_unused_parameters=True) # TODO: Understand what this is really doing... 
+ barrier() + + for loss in model.module.Adam_optimize(50, + dataset, + lr=0.02, + batch_size=10, + rank=rank, + num_workers=world_size, + schedule=schedule): + if rank == 0: + print(model.module.report()) + t_list.append(time.time() - t_start) + barrier() + + for loss in model.module.Adam_optimize(50, + dataset, + lr=0.005, + batch_size=50, + rank=rank, + num_workers=world_size, + schedule=schedule): + if rank == 0: + print(model.module.report()) + t_list.append(time.time() - t_start) + # Again, set up another barrier to let all GPUs catch up + barrier() + # Always destroy the process group when you're done + destroy_process_group() + + # We need to send the time_history and loss_history through + # the child connection to the parent (sitting in the name-main block) + if rank == 0: + loss_history = np.array(model.module.loss_history) + time_history = np.array(t_list) + conn.send((time_history, loss_history)) + + else: + # For single-GPU workloads, we use the vanilla-way of performing + # reconstructions in CDTools + for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=10, schedule=schedule): + print(model.report()) + t_list.append(time.time() - t_start) + for loss in model.Adam_optimize(50, dataset, lr=0.005, batch_size=50, schedule=schedule): + print(model.report()) + t_list.append(time.time() - t_start) + + loss_history = np.array(model.loss_history) + time_history = np.array(t_list) + # Return the measured time and loss history + return time_history, loss_history + +# This will execute the multi_gpu_reconstruct upon running this file +if __name__ == '__main__': + # We need to add some stuff to the enviromnent + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '8888' # You can use any open port number + + # Set up a parent/child connection to get some info from the GPU-accelerated + # function + parent_conn, child_conn = mp.Pipe() + + # Define the number of GPUs to use. 
+ world_sizes = [2, 1] + + # Define if we want to use the scheduler or not + schedule=True + + # Define how many iterations we want to perform of the reconstructions + # for statistics + runs = 2 + + # Write a try/except statement to help the subprocesses (and GPUs) + # terminate gracefully. Otherwise, you may have stuff loaded on + # several GPU even after terminating. + try: + for world_size in world_sizes: + print(f'Number of GPU(s): {world_size}') + # Make a list to store the values + time_list = [] + loss_hist_list = [] + + for i in range(runs): + print(f'Starting run {i+1}/{runs} on {world_size} GPU(s)') + if world_size == 1: + final_time, loss_history = multi_gpu_reconstruct(0, world_size,schedule) + time_list.append(final_time) + loss_hist_list.append(loss_history) + else: + # Spawn the processes + mp.spawn(multi_gpu_reconstruct, + args=(world_size, child_conn, schedule), + nprocs=world_size, + join=True) + while parent_conn.poll(): + final_time, loss_history = parent_conn.recv() + time_list.append(final_time) + loss_hist_list.append(loss_history) + + + # Calculate the statistics + time_mean = np.array(time_list).mean(axis=0)/60 + time_std = np.array(time_list).std(axis=0)/60 + loss_mean = np.array(loss_hist_list).mean(axis=0) + loss_std = np.array(loss_hist_list).std(axis=0) + + # Plot + plt.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, + label=f'{world_size} GPUs') + plt.yscale('log') + plt.xscale('linear') + + plt.legend() + plt.xlabel('Time (min)') + plt.ylabel('Loss') + plt.show() + + + except KeyboardInterrupt as e: + # If something breaks, we try to make sure that the + # process group is destroyed before the program fully + # terminates + print('Hang on a sec...') + destroy_process_group() + From 7c766265e8af93152fd5a847847bf9dca69961f6 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 27 Feb 2025 05:25:21 +0000 Subject: [PATCH 009/115] Moved dataset import outside of function --- examples/fancy_ptycho_multi_gpu_ddp.py | 5 +++-- 1 
file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 24882503..a49d23d8 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -19,6 +19,9 @@ # It's also the only one that works with the current multi-GPU implementation... BACKEND = 'nccl' +filename = r'example_data/lab_ptycho_data.cxi' +dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + # We need to wrap the script inside a function in order to use "mp.spawn" # which will help distribute the work to multiple GPUs # @@ -44,8 +47,6 @@ def multi_gpu_reconstruct(rank: int, world_size=world_size, timeout=TIMEOUT) - filename = r'example_data/lab_ptycho_data.cxi' - dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) model = cdtools.models.FancyPtycho.from_dataset( dataset, From 9831b19a088916045f810595d9693c2e8b5b3013 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 27 Feb 2025 05:26:46 +0000 Subject: [PATCH 010/115] Disabled NCCL peer-2-peer communication in fancy_ptycho_multi_gpu_ddp.py --- examples/fancy_ptycho_multi_gpu_ddp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index a49d23d8..1f95dc34 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -141,7 +141,8 @@ def multi_gpu_reconstruct(rank: int, # We need to add some stuff to the enviromnent os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '8888' - + os.environ['NCCL_P2P_DISABLE'] = '1' + # Define the number of GPUs to use. 
world_size = 2 From 057233c43551c5a0ed89b0ed7efb899424a3e6b7 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 27 Feb 2025 05:43:12 +0000 Subject: [PATCH 011/115] Disabled NCCL P2P for the multi gpu speed test --- examples/fancy_ptycho_multi_gpu_ddp_speed_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index 905297c7..8001e57c 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -145,6 +145,7 @@ def multi_gpu_reconstruct(rank: int, # We need to add some stuff to the enviromnent os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '8888' # You can use any open port number + os.environ['NCCL_P2P_DISABLE'] = '1' # Set up a parent/child connection to get some info from the GPU-accelerated # function From 902d2729137cab520cea56a2dc39d17dabdcb613 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 1 Mar 2025 23:55:26 +0000 Subject: [PATCH 012/115] Created new module cdtools.tools.distributed for multi-GPU applications --- src/cdtools/tools/distributed/__init__.py | 2 + src/cdtools/tools/distributed/distributed.py | 62 ++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 src/cdtools/tools/distributed/__init__.py create mode 100644 src/cdtools/tools/distributed/distributed.py diff --git a/src/cdtools/tools/distributed/__init__.py b/src/cdtools/tools/distributed/__init__.py new file mode 100644 index 00000000..f2380d39 --- /dev/null +++ b/src/cdtools/tools/distributed/__init__.py @@ -0,0 +1,2 @@ +from cdtools.tools.distributed.distributed import * +from cdtools.tools.distributed.distributed import __all__, __doc__ diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py new file mode 100644 index 00000000..d2d8d137 --- /dev/null +++ b/src/cdtools/tools/distributed/distributed.py @@ -0,0 +1,62 @@ +"""Contains 
functions for setting up and executing multi-gpu reconstructions + + +""" + +import numpy as np +import torch as t +from torch.distributed import init_process_group, destroy_process_group, barrier +from torch.nn.parallel import DistributedDataParallel as DDP +import torch.multiprocessing as mp +import datetime +import os +from functools import partial + +__all__ = ['spawn', 'multi_gpu'] + + +# Set the default timeout to 60s +WAIT_TIME = datetime.timedelta(seconds=60) + +def spawn(reconstructor, + world_size: int = 2, + master_addr: str = 'localhost', + master_port: str = '8888', + nccl_p2p_disable: bool = True): + """A wrapper around torch.multiprocessing.spawn. + + It includes the setup of OS environmental variables needed for + initializing the distributed backend + + Parameters: + reconstructor: + The wrapped reconstruction loop + world_size: int + Number of GPUs to use + master_addr: str + IP address of the machine that will host the process with rank 0 + master_port: str + A free port on the machine that will host the process with rank 0 + nccl_p2p_disable: bool + Disable NCCL peer-2-peer communication + """ + # Set up environment variables + os.environ['MASTER_ADDR'] = master_addr + os.environ['MASTER_PORT'] = master_port + os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) + + # Ensure a "graceful" termination of subprocesses if something goes wrong. + try: + print('\nStarting up multi-GPU reconstructions...') + mp.spawn(reconstructor, + args=(world_size,), + nprocs=world_size, + join=True) + print('Reconstructions complete. 
Stopping processes...') + + except Exception as e: + # If something breaks, we try to make sure that the + # process group is destroyed before the program fully + # terminates + print(e) + destroy_process_group() From aedc0359b478ffa41fe3ec883ff5941934e1b0d5 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 1 Mar 2025 23:57:52 +0000 Subject: [PATCH 013/115] Spawning handled by cdtools.tools.distributed module in multi-gpu example script --- examples/fancy_ptycho_multi_gpu_ddp.py | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 1f95dc34..0056b49f 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -6,6 +6,7 @@ from torch.distributed import init_process_group, destroy_process_group, barrier import torch.multiprocessing as mp import os +from cdtools.tools.distributed import distributed # While not strictly necessary, it's super useful to have in the event # the computation hangs by defining a timeout period. @@ -138,28 +139,5 @@ def multi_gpu_reconstruct(rank: int, # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': - # We need to add some stuff to the enviromnent - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '8888' - os.environ['NCCL_P2P_DISABLE'] = '1' - - # Define the number of GPUs to use. - world_size = 2 - - # Write a try/except statement to help the subprocesses (and GPUs) - # terminate gracefully. Otherwise, you may have stuff loaded on - # several GPU even after terminating. 
- try: - # Spawn the processes - mp.spawn(multi_gpu_reconstruct, - args=(world_size,), - nprocs=world_size, - join=True) - - except Exception as e: - # If something breaks, we try to make sure that the - # process group is destroyed before the program fully - # terminates - print(e) - destroy_process_group() + distributed.spawn(multi_gpu_reconstruct, world_size = 4) From 4521f35ede256aaa6c090ec4904e274a28cc3b1b Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 2 Mar 2025 01:01:59 +0000 Subject: [PATCH 014/115] Created a wrapper for managing process groups in reconstruction scripts --- src/cdtools/tools/distributed/distributed.py | 45 +++++++++++++++++--- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index d2d8d137..3075ea9c 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -1,4 +1,5 @@ -"""Contains functions for setting up and executing multi-gpu reconstructions +"""Contains wrapper functions to make reconstruction scripts compatible +with multi-GPU distributive approaches in PyTorch. """ @@ -10,7 +11,7 @@ import torch.multiprocessing as mp import datetime import os -from functools import partial +import functools __all__ = ['spawn', 'multi_gpu'] @@ -18,8 +19,42 @@ # Set the default timeout to 60s WAIT_TIME = datetime.timedelta(seconds=60) + +def process_manager(rank, reconstructor, world_size, backend, timeout): + """A wrapper around the reconstruction script defined in a high-level + user interface. 
+ + This wraps around the reconstructor function, enabling multi-GPU operations + to be set up afterwards through torch.multiprocessing.spawn or + cdtools.tools.distributed.distributed.spawn + + Parameters: + rank: int + reconstructor: + The wrapped reconstruction loop + world_size: int + Number of GPUs to use + master_addr: str + IP address of the machine that will host the process with rank 0 + master_port: str + A free port on the machine that will host the process with rank 0 + nccl_p2p_disable: bool + Disable NCCL peer-2-peer communication + """ + init_process_group(backend=backend, + rank=rank, + world_size=world_size, + timeout=timeout) + + reconstructor(rank, world_size) + barrier() + destroy_process_group() + + def spawn(reconstructor, - world_size: int = 2, + world_size: int, + backend: str = 'nccl', + timeout: datetime = WAIT_TIME, master_addr: str = 'localhost', master_port: str = '8888', nccl_p2p_disable: bool = True): @@ -48,8 +83,8 @@ def spawn(reconstructor, # Ensure a "graceful" termination of subprocesses if something goes wrong. try: print('\nStarting up multi-GPU reconstructions...') - mp.spawn(reconstructor, - args=(world_size,), + mp.spawn(process_manager, + args=(reconstructor, world_size, backend, timeout), nprocs=world_size, join=True) print('Reconstructions complete. 
Stopping processes...') From 00ee7070e478deb869a9dcc59921ec4fb74c6f70 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 2 Mar 2025 01:03:05 +0000 Subject: [PATCH 015/115] Removed process group managing functions from fancy_ptycho_multi_gpu_ddp.py --- examples/fancy_ptycho_multi_gpu_ddp.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 0056b49f..46047919 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -43,11 +43,6 @@ def multi_gpu_reconstruct(rank: int, """ # We need to initialize the distributed process group # before calling any other method - init_process_group(backend=BACKEND, - rank=rank, - world_size=world_size, - timeout=TIMEOUT) - model = cdtools.models.FancyPtycho.from_dataset( dataset, @@ -131,11 +126,6 @@ def multi_gpu_reconstruct(rank: int, model.module.compare(dataset) plt.show() - # Again, set up another barrier to let all GPUs catch up - barrier() - - # Always destroy the process group when you're done - destroy_process_group() # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': From db01ac86a1f98d4f1157efaaaefb4206e985405a Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 2 Mar 2025 01:06:44 +0000 Subject: [PATCH 016/115] Changed type hint for multi_gpu_reconstruct from array to ndarray --- examples/fancy_ptycho_multi_gpu_ddp_speed_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index 8001e57c..661ec82c 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -27,7 +27,7 @@ def multi_gpu_reconstruct(rank: int, world_size: int, conn, - schedule=False) -> tuple[np.array, np.array]: + schedule=False) -> tuple[np.ndarray, ndnp.array]: """Perform the reconstruction 
using several GPUs If only one GPU is used, we don't bother loading the the process group or doing any of the fancy stuff associated with multi-GPU operation. From 5be1770afe8cf4f906d04722775e118afa62bc72 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 2 Mar 2025 01:13:17 +0000 Subject: [PATCH 017/115] Fixed dumb typo for ndarray --- examples/fancy_ptycho_multi_gpu_ddp_speed_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index 661ec82c..df7651e3 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -27,7 +27,7 @@ def multi_gpu_reconstruct(rank: int, world_size: int, conn, - schedule=False) -> tuple[np.ndarray, ndnp.array]: + schedule=False) -> tuple[np.ndarray, np.ndarray]: """Perform the reconstruction using several GPUs If only one GPU is used, we don't bother loading the the process group or doing any of the fancy stuff associated with multi-GPU operation. 
From ad3c1e37ef831fcb06f2fed78adc2f9bbb5d1c67 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 2 Mar 2025 01:37:42 +0000 Subject: [PATCH 018/115] Device loading of the model and dataset is now handled by process_manager --- examples/fancy_ptycho_multi_gpu_ddp.py | 33 +++++++++++--------- src/cdtools/tools/distributed/distributed.py | 30 ++++++++++++++---- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 46047919..b3a6d8fb 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -23,6 +23,17 @@ filename = r'example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) +model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + oversampling=2, + probe_support_radius=120, + propagation_distance=5e-3, + units='mm', + obj_view_crop=-50, +) + + # We need to wrap the script inside a function in order to use "mp.spawn" # which will help distribute the work to multiple GPUs # @@ -30,7 +41,9 @@ # that will work on the model using N-number of GPUs, (a.k.a., 'WORLD_SIZE') # Each process will be given to one GPU that's assigned a number called # a RANK (which ranges from 0 to WORLD_SIZE-1). 
-def multi_gpu_reconstruct(rank: int, +def multi_gpu_reconstruct(model, + dataset, + rank: int, world_size: int): """Perform the reconstruction using several GPUs Parameters: @@ -44,21 +57,10 @@ def multi_gpu_reconstruct(rank: int, # We need to initialize the distributed process group # before calling any other method - model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, - oversampling=2, - probe_support_radius=120, - propagation_distance=5e-3, - units='mm', - obj_view_crop=-50, - ) # We need to adjust the device string to also indicate which GPU this # process is using - device = f'cuda:{rank}' - model.to(device=device) - dataset.get_as(device=device) + # We now wrap the model with DistributedDataParallel (DDP), which allows # data parallelism by synchronizing gradients across each copy of the @@ -129,5 +131,8 @@ def multi_gpu_reconstruct(rank: int, # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': - distributed.spawn(multi_gpu_reconstruct, world_size = 4) + distributed.spawn(multi_gpu_reconstruct, + model=model, + dataset=dataset, + world_size = 4) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 3075ea9c..9c8528ec 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -13,14 +13,14 @@ import os import functools -__all__ = ['spawn', 'multi_gpu'] +__all__ = ['spawn'] # Set the default timeout to 60s WAIT_TIME = datetime.timedelta(seconds=60) -def process_manager(rank, reconstructor, world_size, backend, timeout): +def process_manager(rank, reconstructor, model, dataset, world_size, backend, timeout): """A wrapper around the reconstruction script defined in a high-level user interface. 
@@ -30,8 +30,13 @@ def process_manager(rank, reconstructor, world_size, backend, timeout): Parameters: rank: int + Rank of the GPU, with value ranging from [0, world_size-1] reconstructor: The wrapped reconstruction loop + model: t.nn.Module + The CDIModel + dataset: t.utils.data.Dataset + The CDataset world_size: int Number of GPUs to use master_addr: str @@ -41,17 +46,25 @@ def process_manager(rank, reconstructor, world_size, backend, timeout): nccl_p2p_disable: bool Disable NCCL peer-2-peer communication """ + # Initialize the process group init_process_group(backend=backend, rank=rank, world_size=world_size, timeout=timeout) - reconstructor(rank, world_size) - barrier() - destroy_process_group() + # Load the model to the appropriate GPU rank the process is using + device = f'cuda:{rank}' + model.to(device=device) + dataset.get_as(device=device) + + reconstructor(model, dataset, rank, world_size) # Start the reconstruction loop + barrier() # Wait for all GPUs to finish reconstructing + destroy_process_group() # Destroy the process group def spawn(reconstructor, + model, + dataset, world_size: int, backend: str = 'nccl', timeout: datetime = WAIT_TIME, @@ -84,7 +97,12 @@ def spawn(reconstructor, try: print('\nStarting up multi-GPU reconstructions...') mp.spawn(process_manager, - args=(reconstructor, world_size, backend, timeout), + args=(reconstructor, + model, + dataset, + world_size, + backend, + timeout), nprocs=world_size, join=True) print('Reconstructions complete. Stopping processes...') From c9165ea387f07e9ff39050a06c2964dfad81ebad Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 2 Mar 2025 02:37:20 +0000 Subject: [PATCH 019/115] Refactor: process_manager renamed to reconstructor_wrapper. 
reconstructor_wrapper also handles DDP and takes timeout as an integer --- examples/fancy_ptycho_multi_gpu_ddp.py | 30 -------------- src/cdtools/tools/distributed/distributed.py | 41 +++++++++++++------- 2 files changed, 27 insertions(+), 44 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index b3a6d8fb..78687985 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -11,7 +11,6 @@ # While not strictly necessary, it's super useful to have in the event # the computation hangs by defining a timeout period. import datetime -TIMEOUT = datetime.timedelta(seconds=60) # Terminate if things hang for 60s. # We will need to specify what multiprocessing backend we want to use. # PyTorch supports a few backends (such as gloo, MPI, NCCL). We will use NCCL, or @@ -45,35 +44,6 @@ def multi_gpu_reconstruct(model, dataset, rank: int, world_size: int): - """Perform the reconstruction using several GPUs - Parameters: - rank: int - The rank of the GPU to be used. Value should be within - [0, world_size-1] - - world_size: int - The total number of GPUs to use - """ - # We need to initialize the distributed process group - # before calling any other method - - - # We need to adjust the device string to also indicate which GPU this - # process is using - - - # We now wrap the model with DistributedDataParallel (DDP), which allows - # data parallelism by synchronizing gradients across each copy of the - # model in the different GPUs. - model = DDP(model, - device_ids=[rank], # Tells DDP which GPU the model lives in - output_device=rank, # Tells DDP which GPU to output to - find_unused_parameters=True) # TODO: Understand what this is really doing... 
- - # As a sanity check, we wait for all GPUs to catch up to barrier() before - # running optimization - barrier() - # Since our model is now wrapped in DDP, all CDTools methods have to be # called using 'model.module' rather than just 'model'. # We also need to pass the rank and world_size to Adam_optimize diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 9c8528ec..6d96f714 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -16,23 +16,23 @@ __all__ = ['spawn'] -# Set the default timeout to 60s -WAIT_TIME = datetime.timedelta(seconds=60) - -def process_manager(rank, reconstructor, model, dataset, world_size, backend, timeout): - """A wrapper around the reconstruction script defined in a high-level - user interface. - - This wraps around the reconstructor function, enabling multi-GPU operations - to be set up afterwards through torch.multiprocessing.spawn or - cdtools.tools.distributed.distributed.spawn +def reconstructor_wrapper(rank, + reconstructor, + model, + dataset, + world_size, + backend, + timeout): + """Wraps functions containing reconstruction loops (i.e., for loss in model.Adam_optimize) + to enable multi-GPU operations to be set up. 
The wrapped function needs to passed to + torch.multiprocessing.spawn or cdtools.tools.distributed.distributed.spawn Parameters: rank: int Rank of the GPU, with value ranging from [0, world_size-1] reconstructor: - The wrapped reconstruction loop + The reconstruction loop function model: t.nn.Module The CDIModel dataset: t.utils.data.Dataset @@ -56,6 +56,15 @@ def process_manager(rank, reconstructor, model, dataset, world_size, backend, ti device = f'cuda:{rank}' model.to(device=device) dataset.get_as(device=device) + + # Wrap the model with DistributedDataParallel + model = DDP(model, + device_ids=[rank], # Tells DDP which GPU the model lives in + output_device=rank, # Tells DDP which GPU to output to + find_unused_parameters=True) # TODO: Understand what this is really doing... + + # Dayne's special sanity check: Don't start reconstructing until all GPUs have synced. + barrier() reconstructor(model, dataset, rank, world_size) # Start the reconstruction loop barrier() # Wait for all GPUs to finish reconstructing @@ -67,11 +76,12 @@ def spawn(reconstructor, dataset, world_size: int, backend: str = 'nccl', - timeout: datetime = WAIT_TIME, + timeout: int = 60, master_addr: str = 'localhost', master_port: str = '8888', nccl_p2p_disable: bool = True): - """A wrapper around torch.multiprocessing.spawn. + """Spawns world_size processes that runs a reconstructor loop function. + A wrapper around torch.multiprocessing.spawn. It includes the setup of OS environmental variables needed for initializing the distributed backend @@ -88,6 +98,9 @@ def spawn(reconstructor, nccl_p2p_disable: bool Disable NCCL peer-2-peer communication """ + # Convert timeout from int to datetime + timeout = datetime.timedelta(seconds=timeout) + # Set up environment variables os.environ['MASTER_ADDR'] = master_addr os.environ['MASTER_PORT'] = master_port @@ -96,7 +109,7 @@ def spawn(reconstructor, # Ensure a "graceful" termination of subprocesses if something goes wrong. 
try: print('\nStarting up multi-GPU reconstructions...') - mp.spawn(process_manager, + mp.spawn(reconstructor_wrapper, args=(reconstructor, model, dataset, From c9650013d86d02645e0edd61d0a99e66d7b27afb Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 3 Mar 2025 06:25:29 +0000 Subject: [PATCH 020/115] DDP is now handled by distributed_wrapper. reconstructor_wrapper was renamed to distributed_wrapper. --- examples/fancy_ptycho_multi_gpu_ddp.py | 97 ++++++------ src/cdtools/tools/distributed/distributed.py | 151 ++++++++++--------- 2 files changed, 128 insertions(+), 120 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 78687985..632fd435 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -1,24 +1,10 @@ import cdtools from matplotlib import pyplot as plt -# To use multiple GPUs, we need to import a few additional packages -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.distributed import init_process_group, destroy_process_group, barrier -import torch.multiprocessing as mp -import os +# We need to import 2 additional functions +from torch.distributed import barrier from cdtools.tools.distributed import distributed -# While not strictly necessary, it's super useful to have in the event -# the computation hangs by defining a timeout period. -import datetime - -# We will need to specify what multiprocessing backend we want to use. -# PyTorch supports a few backends (such as gloo, MPI, NCCL). We will use NCCL, or -# NVIDIA Collective Communications Library, as it's the fastest one. -# -# It's also the only one that works with the current multi-GPU implementation... 
-BACKEND = 'nccl' - filename = r'example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) @@ -29,24 +15,32 @@ probe_support_radius=120, propagation_distance=5e-3, units='mm', - obj_view_crop=-50, -) + obj_view_crop=-50) + +# Remove or comment out lines moving the dataset and model to GPU. +# This process will be handled by the cdtools.tools.distributed methods. +#device = 'cuda' +#model.to(device=device) +#dataset.get_as(device=device) -# We need to wrap the script inside a function in order to use "mp.spawn" -# which will help distribute the work to multiple GPUs + +# Wrap the rest of the script inside of a function. This function will be +# distributed across several GPUs for multiprocessing at the end. +# +# CDTools multi-GPU methods expects the function to be declared as... +# +# def func(model, dataset, rank, world_size): # -# In fancier terms, we will use mp.spawn to create several processes -# that will work on the model using N-number of GPUs, (a.k.a., 'WORLD_SIZE') -# Each process will be given to one GPU that's assigned a number called -# a RANK (which ranges from 0 to WORLD_SIZE-1). -def multi_gpu_reconstruct(model, - dataset, - rank: int, - world_size: int): - # Since our model is now wrapped in DDP, all CDTools methods have to be - # called using 'model.module' rather than just 'model'. +# ...where rank is an integer from [0, world_size-1] assigned to each +# GPU, and world_size is the total number of GPUs used. + +def multi_gpu_reconstruct(model, dataset, rank, world_size): + + # All CDTools methods have to be called using 'model.module' + # rather than 'model'. 
# We also need to pass the rank and world_size to Adam_optimize + # as shown below for loss in model.module.Adam_optimize(50, dataset, lr=0.02, @@ -55,22 +49,18 @@ def multi_gpu_reconstruct(model, num_workers=world_size): # We can still perform model.inspect and model.report, but we want - # to only let 1 GPU handle plotting/printing rather than get N copies - # from all N GPUs. - if rank == 0: - print(model.module.report()) + # only 1 GPU handling plotting/printing. + if rank == 0: print(model.module.report()) # We set up the model.inspect this way to only let GPU 0 plot and - # prevent the other GPUs from running far ahead of GPU 0, which + # prevent the other GPUs from running ahead of GPU 0, which # seems to cause bugs (GPU processes dissapear from nvidia-smi) if model.module.epoch % 10 == 0: if rank == 0: model.module.inspect(dataset) - barrier() - + barrier() # Make all GPUs wait until everyone is caught up - # We set up another barrier to make sure all GPUs catch up before - # starting another reconstruction loop + # Make sure all GPUs catch up before starting another reconstruction loop barrier() for loss in model.module.Adam_optimize(50, @@ -79,30 +69,39 @@ def multi_gpu_reconstruct(model, batch_size=50, rank=rank, num_workers=world_size): - if rank == 0: - print(model.module.report()) + if rank == 0: print(model.module.report()) if model.module.epoch % 10 == 0: if rank == 0: model.module.inspect(dataset) barrier() - - # Again, set up another barrier to let all GPUs catch up barrier() - - model.module.tidy_probes() # TODO: Check how the multi-GPU implementation handles tidying probes. - # Only let one GPU handle plotting stuff. + # Get the model back from the distributed processing if rank == 0: + model.module.tidy_probes() model.module.inspect(dataset) model.module.compare(dataset) plt.show() - # This will execute the multi_gpu_reconstruct upon running this file +# Here, we're... 
+# - ...setting up `world_size=4` GPUs to run +# - ...telling CDTools the machine setting up all the connections (called +# the "rank 0 node/machine") is on address `master_addr` +# - ...telling CDTools we have a free port on `master_port` on the machine +# with rank 0. +# - ...going to wait 60 seconds for the GPUs to do something before +# we terminate the reconstruction. If you want to inspect/compare +# the model after reconstruction, consider increasing the timeout. +# +# If you're using a single node (single machine/computer), you can try setting +# master_addr = 'localhost'. if __name__ == '__main__': distributed.spawn(multi_gpu_reconstruct, model=model, dataset=dataset, - world_size = 4) - + world_size = 4, + master_addr='localhost', + master_port='8888', + timeout=60) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 6d96f714..6cbf3a5c 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -11,46 +11,55 @@ import torch.multiprocessing as mp import datetime import os -import functools -__all__ = ['spawn'] +__all__ = ['distributed_wrapper', 'spawn'] -def reconstructor_wrapper(rank, - reconstructor, - model, - dataset, - world_size, - backend, - timeout): - """Wraps functions containing reconstruction loops (i.e., for loss in model.Adam_optimize) - to enable multi-GPU operations to be set up. The wrapped function needs to passed to - torch.multiprocessing.spawn or cdtools.tools.distributed.distributed.spawn +def distributed_wrapper(rank, + func, + model, + dataset, + world_size, + backend='nccl', + timeout=600): + """Wraps functions containing reconstruction loops (i.e., for loss in + model.Adam_optimize) to enable multi-GPU operations to be set up. 
The + wrapped function needs to passed to `torch.multiprocessing.spawn` or + `cdtools.tools.distributed.distributed.spawn` Parameters: rank: int - Rank of the GPU, with value ranging from [0, world_size-1] - reconstructor: - The reconstruction loop function - model: t.nn.Module - The CDIModel - dataset: t.utils.data.Dataset - The CDataset + Rank of the GPU, with value ranging from [0, world_size-1]. This + is defined by the spawning methods and not directly by the user. + func: + Function wrapping user-defined reconstruction loops + model: CDIModel + Model for CDI/ptychography reconstruction + dataset: Ptycho2DDataset + The dataset to reconstruct against world_size: int Number of GPUs to use - master_addr: str - IP address of the machine that will host the process with rank 0 - master_port: str - A free port on the machine that will host the process with rank 0 - nccl_p2p_disable: bool - Disable NCCL peer-2-peer communication + backend: str + Multi-gpu communication backend to use. Default is the 'nccl' backend, + which is the only supported backend for CDTools. + See https://pytorch.org/docs/stable/distributed.html for additional info + about PyTorch-supported backends. + timeout: int + Timeout for operations executed against the process group in seconds. + Default is 10 minutes. After timeout has been reached, all subprocesses + will be aborted and the process calling this method will crash. + + NOTE: While this would have been nice as a decorator function (integrated + with the spawner), this seems to cause problems with mp.spawn, which needs to + pickle the function. 
""" + # Convert timeout from int to datetime + timeout = datetime.timedelta(seconds=timeout) + # Initialize the process group - init_process_group(backend=backend, - rank=rank, - world_size=world_size, - timeout=timeout) + init_process_group(backend=backend, rank=rank, + world_size=world_size, timeout=timeout) # Load the model to the appropriate GPU rank the process is using device = f'cuda:{rank}' @@ -58,71 +67,71 @@ def reconstructor_wrapper(rank, dataset.get_as(device=device) # Wrap the model with DistributedDataParallel - model = DDP(model, - device_ids=[rank], # Tells DDP which GPU the model lives in - output_device=rank, # Tells DDP which GPU to output to - find_unused_parameters=True) # TODO: Understand what this is really doing... + model_DDP = DDP(model, + device_ids=[rank], # Tells DDP which GPU the model lives in + output_device=rank, # Tells DDP which GPU to output to + find_unused_parameters=True) # TODO: Understand what this is really doing... - # Dayne's special sanity check: Don't start reconstructing until all GPUs have synced. - barrier() - - reconstructor(model, dataset, rank, world_size) # Start the reconstruction loop - barrier() # Wait for all GPUs to finish reconstructing - destroy_process_group() # Destroy the process group + # Don't start reconstructing until all GPUs have synced. + barrier() + # Start the reconstruction loop + func(model_DDP, dataset, rank, world_size) + # Wait for all GPUs to finish reconstructing + barrier() + # Destroy process group + destroy_process_group() -def spawn(reconstructor, +def spawn(func, model, dataset, world_size: int, + master_addr: str, + master_port: str, backend: str = 'nccl', - timeout: int = 60, - master_addr: str = 'localhost', - master_port: str = '8888', + timeout: int = 600, nccl_p2p_disable: bool = True): - """Spawns world_size processes that runs a reconstructor loop function. - A wrapper around torch.multiprocessing.spawn. 
+ """Spawns subprocesses on `world_size` GPUs that runs reconstruction + loops wrapped around a function `func`. - It includes the setup of OS environmental variables needed for - initializing the distributed backend + This is a wrapper around `torch.multiprocessing.spawn` which includes + the setup of OS environmental variables needed for initializing the + distributed backend. Parameters: - reconstructor: - The wrapped reconstruction loop + func: + Function wrapping user-defined reconstruction loops + model: CDIModel + Model for CDI/ptychography reconstruction + dataset: Ptycho2DDataset + The dataset to reconstruct against world_size: int Number of GPUs to use master_addr: str IP address of the machine that will host the process with rank 0 master_port: str A free port on the machine that will host the process with rank 0 + backend: str + Multi-gpu communication backend to use. Default is the 'nccl' backend, + which is the only supported backend for CDTools. + See https://pytorch.org/docs/stable/distributed.html for additional info + about PyTorch-supported backends. + timeout: int + Timeout for operations executed against the process group in seconds. + Default is 10 minutes. After timeout has been reached, all subprocesses + will be aborted and the process calling this method will crash. nccl_p2p_disable: bool Disable NCCL peer-2-peer communication """ - # Convert timeout from int to datetime - timeout = datetime.timedelta(seconds=timeout) - # Set up environment variables os.environ['MASTER_ADDR'] = master_addr os.environ['MASTER_PORT'] = master_port os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) # Ensure a "graceful" termination of subprocesses if something goes wrong. - try: - print('\nStarting up multi-GPU reconstructions...') - mp.spawn(reconstructor_wrapper, - args=(reconstructor, - model, - dataset, - world_size, - backend, - timeout), - nprocs=world_size, - join=True) - print('Reconstructions complete. 
Stopping processes...') - - except Exception as e: - # If something breaks, we try to make sure that the - # process group is destroyed before the program fully - # terminates - print(e) - destroy_process_group() + print('\nStarting up multi-GPU reconstructions...') + mp.spawn(distributed_wrapper, + args=(func, model, dataset, world_size, backend, timeout), + nprocs=world_size, + join=True) + print('Reconstructions complete...') From dafddf7d1f80a7f1b98d563017d5d5ce67cc5a21 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 3 Mar 2025 06:39:04 +0000 Subject: [PATCH 021/115] Added description to cdtools.tools.distributed.distributed --- src/cdtools/tools/distributed/distributed.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 6cbf3a5c..36d56634 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -1,7 +1,19 @@ -"""Contains wrapper functions to make reconstruction scripts compatible +"""Contains functions to make reconstruction scripts compatible with multi-GPU distributive approaches in PyTorch. +The functions in this module require parts of the user-written +reconstruction script to be first wrapped in a function (as shown in +examples/fancy_ptycho_multi_gpu_ddp.py). The functions in this module +are designed to wrap around/call these user-defined functions, enabling +reconstructions to be performed across several GPUs. +As of 20250302, the methods here are based on +torch.nn.parallel.DistributedDataParallel, which implements distributed +data parallelism. In this scheme, replicas of the CDI/ptychography model +are given to each device. These devices will synchronize gradients across +each model replica. These methods however do not define how the Dataset is +distributed across each device; this process can be handled by using +DistributedSampler with the DataLoader. 
""" import numpy as np @@ -15,7 +27,6 @@ __all__ = ['distributed_wrapper', 'spawn'] - def distributed_wrapper(rank, func, model, From 5d6739af5e6599e744d62f9ea66787aecafb9753 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 3 Mar 2025 07:51:35 +0000 Subject: [PATCH 022/115] Changed type annotation for multi_gpu_reconstruct --- examples/fancy_ptycho_multi_gpu_ddp_speed_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index df7651e3..dd66e6f2 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -10,6 +10,7 @@ ''' import cdtools +from typing import Tuple from matplotlib import pyplot as plt from torch.nn.parallel import DistributedDataParallel as DDP from torch.distributed import init_process_group, destroy_process_group, barrier @@ -27,7 +28,7 @@ def multi_gpu_reconstruct(rank: int, world_size: int, conn, - schedule=False) -> tuple[np.ndarray, np.ndarray]: + schedule=False) -> Tuple[np.ndarray, np.ndarray]: """Perform the reconstruction using several GPUs If only one GPU is used, we don't bother loading the the process group or doing any of the fancy stuff associated with multi-GPU operation. 
From a5c3095a0ea5fadc620f3154b9a05ed1bfe5d00a Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 04:32:45 +0000 Subject: [PATCH 023/115] CDIModel methods can just use model._ when using cdtools.tools.distributed.distributed --- examples/fancy_ptycho_multi_gpu_ddp.py | 42 ++++++++++---------- src/cdtools/tools/distributed/distributed.py | 5 ++- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 632fd435..025d36c1 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -41,47 +41,47 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): # rather than 'model'. # We also need to pass the rank and world_size to Adam_optimize # as shown below - for loss in model.module.Adam_optimize(50, - dataset, - lr=0.02, - batch_size=10, - rank=rank, - num_workers=world_size): + for loss in model.Adam_optimize(50, + dataset, + lr=0.02, + batch_size=10, + rank=rank, + num_workers=world_size): # We can still perform model.inspect and model.report, but we want # only 1 GPU handling plotting/printing. 
- if rank == 0: print(model.module.report()) + if rank == 0: print(model.report()) # We set up the model.inspect this way to only let GPU 0 plot and # prevent the other GPUs from running ahead of GPU 0, which # seems to cause bugs (GPU processes dissapear from nvidia-smi) - if model.module.epoch % 10 == 0: + if model.epoch % 10 == 0: if rank == 0: - model.module.inspect(dataset) + model.inspect(dataset) barrier() # Make all GPUs wait until everyone is caught up # Make sure all GPUs catch up before starting another reconstruction loop barrier() - for loss in model.module.Adam_optimize(50, - dataset, - lr=0.005, - batch_size=50, - rank=rank, - num_workers=world_size): - if rank == 0: print(model.module.report()) + for loss in model.Adam_optimize(50, + dataset, + lr=0.005, + batch_size=50, + rank=rank, + num_workers=world_size): + if rank == 0: print(model.report()) - if model.module.epoch % 10 == 0: + if model.epoch % 10 == 0: if rank == 0: - model.module.inspect(dataset) + model.inspect(dataset) barrier() barrier() # Get the model back from the distributed processing if rank == 0: - model.module.tidy_probes() - model.module.inspect(dataset) - model.module.compare(dataset) + model.tidy_probes() + model.inspect(dataset) + model.compare(dataset) plt.show() # This will execute the multi_gpu_reconstruct upon running this file diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 36d56634..48f52483 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -85,8 +85,9 @@ def distributed_wrapper(rank, # Don't start reconstructing until all GPUs have synced. 
barrier() - # Start the reconstruction loop - func(model_DDP, dataset, rank, world_size) + # Start the reconstruction loop, but feed in model_DDP.module so we don't + # have to change `model._` to `model.module._` in the CDTools script + func(model_DDP.module, dataset, rank, world_size) # Wait for all GPUs to finish reconstructing barrier() # Destroy process group From 553f70dbd671b5ba70bb6d98a7108852c94291b2 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 04:44:20 +0000 Subject: [PATCH 024/115] Removed DistributedDataParallel from CDIModel --- src/cdtools/models/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 775c7330..137cf10d 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -31,7 +31,6 @@ import torch as t from torch.utils import data as torchdata from torch.utils.data.distributed import DistributedSampler -from torch.nn.parallel import DistributedDataParallel from matplotlib import pyplot as plt from matplotlib.widgets import Slider from matplotlib import ticker @@ -398,7 +397,7 @@ def closure(): exit() # Run the simulation - sim_patterns = self.forward(*inp) + sim_patterns = self.forward(*inp) ## TODO: Do a deep dive plotting-per-iteration of this # Calculate the loss if hasattr(self, 'mask'): From c74aaf4d43ab3665d3cc4336356f67d831a5b51d Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 05:38:43 +0000 Subject: [PATCH 025/115] Models calculated on multiple GPUs automatically perform plotting methods on only one subprocess --- examples/fancy_ptycho_multi_gpu_ddp.py | 46 ++++++++------------ src/cdtools/models/base.py | 14 ++++++ src/cdtools/tools/distributed/distributed.py | 5 +++ 3 files changed, 37 insertions(+), 28 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 025d36c1..b010f248 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ 
b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -5,7 +5,7 @@ from torch.distributed import barrier from cdtools.tools.distributed import distributed -filename = r'example_data/lab_ptycho_data.cxi' +filename = r'/homes/dayne/repositories/cdtools_yoshikisd/cdtools/examples/example_data/lab_ptycho_data.cxi'#'example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) model = cdtools.models.FancyPtycho.from_dataset( @@ -37,33 +37,23 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): - # All CDTools methods have to be called using 'model.module' - # rather than 'model'. - # We also need to pass the rank and world_size to Adam_optimize - # as shown below - for loss in model.Adam_optimize(50, + # We need to pass the rank and world_size to Adam_optimize as shown below + for loss in model.Adam_optimize(100, dataset, lr=0.02, batch_size=10, rank=rank, num_workers=world_size): - # We can still perform model.inspect and model.report, but we want - # only 1 GPU handling plotting/printing. - if rank == 0: print(model.report()) + # We can still perform model.report, but we want only 1 GPU printing stuff. + if rank == 0: + print(model.report()) - # We set up the model.inspect this way to only let GPU 0 plot and - # prevent the other GPUs from running ahead of GPU 0, which - # seems to cause bugs (GPU processes dissapear from nvidia-smi) + # You don't need to add the `if rank == 0` here. 
if model.epoch % 10 == 0: - if rank == 0: - model.inspect(dataset) - barrier() # Make all GPUs wait until everyone is caught up + model.inspect(dataset) - # Make sure all GPUs catch up before starting another reconstruction loop - barrier() - - for loss in model.Adam_optimize(50, + for loss in model.Adam_optimize(100, dataset, lr=0.005, batch_size=50, @@ -74,15 +64,15 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): if model.epoch % 10 == 0: if rank == 0: model.inspect(dataset) - barrier() - barrier() - # Get the model back from the distributed processing - if rank == 0: - model.tidy_probes() - model.inspect(dataset) - model.compare(dataset) - plt.show() + model.tidy_probes() + model.inspect(dataset) + + # You don't need to add the `if rank == 0` here either... + model.compare(dataset) + + # ...but you do have to add it here. + if rank == 0: plt.show() # This will execute the multi_gpu_reconstruct upon running this file # Here, we're... @@ -101,7 +91,7 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): distributed.spawn(multi_gpu_reconstruct, model=model, dataset=dataset, - world_size = 4, + world_size = 8, master_addr='localhost', master_port='8888', timeout=60) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 137cf10d..30116f71 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -62,6 +62,12 @@ def __init__(self): self.training_history = '' self.epoch = 0 + # These properties indicate to the CDIModel methods whether or not + # multiple GPUs will be used. 
The purpose is to allow only 1 GPU to call + # certain methods to prevent the creation of redundant plots/reports/saves + self.rank = None # Rank of the subprocess running the GPU + self.multi_gpu_used = False + def from_dataset(self, dataset): raise NotImplementedError() @@ -781,6 +787,10 @@ def inspect(self, dataset=None, update=True): Whether to update existing plots or plot new ones """ + # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU + if self.multi_gpu_used and self.rank != 0: + return + # We find or create all the figures first_update = False if update and hasattr(self, 'figs') and self.figs: @@ -891,6 +901,10 @@ def compare(self, dataset, logarithmic=False): Whether to plot the diffraction on a logarithmic scale """ + # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU + if self.multi_gpu_used and self.rank != 0: + return + fig, axes = plt.subplots(1,3,figsize=(12,5.3)) fig.tight_layout(rect=[0.02, 0.09, 0.98, 0.96]) axslider = plt.axes([0.15,0.06,0.75,0.03]) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 48f52483..8cd90434 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -77,6 +77,11 @@ def distributed_wrapper(rank, model.to(device=device) dataset.get_as(device=device) + # Update the rank in the model and indicate we're using multiple GPUs + model.rank = rank + if world_size > 1: # In case we need to use 1 GPU for testing + model.multi_gpu_used = True + # Wrap the model with DistributedDataParallel model_DDP = DDP(model, device_ids=[rank], # Tells DDP which GPU the model lives in From 7278a3789e73c00cbf71b41191dd6cbed41524f0 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 05:49:54 +0000 Subject: [PATCH 026/115] model.Adam_optimize no longer need rank or world_size parameters for multi-GPU use --- src/cdtools/models/base.py | 21 ++++++++------------ 
src/cdtools/tools/distributed/distributed.py | 11 +++++----- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 30116f71..22b84b96 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -65,8 +65,9 @@ def __init__(self): # These properties indicate to the CDIModel methods whether or not # multiple GPUs will be used. The purpose is to allow only 1 GPU to call # certain methods to prevent the creation of redundant plots/reports/saves - self.rank = None # Rank of the subprocess running the GPU - self.multi_gpu_used = False + self.rank = None # Rank of the subprocess running the GPU + self.world_size = 1 # Total number of GPUs being used. + self.multi_gpu_used = False # Self explanatory def from_dataset(self, dataset): raise NotImplementedError() @@ -529,9 +530,7 @@ def Adam_optimize( subset=None, regularization_factor=None, thread=True, - calculation_width=10, - num_workers=1, - rank=None + calculation_width=10 ): """Runs a round of reconstruction using the Adam optimizer @@ -561,11 +560,7 @@ def Adam_optimize( thread : bool Default True, whether to run the computation in a separate thread to allow interaction with plots during computation calculation_width : int - Default 10, how many translations to pass through at once for each round of gradient accumulation. Does not affect the result, only the calculation speed - num_workers: int - Default 1, how many GPUs to distribute calculations over - rank: int - Default None, the rank of the GPU to be used when performing multi-gpu operations. Value should be within [0, world_size-1] + Default 10, how many translations to pass through at once for each round of gradient accumulation. 
Does not affect the result, only the calculation speed """ @@ -584,12 +579,12 @@ def Adam_optimize( # Make a dataloader suited for either single-GPU use or cases # where a process group (i.e., multiple GPUs) has been initialized - if num_workers > 1: + if self.multi_gpu_used: # First, create a sampler to load subsets of dataset to the GPUs # TODO: Test out drop_last to see how much that influences reconstructions sampler = DistributedSampler(dataset, - num_replicas=num_workers, - rank=rank, + num_replicas=self.world_size, + rank=self.rank, shuffle=True, drop_last=False) # Now create the dataloader diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 8cd90434..fae29765 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -68,6 +68,12 @@ def distributed_wrapper(rank, # Convert timeout from int to datetime timeout = datetime.timedelta(seconds=timeout) + # Update the rank in the model and indicate we're using multiple GPUs + model.rank = rank + model.world_size = world_size + if world_size > 1: # In case we need to use 1 GPU for testing + model.multi_gpu_used = True + # Initialize the process group init_process_group(backend=backend, rank=rank, world_size=world_size, timeout=timeout) @@ -77,11 +83,6 @@ def distributed_wrapper(rank, model.to(device=device) dataset.get_as(device=device) - # Update the rank in the model and indicate we're using multiple GPUs - model.rank = rank - if world_size > 1: # In case we need to use 1 GPU for testing - model.multi_gpu_used = True - # Wrap the model with DistributedDataParallel model_DDP = DDP(model, device_ids=[rank], # Tells DDP which GPU the model lives in From 72b4d6a80ca607c424353f910b76e4388a50532b Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 06:15:50 +0000 Subject: [PATCH 027/115] Edit file name path for fancy_ptycho_multi_gpu_ddp.py --- examples/fancy_ptycho_multi_gpu_ddp.py | 21 
+++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index b010f248..66252aa9 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -5,7 +5,7 @@ from torch.distributed import barrier from cdtools.tools.distributed import distributed -filename = r'/homes/dayne/repositories/cdtools_yoshikisd/cdtools/examples/example_data/lab_ptycho_data.cxi'#'example_data/lab_ptycho_data.cxi' +filename = r'example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) model = cdtools.models.FancyPtycho.from_dataset( @@ -37,31 +37,20 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): - # We need to pass the rank and world_size to Adam_optimize as shown below - for loss in model.Adam_optimize(100, - dataset, - lr=0.02, - batch_size=10, - rank=rank, - num_workers=world_size): + for loss in model.Adam_optimize(100, dataset, lr=0.02, batch_size=10): # We can still perform model.report, but we want only 1 GPU printing stuff. if rank == 0: print(model.report()) # You don't need to add the `if rank == 0` here. 
- if model.epoch % 10 == 0: + if model.epoch % 20 == 0: model.inspect(dataset) - for loss in model.Adam_optimize(100, - dataset, - lr=0.005, - batch_size=50, - rank=rank, - num_workers=world_size): + for loss in model.Adam_optimize(100, dataset, lr=0.005, batch_size=50): if rank == 0: print(model.report()) - if model.epoch % 10 == 0: + if model.epoch % 20 == 0: if rank == 0: model.inspect(dataset) From 0ac1121542216248abf75a43ab0f8fc6d4153c34 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 06:48:58 +0000 Subject: [PATCH 028/115] Added type hints to distributed.py --- src/cdtools/tools/distributed/distributed.py | 34 ++++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index fae29765..a191a1c9 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -21,19 +21,23 @@ from torch.distributed import init_process_group, destroy_process_group, barrier from torch.nn.parallel import DistributedDataParallel as DDP import torch.multiprocessing as mp +from cdtools.models import CDIModel +from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset import datetime import os +from typing import Callable __all__ = ['distributed_wrapper', 'spawn'] -def distributed_wrapper(rank, - func, - model, - dataset, - world_size, - backend='nccl', - timeout=600): +def distributed_wrapper(rank: int, + func: Callable[[CDIModel, Ptycho2DDataset, int, int]], + model: CDIModel, + dataset: Ptycho2DDataset, + world_size: int, + backend: str = 'nccl', + timeout: int = 600, + pipe=None): """Wraps functions containing reconstruction loops (i.e., for loss in model.Adam_optimize) to enable multi-GPU operations to be set up. The wrapped function needs to passed to `torch.multiprocessing.spawn` or @@ -43,8 +47,9 @@ def distributed_wrapper(rank, rank: int Rank of the GPU, with value ranging from [0, world_size-1]. 
This is defined by the spawning methods and not directly by the user. - func: - Function wrapping user-defined reconstruction loops + func: Callable[[CDIModel, Ptycho2DDataset, int, int]] + Function wrapping user-defined reconstruction loops. The function must + have the following format: func(model, dataset, rank, world_size). model: CDIModel Model for CDI/ptychography reconstruction dataset: Ptycho2DDataset @@ -100,9 +105,9 @@ def distributed_wrapper(rank, destroy_process_group() -def spawn(func, - model, - dataset, +def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int]], + model: CDIModel, + dataset: Ptycho2DDataset, world_size: int, master_addr: str, master_port: str, @@ -117,8 +122,9 @@ def spawn(func, distributed backend. Parameters: - func: - Function wrapping user-defined reconstruction loops + func: Callable[[CDIModel, Ptycho2DDataset, int, int]] + Function wrapping user-defined reconstruction loops. The function must + have the following format: func(model, dataset, rank, world_size). 
model: CDIModel Model for CDI/ptychography reconstruction dataset: Ptycho2DDataset From 57dca982bf79d2879fdccc983daa887f0b46e499 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 17:55:48 +0000 Subject: [PATCH 029/115] cdtools.tools.distributed.distributed methods can take Connection objects as arguments --- src/cdtools/tools/distributed/distributed.py | 39 ++++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index a191a1c9..5fffa387 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -21,6 +21,7 @@ from torch.distributed import init_process_group, destroy_process_group, barrier from torch.nn.parallel import DistributedDataParallel as DDP import torch.multiprocessing as mp +from multiprocessing.connection import Connection from cdtools.models import CDIModel from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset import datetime @@ -31,15 +32,15 @@ def distributed_wrapper(rank: int, - func: Callable[[CDIModel, Ptycho2DDataset, int, int]], + func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], model: CDIModel, dataset: Ptycho2DDataset, world_size: int, backend: str = 'nccl', timeout: int = 600, - pipe=None): - """Wraps functions containing reconstruction loops (i.e., for loss in - model.Adam_optimize) to enable multi-GPU operations to be set up. The + pipe: Connection = None): + """Wraps functions containing reconstruction loops (i.e., `for loss in + model.Adam_optimize`) to enable multi-GPU operations to be set up. The wrapped function needs to passed to `torch.multiprocessing.spawn` or `cdtools.tools.distributed.distributed.spawn` @@ -65,10 +66,12 @@ def distributed_wrapper(rank: int, Timeout for operations executed against the process group in seconds. Default is 10 minutes. 
After timeout has been reached, all subprocesses will be aborted and the process calling this method will crash. - - NOTE: While this would have been nice as a decorator function (integrated - with the spawner), this seems to cause problems with mp.spawn, which needs to - pickle the function. + pipe: Connection + A Connection object representing one end of a communication pipe. This + parameter is needed if you're trying to get some values back from the + wrapped function. + BUG: Passing a CDIModel through connection generated with mp.Pipe or + query will cause the connection to hang. """ # Convert timeout from int to datetime timeout = datetime.timedelta(seconds=timeout) @@ -98,14 +101,19 @@ def distributed_wrapper(rank: int, barrier() # Start the reconstruction loop, but feed in model_DDP.module so we don't # have to change `model._` to `model.module._` in the CDTools script - func(model_DDP.module, dataset, rank, world_size) + # We also need to check if we want to pass a pipe to the function + if pipe is None: + func(model_DDP.module, dataset, rank, world_size) + else: + func(model_DDP.module, dataset, rank, world_size, pipe) + # Wait for all GPUs to finish reconstructing barrier() # Destroy process group destroy_process_group() -def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int]], +def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], model: CDIModel, dataset: Ptycho2DDataset, world_size: int, @@ -113,7 +121,8 @@ def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int]], master_port: str, backend: str = 'nccl', timeout: int = 600, - nccl_p2p_disable: bool = True): + nccl_p2p_disable: bool = True, + pipe: Connection = None): """Spawns subprocesses on `world_size` GPUs that runs reconstruction loops wrapped around a function `func`. @@ -146,6 +155,12 @@ def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int]], will be aborted and the process calling this method will crash. 
nccl_p2p_disable: bool Disable NCCL peer-2-peer communication + pipe: Connection + A Connection object representing one end of a communication pipe. This + parameter is needed if you're trying to get some values back from the + wrapped function. + BUG: Passing a CDIModel through connection generated with mp.Pipe or + query will cause the connection to hang. """ # Set up environment variables os.environ['MASTER_ADDR'] = master_addr @@ -155,7 +170,7 @@ def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int]], # Ensure a "graceful" termination of subprocesses if something goes wrong. print('\nStarting up multi-GPU reconstructions...') mp.spawn(distributed_wrapper, - args=(func, model, dataset, world_size, backend, timeout), + args=(func, model, dataset, world_size, backend, timeout, pipe), nprocs=world_size, join=True) print('Reconstructions complete...') From e6e8130208d630885084174604cab33919f797a7 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 17:56:20 +0000 Subject: [PATCH 030/115] Updated the multi-gpu speed test --- .../fancy_ptycho_multi_gpu_ddp_speed_test.py | 155 +++++++----------- 1 file changed, 62 insertions(+), 93 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index dd66e6f2..be1c98ce 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -10,25 +10,45 @@ ''' import cdtools +from cdtools.models import CDIModel +from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset +from cdtools.tools.distributed import distributed +from multiprocessing.connection import Connection from typing import Tuple from matplotlib import pyplot as plt -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.distributed import init_process_group, destroy_process_group, barrier +from torch.distributed import destroy_process_group import torch.multiprocessing as mp import os import datetime 
import time import numpy as np +from copy import deepcopy TIMEOUT = datetime.timedelta(seconds=10) # Auto-terminate if things hang BACKEND = 'nccl' +# Load the dataset +filename = r'example_data/lab_ptycho_data.cxi' +dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + +# Create the model +model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + oversampling=2, + probe_support_radius=120, + propagation_distance=5e-3, + units='mm', + obj_view_crop=-50, +) # Multi-GPU supported reconstruction -def multi_gpu_reconstruct(rank: int, - world_size: int, - conn, - schedule=False) -> Tuple[np.ndarray, np.ndarray]: +def reconstruct(model: CDIModel, + dataset: Ptycho2DDataset, + rank: int, + world_size: int, + conn: Connection = None, + schedule: bool = False) -> Tuple[np.ndarray, np.ndarray]: """Perform the reconstruction using several GPUs If only one GPU is used, we don't bother loading the the process group or doing any of the fancy stuff associated with multi-GPU operation. @@ -55,112 +75,52 @@ def multi_gpu_reconstruct(rank: int, # Start counting time t_start = time.time() - # Load the dataset - filename = r'example_data/lab_ptycho_data.cxi' - dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) - - if world_size > 1: - # We need to initialize the distributed process group - # before calling any other method for multi-GPU usage - init_process_group(backend=BACKEND, - rank=rank, - world_size=world_size, - timeout=TIMEOUT) - - # Create the model - model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, - oversampling=2, - probe_support_radius=120, - propagation_distance=5e-3, - units='mm', - obj_view_crop=-50, - ) - - # Assign devices - device = f'cuda:{rank}' - model.to(device=device) - dataset.get_as(device=device) + if world_size == 1: + device = 'cuda' + model.to(device=device) + dataset.get_as(device=device) - # Perform reconstructions on either single or multi-GPU workflows. 
- if world_size > 1: - # For multi-GPU workflows, we have to use this mess. - model = DDP(model, - device_ids=[rank], # Tells DDP which GPU the model lives in - output_device=rank, # Tells DDP which GPU to output to - find_unused_parameters=True) # TODO: Understand what this is really doing... - barrier() - - for loss in model.module.Adam_optimize(50, - dataset, - lr=0.02, - batch_size=10, - rank=rank, - num_workers=world_size, - schedule=schedule): - if rank == 0: - print(model.module.report()) - t_list.append(time.time() - t_start) - barrier() - - for loss in model.module.Adam_optimize(50, - dataset, - lr=0.005, - batch_size=50, - rank=rank, - num_workers=world_size, - schedule=schedule): - if rank == 0: - print(model.module.report()) - t_list.append(time.time() - t_start) - # Again, set up another barrier to let all GPUs catch up - barrier() - # Always destroy the process group when you're done - destroy_process_group() - # We need to send the time_history and loss_history through - # the child connection to the parent (sitting in the name-main block) + # Perform reconstructions on either single or multi-GPU workflows. 
+ for loss in model.Adam_optimize(25, dataset, lr=0.02, batch_size=10, schedule=schedule): if rank == 0: - loss_history = np.array(model.module.loss_history) - time_history = np.array(t_list) - conn.send((time_history, loss_history)) - - else: - # For single-GPU workloads, we use the vanilla-way of performing - # reconstructions in CDTools - for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=10, schedule=schedule): print(model.report()) t_list.append(time.time() - t_start) - for loss in model.Adam_optimize(50, dataset, lr=0.005, batch_size=50, schedule=schedule): + + for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=50, schedule=schedule): + if rank == 0: print(model.report()) t_list.append(time.time() - t_start) + # We need to send the time_history and loss_history through + # the child connection to the parent (sitting in the name-main block) + if rank == 0: loss_history = np.array(model.loss_history) time_history = np.array(t_list) - # Return the measured time and loss history - return time_history, loss_history + + if conn is not None: + conn.send((time_history, loss_history)) + + # Return the measured time and loss history if we're on a single GPU + if world_size == 1: return time_history, loss_history + # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': - # We need to add some stuff to the enviromnent - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '8888' # You can use any open port number - os.environ['NCCL_P2P_DISABLE'] = '1' - + # Set up a parent/child connection to get some info from the GPU-accelerated # function parent_conn, child_conn = mp.Pipe() # Define the number of GPUs to use. 
- world_sizes = [2, 1] + world_sizes = [8, 4, 2, 1] # Define if we want to use the scheduler or not schedule=True # Define how many iterations we want to perform of the reconstructions # for statistics - runs = 2 + runs = 5 # Write a try/except statement to help the subprocesses (and GPUs) # terminate gracefully. Otherwise, you may have stuff loaded on @@ -173,17 +133,26 @@ def multi_gpu_reconstruct(rank: int, loss_hist_list = [] for i in range(runs): + print(f'Resetting the model...') print(f'Starting run {i+1}/{runs} on {world_size} GPU(s)') + model_copy = deepcopy(model) if world_size == 1: - final_time, loss_history = multi_gpu_reconstruct(0, world_size,schedule) + final_time, loss_history = reconstruct(model=model_copy, + dataset=dataset, + rank=0, + world_size=1) time_list.append(final_time) loss_hist_list.append(loss_history) else: # Spawn the processes - mp.spawn(multi_gpu_reconstruct, - args=(world_size, child_conn, schedule), - nprocs=world_size, - join=True) + distributed.spawn(reconstruct, + model=model_copy, + dataset=dataset, + world_size=world_size, + master_addr = 'localhost', + master_port = '8888', + timeout=300, + pipe=child_conn) while parent_conn.poll(): final_time, loss_history = parent_conn.recv() time_list.append(final_time) From 7514ca6e9a03a2dd5f5175e2c23e41d6c15b67f8 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 18:03:32 +0000 Subject: [PATCH 031/115] Updated function description and removed os dependency for the multi-gpu speed test --- .../fancy_ptycho_multi_gpu_ddp_speed_test.py | 113 +++++++++--------- 1 file changed, 54 insertions(+), 59 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index be1c98ce..6c272741 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -18,7 +18,6 @@ from matplotlib import pyplot as plt from torch.distributed import destroy_process_group import 
torch.multiprocessing as mp -import os import datetime import time import numpy as np @@ -54,13 +53,19 @@ def reconstruct(model: CDIModel, or doing any of the fancy stuff associated with multi-GPU operation. Parameters: + model: CDIModel + Model for CDI/ptychography reconstruction + dataset: Ptycho2DDataset + The dataset to reconstruct against rank: int The rank of the GPU to be used. Value should be within [0, world_size-1] world_size: int The total number of GPUs to use - conn: mp.Pipe - Connection to parent + conn: Connection + A Connection object representing one end of a communication pipe. This + parameter is needed if you're trying to get some values back from the + wrapped function. schedule: bool Toggles the use of the scheduler @@ -102,16 +107,12 @@ def reconstruct(model: CDIModel, conn.send((time_history, loss_history)) # Return the measured time and loss history if we're on a single GPU - if world_size == 1: return time_history, loss_history + if world_size == 1: + return time_history, loss_history # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': - - # Set up a parent/child connection to get some info from the GPU-accelerated - # function - parent_conn, child_conn = mp.Pipe() - # Define the number of GPUs to use. world_sizes = [8, 4, 2, 1] @@ -122,65 +123,59 @@ def reconstruct(model: CDIModel, # for statistics runs = 5 + # Set up a parent/child connection to get some info from the GPU-accelerated function + parent_conn, child_conn = mp.Pipe() + # Write a try/except statement to help the subprocesses (and GPUs) # terminate gracefully. Otherwise, you may have stuff loaded on # several GPU even after terminating. 
- try: - for world_size in world_sizes: - print(f'Number of GPU(s): {world_size}') - # Make a list to store the values - time_list = [] - loss_hist_list = [] - - for i in range(runs): - print(f'Resetting the model...') - print(f'Starting run {i+1}/{runs} on {world_size} GPU(s)') - model_copy = deepcopy(model) - if world_size == 1: - final_time, loss_history = reconstruct(model=model_copy, - dataset=dataset, - rank=0, - world_size=1) + for world_size in world_sizes: + print(f'Number of GPU(s): {world_size}') + # Make a list to store the values + time_list = [] + loss_hist_list = [] + + for i in range(runs): + print(f'Resetting the model...') + print(f'Starting run {i+1}/{runs} on {world_size} GPU(s)') + model_copy = deepcopy(model) + if world_size == 1: + final_time, loss_history = reconstruct(model=model_copy, + dataset=dataset, + rank=0, + world_size=1) + time_list.append(final_time) + loss_hist_list.append(loss_history) + else: + # Spawn the processes + distributed.spawn(reconstruct, + model=model_copy, + dataset=dataset, + world_size=world_size, + master_addr = 'localhost', + master_port = '8888', + timeout=300, + pipe=child_conn) + while parent_conn.poll(): + final_time, loss_history = parent_conn.recv() time_list.append(final_time) loss_hist_list.append(loss_history) - else: - # Spawn the processes - distributed.spawn(reconstruct, - model=model_copy, - dataset=dataset, - world_size=world_size, - master_addr = 'localhost', - master_port = '8888', - timeout=300, - pipe=child_conn) - while parent_conn.poll(): - final_time, loss_history = parent_conn.recv() - time_list.append(final_time) - loss_hist_list.append(loss_history) - - # Calculate the statistics - time_mean = np.array(time_list).mean(axis=0)/60 - time_std = np.array(time_list).std(axis=0)/60 - loss_mean = np.array(loss_hist_list).mean(axis=0) - loss_std = np.array(loss_hist_list).std(axis=0) - - # Plot - plt.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, - label=f'{world_size} GPUs') - 
plt.yscale('log') - plt.xscale('linear') + + # Calculate the statistics + time_mean = np.array(time_list).mean(axis=0)/60 + time_std = np.array(time_list).std(axis=0)/60 + loss_mean = np.array(loss_hist_list).mean(axis=0) + loss_std = np.array(loss_hist_list).std(axis=0) + + # Plot + plt.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, + label=f'{world_size} GPUs') + plt.yscale('log') + plt.xscale('linear') plt.legend() plt.xlabel('Time (min)') plt.ylabel('Loss') plt.show() - - - except KeyboardInterrupt as e: - # If something breaks, we try to make sure that the - # process group is destroyed before the program fully - # terminates - print('Hang on a sec...') - destroy_process_group() From 6e4145f87677d6631ef540b12bff5bb9c581c6b8 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 18:31:20 +0000 Subject: [PATCH 032/115] Fixed dataset path for multi gpu speed test --- examples/fancy_ptycho_multi_gpu_ddp_speed_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index 6c272741..8d660f70 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -27,7 +27,7 @@ BACKEND = 'nccl' # Load the dataset -filename = r'example_data/lab_ptycho_data.cxi' +filename = r'examples/example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) # Create the model From 436696f185491955d4243d267d15c4cc5830094a Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 8 Mar 2025 20:41:12 +0000 Subject: [PATCH 033/115] Moved data_loader.sampler.set_epoch() to the inside of run_epoch statement --- src/cdtools/models/base.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 22b84b96..ceab2c06 100644 --- a/src/cdtools/models/base.py +++ 
b/src/cdtools/models/base.py @@ -365,12 +365,14 @@ def AD_optimize(self, iterations, data_loader, optimizer,\ The summed loss over the latest epoch, divided by the total diffraction pattern intensity """ - # Check if multi-GPU operations are being conducted (i.e., - # a process group is initialized) - is_multi_GPU = t.distributed.is_initialized() - def run_epoch(stop_event=None): """Runs one full epoch of the reconstruction.""" + # If we're using DistributedSampler (likely the case if you're using + # multiple GPUs), we need to tell it which epoch we're on. Otherwise + # data shuffling will not work properly + if self.multi_gpu_used: + data_loader.sampler.set_epoch(self.epoch) + # First, initialize some tracking variables normalization = 0 loss = 0 @@ -427,7 +429,7 @@ def closure(): # This takes the step for this minibatch loss += optimizer.step(closure).detach().cpu().numpy() - + t.distributed.barrier() # Add this for checkpoint debugging loss /= normalization @@ -455,11 +457,7 @@ def closure(): else: yield float('nan') continue - - # If we're using DistributedSampler (likely the case if - # you're using multiple GPUs), we need to tell it - # which epoch we're on before running an epoch - if is_multi_GPU: data_loader.sampler.set_epoch(self.epoch) + yield run_epoch() @@ -485,11 +483,6 @@ def target(): else: yield float('nan') continue - - # If we're using DistributedSampler, (likely the case if - # you're using multiple GPUs), we need to tell it which - # epoch we're on before running an epoch - if is_multi_GPU: data_loader.sampler.set_epoch(self.epoch) calc = threading.Thread(target=target, name='calculator', daemon=True) try: @@ -589,10 +582,9 @@ def Adam_optimize( drop_last=False) # Now create the dataloader data_loader = torchdata.DataLoader(dataset, - batch_size=batch_size, # TODO: Recalculate the batch_size for multi-GPU operation - shuffle=False, # Shuffling is now handled by sampler - num_workers=0, # I'm not 100% sure what this does, but apparently making 
this >0 can cause bugs - drop_last=False, # TODO: Test out how this influences reconstructions + batch_size=batch_size, + num_workers=0, # Creating extra threads in children processes may cause problems. Leave this at 0. + drop_last=False, pin_memory=False,# I'm not 100% sure what this does, but apparently making this True can cause bugs sampler=sampler) else: From 47839159394b631639217a9c3e7961a3a0653fba Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 10 Mar 2025 05:19:22 +0000 Subject: [PATCH 034/115] Removed scheduler from the multi gpu speed test script --- examples/fancy_ptycho_multi_gpu_ddp_speed_test.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index 8d660f70..975f12e7 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -87,12 +87,12 @@ def reconstruct(model: CDIModel, # Perform reconstructions on either single or multi-GPU workflows. - for loss in model.Adam_optimize(25, dataset, lr=0.02, batch_size=10, schedule=schedule): + for loss in model.Adam_optimize(200, dataset, lr=0.02, batch_size=10): if rank == 0: print(model.report()) t_list.append(time.time() - t_start) - for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=50, schedule=schedule): + for loss in model.Adam_optimize(200, dataset, lr=0.005, batch_size=50): if rank == 0: print(model.report()) t_list.append(time.time() - t_start) @@ -116,19 +116,13 @@ def reconstruct(model: CDIModel, # Define the number of GPUs to use. 
world_sizes = [8, 4, 2, 1] - # Define if we want to use the scheduler or not - schedule=True - - # Define how many iterations we want to perform of the reconstructions - # for statistics + # How many reconstruction runs to perform for statistics runs = 5 # Set up a parent/child connection to get some info from the GPU-accelerated function parent_conn, child_conn = mp.Pipe() - # Write a try/except statement to help the subprocesses (and GPUs) - # terminate gracefully. Otherwise, you may have stuff loaded on - # several GPU even after terminating. + # Execute for world_size in world_sizes: print(f'Number of GPU(s): {world_size}') # Make a list to store the values From 77eb21adc30d0f41209a765d691cdebd9811f19e Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 10 Mar 2025 17:50:01 +0000 Subject: [PATCH 035/115] Cleaned up the multi gpu speed test --- .../fancy_ptycho_multi_gpu_ddp_speed_test.py | 63 +++++++++++-------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index 975f12e7..3f05fc1c 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -16,18 +16,13 @@ from multiprocessing.connection import Connection from typing import Tuple from matplotlib import pyplot as plt -from torch.distributed import destroy_process_group import torch.multiprocessing as mp -import datetime import time import numpy as np from copy import deepcopy -TIMEOUT = datetime.timedelta(seconds=10) # Auto-terminate if things hang -BACKEND = 'nccl' - # Load the dataset -filename = r'examples/example_data/lab_ptycho_data.cxi' +filename = r'example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) # Create the model @@ -87,12 +82,17 @@ def reconstruct(model: CDIModel, # Perform reconstructions on either single or multi-GPU workflows. 
- for loss in model.Adam_optimize(200, dataset, lr=0.02, batch_size=10): + for loss in model.Adam_optimize(100, dataset, lr=0.02, batch_size=10): + if rank == 0: + print(model.report()) + t_list.append(time.time() - t_start) + + for loss in model.Adam_optimize(100, dataset, lr=0.005, batch_size=50): if rank == 0: print(model.report()) t_list.append(time.time() - t_start) - for loss in model.Adam_optimize(200, dataset, lr=0.005, batch_size=50): + for loss in model.Adam_optimize(100, dataset, lr=0.001, batch_size=50): if rank == 0: print(model.report()) t_list.append(time.time() - t_start) @@ -111,18 +111,13 @@ def reconstruct(model: CDIModel, return time_history, loss_history -# This will execute the multi_gpu_reconstruct upon running this file -if __name__ == '__main__': - # Define the number of GPUs to use. - world_sizes = [8, 4, 2, 1] - - # How many reconstruction runs to perform for statistics - runs = 5 - +def run_test(world_size, runs): # Set up a parent/child connection to get some info from the GPU-accelerated function parent_conn, child_conn = mp.Pipe() - + # Execute + # Plot + fig, (ax1,ax2) = plt.subplots(1,2) for world_size in world_sizes: print(f'Number of GPU(s): {world_size}') # Make a list to store the values @@ -155,21 +150,37 @@ def reconstruct(model: CDIModel, time_list.append(final_time) loss_hist_list.append(loss_history) - # Calculate the statistics time_mean = np.array(time_list).mean(axis=0)/60 time_std = np.array(time_list).std(axis=0)/60 loss_mean = np.array(loss_hist_list).mean(axis=0) loss_std = np.array(loss_hist_list).std(axis=0) - # Plot - plt.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, + + ax1.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, label=f'{world_size} GPUs') - plt.yscale('log') - plt.xscale('linear') + ax2.plot(loss_mean, label=f'{world_size} GPUs') - plt.legend() - plt.xlabel('Time (min)') - plt.ylabel('Loss') - plt.show() + + ax1.set_yscale('log') + ax1.set_xscale('linear') + 
ax2.set_yscale('log') + ax2.set_xscale('linear') + ax1.legend() + ax2.legend() + ax1.set_xlabel('Time (min)') + ax1.set_ylabel('Loss') + ax2.set_xlabel('Epochs') + plt.show() + +# This will execute the multi_gpu_reconstruct upon running this file +if __name__ == '__main__': + # Define the number of GPUs to use. + world_sizes = [8, 4] + + # How many reconstruction runs to perform for statistics + runs = 1 + + run_test(world_sizes, runs) + From 3e1bb176655a53bdf68677e0cda0237c2e67ac10 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 10 Mar 2025 17:54:30 +0000 Subject: [PATCH 036/115] Fixed the path in the multi GPU speed test (again) --- examples/fancy_ptycho_multi_gpu_ddp_speed_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index 3f05fc1c..07019389 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -22,7 +22,7 @@ from copy import deepcopy # Load the dataset -filename = r'example_data/lab_ptycho_data.cxi' +filename = r'examples/example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) # Create the model From 5b11dc50ec5dad31bbc6a4cd5f9662b42414cf08 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 11 Mar 2025 20:21:08 +0000 Subject: [PATCH 037/115] Cleaned up fancy_ptycho_multi_gpu_ddp.py --- examples/fancy_ptycho_multi_gpu_ddp.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 66252aa9..904020e1 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -37,7 +37,7 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): - for loss in model.Adam_optimize(100, dataset, lr=0.02, batch_size=10): + for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=10): # 
We can still perform model.report, but we want only 1 GPU printing stuff. if rank == 0: @@ -47,8 +47,9 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): if model.epoch % 20 == 0: model.inspect(dataset) - for loss in model.Adam_optimize(100, dataset, lr=0.005, batch_size=50): - if rank == 0: print(model.report()) + for loss in model.Adam_optimize(50, dataset, lr=0.005, batch_size=50): + if rank == 0: + print(model.report()) if model.epoch % 20 == 0: if rank == 0: @@ -80,7 +81,7 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): distributed.spawn(multi_gpu_reconstruct, model=model, dataset=dataset, - world_size = 8, + world_size = 4, master_addr='localhost', master_port='8888', - timeout=60) + timeout=600) From 00e04132e55ac3333a6f041c8abe6a6d8628b5b7 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 11 Mar 2025 20:22:14 +0000 Subject: [PATCH 038/115] Removed barrier statement from CDIModel --- src/cdtools/models/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index ceab2c06..8f3219ae 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -429,7 +429,6 @@ def closure(): # This takes the step for this minibatch loss += optimizer.step(closure).detach().cpu().numpy() - t.distributed.barrier() # Add this for checkpoint debugging loss /= normalization From dd07c4b9488683d0f2cc24b58cc418e5ac12f8e8 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Wed, 12 Mar 2025 23:26:28 +0000 Subject: [PATCH 039/115] Fix for unintended GPU usage; added ability to define which GPUs to use for multi-GPU reconstruction --- examples/fancy_ptycho_multi_gpu_ddp.py | 6 +- .../fancy_ptycho_multi_gpu_ddp_speed_test.py | 61 +++++++++++++------ src/cdtools/models/base.py | 1 + src/cdtools/tools/distributed/distributed.py | 37 ++++++----- 4 files changed, 66 insertions(+), 39 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 
904020e1..6f7a0212 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -5,7 +5,7 @@ from torch.distributed import barrier from cdtools.tools.distributed import distributed -filename = r'example_data/lab_ptycho_data.cxi' +filename = r'examples/example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) model = cdtools.models.FancyPtycho.from_dataset( @@ -81,7 +81,7 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): distributed.spawn(multi_gpu_reconstruct, model=model, dataset=dataset, - world_size = 4, + device_ids = [1,3,6,7], master_addr='localhost', master_port='8888', - timeout=600) + timeout=6000) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index 07019389..ee0dfdb9 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -40,7 +40,7 @@ def reconstruct(model: CDIModel, dataset: Ptycho2DDataset, rank: int, - world_size: int, + device_ids: list[int], conn: Connection = None, schedule: bool = False) -> Tuple[np.ndarray, np.ndarray]: """Perform the reconstruction using several GPUs @@ -54,9 +54,9 @@ def reconstruct(model: CDIModel, The dataset to reconstruct against rank: int The rank of the GPU to be used. Value should be within - [0, world_size-1] - world_size: int - The total number of GPUs to use + [0, #_of_GPUs_used-1] + device_ids: list[int] + List of GPU IDs to use conn: Connection A Connection object representing one end of a communication pipe. 
This parameter is needed if you're trying to get some values back from the @@ -75,12 +75,13 @@ def reconstruct(model: CDIModel, # Start counting time t_start = time.time() - if world_size == 1: - device = 'cuda' + # Check if we're using only a single GPU + if not model.multi_gpu_used: + # Use the 1st GPU in device_ids + device = f'cuda:{device_ids[0]}' model.to(device=device) dataset.get_as(device=device) - # Perform reconstructions on either single or multi-GPU workflows. for loss in model.Adam_optimize(100, dataset, lr=0.02, batch_size=10): if rank == 0: @@ -107,19 +108,34 @@ def reconstruct(model: CDIModel, conn.send((time_history, loss_history)) # Return the measured time and loss history if we're on a single GPU - if world_size == 1: + if not model.multi_gpu_used: return time_history, loss_history -def run_test(world_size, runs): +def run_test(world_sizes: int, + device_ids: int, + runs: int): + """Runs a series of reconstructions (defined in the local function + `reconstruct`) using several GPUs and several trials per GPU count. + + Parameters: + world_sizes: list[int] + Number of GPUs to use. User can specify several GPU counts in a list. 
+ device_ids: list[int] or int + List of the GPU ID numbers to use for the study + runs: int + How many repeat reconstructions to perform + """ # Set up a parent/child connection to get some info from the GPU-accelerated function parent_conn, child_conn = mp.Pipe() # Execute - # Plot fig, (ax1,ax2) = plt.subplots(1,2) for world_size in world_sizes: - print(f'Number of GPU(s): {world_size}') + # Get the GPU IDs to use + dev_id = device_ids[0:world_size] + print(f'\nNumber of GPU(s): {world_size} | Using GPU IDs {*dev_id,}') + # Make a list to store the values time_list = [] loss_hist_list = [] @@ -132,7 +148,7 @@ def run_test(world_size, runs): final_time, loss_history = reconstruct(model=model_copy, dataset=dataset, rank=0, - world_size=1) + device_ids=dev_id) time_list.append(final_time) loss_hist_list.append(loss_history) else: @@ -140,7 +156,7 @@ def run_test(world_size, runs): distributed.spawn(reconstruct, model=model_copy, dataset=dataset, - world_size=world_size, + device_ids = dev_id, master_addr = 'localhost', master_port = '8888', timeout=300, @@ -156,12 +172,14 @@ def run_test(world_size, runs): loss_mean = np.array(loss_hist_list).mean(axis=0) loss_std = np.array(loss_hist_list).std(axis=0) - + # Add another plot ax1.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, label=f'{world_size} GPUs') - ax2.plot(loss_mean, label=f'{world_size} GPUs') + ax2.errorbar(np.arange(0,loss_mean.shape[0]), loss_mean, yerr=loss_std, + label=f'{world_size} GPUs') - + # Plot + fig.suptitle(f'Multi-GPU performance test | {runs} runs performed') ax1.set_yscale('log') ax1.set_xscale('linear') ax2.set_yscale('log') @@ -176,11 +194,14 @@ def run_test(world_size, runs): # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': # Define the number of GPUs to use. 
- world_sizes = [8, 4] + world_sizes = [1, 2, 4] + + # Define which GPU IDs to use + device_ids = [7, 6, 5, 4] # How many reconstruction runs to perform for statistics runs = 1 - run_test(world_sizes, runs) - - + # Run the test + run_test(world_sizes, device_ids, runs) + \ No newline at end of file diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 8f3219ae..2c9a73ec 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -66,6 +66,7 @@ def __init__(self): # multiple GPUs will be used. The purpose is to allow only 1 GPU to call # certain methods to prevent the creation of redundant plots/reports/saves self.rank = None # Rank of the subprocess running the GPU + self.device_id = None # ID of the GPU being used in multi-GPU self.world_size = 1 # Total number of GPUs being used. self.multi_gpu_used = False # Self explanatory diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 5fffa387..0476ab59 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -35,7 +35,7 @@ def distributed_wrapper(rank: int, func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], model: CDIModel, dataset: Ptycho2DDataset, - world_size: int, + device_ids: list[int], backend: str = 'nccl', timeout: int = 600, pipe: Connection = None): @@ -55,8 +55,8 @@ def distributed_wrapper(rank: int, Model for CDI/ptychography reconstruction dataset: Ptycho2DDataset The dataset to reconstruct against - world_size: int - Number of GPUs to use + device_ids: list[int] + List of GPU IDs to use backend: str Multi-gpu communication backend to use. Default is the 'nccl' backend, which is the only supported backend for CDTools. 
@@ -78,23 +78,28 @@ def distributed_wrapper(rank: int, # Update the rank in the model and indicate we're using multiple GPUs model.rank = rank - model.world_size = world_size - if world_size > 1: # In case we need to use 1 GPU for testing + model.device_id = device_ids[model.rank] + model.world_size = len(device_ids) + + # Allow the process to only see the GPU is has been assigned + os.environ['CUDA_VISIBLE_DEVICES'] = str(model.device_id) + + if model.world_size > 1: # In case we need to use 1 GPU for testing model.multi_gpu_used = True # Initialize the process group init_process_group(backend=backend, rank=rank, - world_size=world_size, timeout=timeout) + world_size=model.world_size, timeout=timeout) # Load the model to the appropriate GPU rank the process is using - device = f'cuda:{rank}' + device='cuda' model.to(device=device) dataset.get_as(device=device) # Wrap the model with DistributedDataParallel model_DDP = DDP(model, - device_ids=[rank], # Tells DDP which GPU the model lives in - output_device=rank, # Tells DDP which GPU to output to + device_ids=[model.device_id], # Tells DDP which GPU the model lives in + output_device=model.device_id, # Tells DDP which GPU to output to find_unused_parameters=True) # TODO: Understand what this is really doing... # Don't start reconstructing until all GPUs have synced. 
@@ -103,9 +108,9 @@ def distributed_wrapper(rank: int, # have to change `model._` to `model.module._` in the CDTools script # We also need to check if we want to pass a pipe to the function if pipe is None: - func(model_DDP.module, dataset, rank, world_size) + func(model_DDP.module, dataset, rank, model.world_size) else: - func(model_DDP.module, dataset, rank, world_size, pipe) + func(model_DDP.module, dataset, rank, model.world_size, pipe) # Wait for all GPUs to finish reconstructing barrier() @@ -116,7 +121,7 @@ def distributed_wrapper(rank: int, def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], model: CDIModel, dataset: Ptycho2DDataset, - world_size: int, + device_ids: list[int], master_addr: str, master_port: str, backend: str = 'nccl', @@ -138,8 +143,8 @@ def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], Model for CDI/ptychography reconstruction dataset: Ptycho2DDataset The dataset to reconstruct against - world_size: int - Number of GPUs to use + device_ids: list[int] + List of GPU IDs to use master_addr: str IP address of the machine that will host the process with rank 0 master_port: str @@ -170,7 +175,7 @@ def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], # Ensure a "graceful" termination of subprocesses if something goes wrong. print('\nStarting up multi-GPU reconstructions...') mp.spawn(distributed_wrapper, - args=(func, model, dataset, world_size, backend, timeout, pipe), - nprocs=world_size, + args=(func, model, dataset, device_ids, backend, timeout, pipe), + nprocs=len(device_ids), join=True) print('Reconstructions complete...') From 464ed5a5a7972c047a7d9228dd37464c63bdc38e Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 13 Mar 2025 00:18:14 +0000 Subject: [PATCH 040/115] Fixed discrepancy between user-specified batch_size and effective batch_size in multi-GPU reconstructions. 
--- src/cdtools/models/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 2c9a73ec..a27f5e68 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -574,7 +574,6 @@ def Adam_optimize( # where a process group (i.e., multiple GPUs) has been initialized if self.multi_gpu_used: # First, create a sampler to load subsets of dataset to the GPUs - # TODO: Test out drop_last to see how much that influences reconstructions sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, @@ -582,7 +581,7 @@ def Adam_optimize( drop_last=False) # Now create the dataloader data_loader = torchdata.DataLoader(dataset, - batch_size=batch_size, + batch_size=batch_size//self.world_size, num_workers=0, # Creating extra threads in children processes may cause problems. Leave this at 0. drop_last=False, pin_memory=False,# I'm not 100% sure what this does, but apparently making this True can cause bugs From 8f1ee066c56f71cbe723d7760bbad825009d3d3b Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 13 Mar 2025 00:29:03 +0000 Subject: [PATCH 041/115] Fixed list type hint error for Python 3.8 --- examples/fancy_ptycho_multi_gpu_ddp_speed_test.py | 6 +++--- src/cdtools/tools/distributed/distributed.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py index ee0dfdb9..a4a357ec 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py @@ -14,7 +14,7 @@ from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.tools.distributed import distributed from multiprocessing.connection import Connection -from typing import Tuple +from typing import Tuple, List from matplotlib import pyplot as plt import torch.multiprocessing as mp import time @@ -40,7 +40,7 @@ def 
reconstruct(model: CDIModel, dataset: Ptycho2DDataset, rank: int, - device_ids: list[int], + device_ids: List[int], conn: Connection = None, schedule: bool = False) -> Tuple[np.ndarray, np.ndarray]: """Perform the reconstruction using several GPUs @@ -200,7 +200,7 @@ def run_test(world_sizes: int, device_ids = [7, 6, 5, 4] # How many reconstruction runs to perform for statistics - runs = 1 + runs = 3 # Run the test run_test(world_sizes, device_ids, runs) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 0476ab59..21804998 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -26,7 +26,7 @@ from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset import datetime import os -from typing import Callable +from typing import Callable, List __all__ = ['distributed_wrapper', 'spawn'] @@ -35,7 +35,7 @@ def distributed_wrapper(rank: int, func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], model: CDIModel, dataset: Ptycho2DDataset, - device_ids: list[int], + device_ids: List[int], backend: str = 'nccl', timeout: int = 600, pipe: Connection = None): @@ -121,7 +121,7 @@ def distributed_wrapper(rank: int, def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], model: CDIModel, dataset: Ptycho2DDataset, - device_ids: list[int], + device_ids: List[int], master_addr: str, master_port: str, backend: str = 'nccl', From c29d28cf8692f9eac91a5ba344a53c5148d399c1 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 28 Apr 2025 17:44:25 +0000 Subject: [PATCH 042/115] Switched DDP with all_reduce implementation for distributive computing --- src/cdtools/models/base.py | 27 +++++++++++++++++++- src/cdtools/tools/distributed/distributed.py | 16 +++--------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index a27f5e68..7aaa8ea2 100644 --- a/src/cdtools/models/base.py +++ 
b/src/cdtools/models/base.py @@ -31,6 +31,7 @@ import torch as t from torch.utils import data as torchdata from torch.utils.data.distributed import DistributedSampler +import torch.distributed as dist from matplotlib import pyplot as plt from matplotlib.widgets import Slider from matplotlib import ticker @@ -417,7 +418,22 @@ def closure(): # And accumulate the gradients loss.backward() - total_loss += loss.detach() + + # For multi-GPU optimization, we need to average and + # sync the gradients + losses across all participating + # GPUs with an all-reduce call. + if self.multi_gpu_used: + for param in self.parameters(): + if param.requires_grad: + dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) + param.grad.data /= self.world_size + + # Sum the loss value across all devices for reporting + dist.all_reduce(loss, op=dist.ReduceOp.SUM) + + # Normalize the accumulating total loss by the number of GPUs used + total_loss += loss.detach() // self.world_size + # If we have a regularizer, we can calculate it separately, # and the gradients will add to the minibatch gradient @@ -425,6 +441,15 @@ def closure(): and hasattr(self, 'regularizer'): loss = self.regularizer(regularization_factor) loss.backward() + + # For multi-GPU optimization, we need to average and + # sync the gradients + losses across all participating + # GPUs with an all-reduce call. 
+ if self.multi_gpu_used: + for param in self.parameters(): + if param.requires_grad: + dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) + param.grad.data /= self.world_size return total_loss diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 21804998..191ffab2 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -95,25 +95,15 @@ def distributed_wrapper(rank: int, device='cuda' model.to(device=device) dataset.get_as(device=device) - - # Wrap the model with DistributedDataParallel - model_DDP = DDP(model, - device_ids=[model.device_id], # Tells DDP which GPU the model lives in - output_device=model.device_id, # Tells DDP which GPU to output to - find_unused_parameters=True) # TODO: Understand what this is really doing... - # Don't start reconstructing until all GPUs have synced. - barrier() # Start the reconstruction loop, but feed in model_DDP.module so we don't # have to change `model._` to `model.module._` in the CDTools script # We also need to check if we want to pass a pipe to the function if pipe is None: - func(model_DDP.module, dataset, rank, model.world_size) + func(model, dataset, rank, model.world_size) else: - func(model_DDP.module, dataset, rank, model.world_size, pipe) - - # Wait for all GPUs to finish reconstructing - barrier() + func(model, dataset, rank, model.world_size, pipe) + # Destroy process group destroy_process_group() From 10e4d747d2b331ac8f0db0cb291799c5ecb777cb Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 28 Apr 2025 17:51:12 +0000 Subject: [PATCH 043/115] Changed name of multi-gpu speed test. Added gold balls example to the speed test. 
Added speed-up versus GPU count plot --- ...peed_test.py => distributed_speed_test.py} | 125 +++++++++++++----- 1 file changed, 93 insertions(+), 32 deletions(-) rename examples/{fancy_ptycho_multi_gpu_ddp_speed_test.py => distributed_speed_test.py} (64%) diff --git a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py b/examples/distributed_speed_test.py similarity index 64% rename from examples/fancy_ptycho_multi_gpu_ddp_speed_test.py rename to examples/distributed_speed_test.py index a4a357ec..67b2539f 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp_speed_test.py +++ b/examples/distributed_speed_test.py @@ -13,28 +13,19 @@ from cdtools.models import CDIModel from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.tools.distributed import distributed +import torch.multiprocessing as mp +import torch as t from multiprocessing.connection import Connection from typing import Tuple, List from matplotlib import pyplot as plt -import torch.multiprocessing as mp import time import numpy as np from copy import deepcopy -# Load the dataset -filename = r'examples/example_data/lab_ptycho_data.cxi' -dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) - -# Create the model -model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, - oversampling=2, - probe_support_radius=120, - propagation_distance=5e-3, - units='mm', - obj_view_crop=-50, -) +#The test to run: +# 'fancy_ptycho' - Runs reconstruction parameters from examples/fancy_ptycho.py +# 'gold_balls' - Runs reconstruction parameters from examples/gold_ball_ptycho.py +TEST = 'fancy_ptycho' # Multi-GPU supported reconstruction def reconstruct(model: CDIModel, @@ -82,21 +73,39 @@ def reconstruct(model: CDIModel, model.to(device=device) dataset.get_as(device=device) + # Perform reconstructions on either single or multi-GPU workflows. 
- for loss in model.Adam_optimize(100, dataset, lr=0.02, batch_size=10): - if rank == 0: - print(model.report()) - t_list.append(time.time() - t_start) + if TEST == 'fancy_ptycho': + for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=40): + if rank == 0: + print(model.report()) + t_list.append(time.time() - t_start) + + for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=40): + if rank == 0: + print(model.report()) + t_list.append(time.time() - t_start) - for loss in model.Adam_optimize(100, dataset, lr=0.005, batch_size=50): - if rank == 0: - print(model.report()) - t_list.append(time.time() - t_start) + for loss in model.Adam_optimize(25, dataset, lr=0.001, batch_size=40): + if rank == 0: + print(model.report()) + t_list.append(time.time() - t_start) - for loss in model.Adam_optimize(100, dataset, lr=0.001, batch_size=50): - if rank == 0: - print(model.report()) - t_list.append(time.time() - t_start) + elif TEST == 'gold_balls': + for loss in model.Adam_optimize(20, dataset, lr=0.005, batch_size=50): + if rank == 0: + print(model.report()) + t_list.append(time.time() - t_start) + + for loss in model.Adam_optimize(50, dataset, lr=0.002, batch_size=100): + if rank == 0: + print(model.report()) + t_list.append(time.time() - t_start) + + for loss in model.Adam_optimize(100, dataset, lr=0.001, batch_size=100): + if rank == 0: + print(model.report()) + t_list.append(time.time() - t_start) # We need to send the time_history and loss_history through # the child connection to the parent (sitting in the name-main block) @@ -126,11 +135,47 @@ def run_test(world_sizes: int, runs: int How many repeat reconstructions to perform """ + # Load the dataset and model + if TEST == 'fancy_ptycho': + filename = 'cdtools/examples/example_data/lab_ptycho_data.cxi' + elif TEST == 'gold_balls': + filename = 'cdtools/examples/example_data/AuBalls_700ms_30nmStep_3_6SS_filter.cxi' + + dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + + if TEST == 
'fancy_ptycho': + model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + oversampling=2, + probe_support_radius=120, + propagation_distance=5e-3, + units='mm', + obj_view_crop=-50) + + elif TEST == 'gold_balls': + pad = 10 + dataset.pad(pad) + model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + probe_support_radius=50, + propagation_distance=2e-6, + units='um', + probe_fourier_crop=pad) + model.translation_offsets.data += 0.7 * t.randn_like(model.translation_offsets) + model.weights.requires_grad = False + # Set up a parent/child connection to get some info from the GPU-accelerated function parent_conn, child_conn = mp.Pipe() - # Execute - fig, (ax1,ax2) = plt.subplots(1,2) + # Set stuff up for plots + fig, (ax1,ax2,ax3) = plt.subplots(1,3) + + # Store the value of the single GPU time + time_1gpu = 0 + std_1gpu = 0 + for world_size in world_sizes: # Get the GPU IDs to use dev_id = device_ids[0:world_size] @@ -160,7 +205,8 @@ def run_test(world_sizes: int, master_addr = 'localhost', master_port = '8888', timeout=300, - pipe=child_conn) + pipe=child_conn, + nccl_p2p_disable=True) while parent_conn.poll(): final_time, loss_history = parent_conn.recv() time_list.append(final_time) @@ -172,11 +218,22 @@ def run_test(world_sizes: int, loss_mean = np.array(loss_hist_list).mean(axis=0) loss_std = np.array(loss_hist_list).std(axis=0) + # If a single GPU is used, store the time + if world_size == 1: + time_1gpu = time_mean[-1] + std_1gpu = time_std[-1] + + # Calculate the speed-up relative to using a single GPU + speed_up_mean = time_1gpu / time_mean[-1] + speed_up_std = speed_up_mean * \ + np.sqrt((std_1gpu/time_1gpu)**2 + (time_std[-1]/time_mean[-1])**2) + # Add another plot ax1.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, label=f'{world_size} GPUs') ax2.errorbar(np.arange(0,loss_mean.shape[0]), loss_mean, yerr=loss_std, label=f'{world_size} GPUs') + ax3.errorbar(world_size, speed_up_mean, yerr=speed_up_std, 
fmt='o') # Plot fig.suptitle(f'Multi-GPU performance test | {runs} runs performed') @@ -184,20 +241,24 @@ def run_test(world_sizes: int, ax1.set_xscale('linear') ax2.set_yscale('log') ax2.set_xscale('linear') + ax3.set_yscale('linear') + ax3.set_xscale('linear') ax1.legend() ax2.legend() ax1.set_xlabel('Time (min)') ax1.set_ylabel('Loss') ax2.set_xlabel('Epochs') + ax3.set_xlabel('Number of GPUs') + ax3.set_ylabel('Speed-up relative to single GPU') plt.show() # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': # Define the number of GPUs to use. - world_sizes = [1, 2, 4] + world_sizes = [1, 2, 4, 6] # Define which GPU IDs to use - device_ids = [7, 6, 5, 4] + device_ids = [7, 6, 5, 4, 3, 2, 1] # How many reconstruction runs to perform for statistics runs = 3 From 7763520fe4673b2b0eaefd5d585413199d043871 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 9 May 2025 21:57:34 +0000 Subject: [PATCH 044/115] Created Reconstructor class to enable separation of optimization loops from CDIModel --- examples/fancy_ptycho_optimizer.py | 49 +++++ src/cdtools/optimizer/__init__.py | 18 ++ src/cdtools/optimizer/adam.py | 142 ++++++++++++++ src/cdtools/optimizer/base.py | 305 +++++++++++++++++++++++++++++ 4 files changed, 514 insertions(+) create mode 100644 examples/fancy_ptycho_optimizer.py create mode 100644 src/cdtools/optimizer/__init__.py create mode 100644 src/cdtools/optimizer/adam.py create mode 100644 src/cdtools/optimizer/base.py diff --git a/examples/fancy_ptycho_optimizer.py b/examples/fancy_ptycho_optimizer.py new file mode 100644 index 00000000..3011a53b --- /dev/null +++ b/examples/fancy_ptycho_optimizer.py @@ -0,0 +1,49 @@ +import cdtools +import cdtools.optimizer +from matplotlib import pyplot as plt + +filename = 'examples/example_data/lab_ptycho_data.cxi' +dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + +# FancyPtycho is the workhorse model +model = cdtools.models.FancyPtycho.from_dataset( + dataset, 
+ n_modes=3, # Use 3 incoherently mixing probe modes + oversampling=2, # Simulate the probe on a 2xlarger real-space array + probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix + propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm + units='mm', # Set the units for the live plots + obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix +) + +device = 'cuda' +model.to(device=device) +dataset.get_as(device=device) + +# An Adam Reconstructor object is created to perform Adam +# optimization on the FancyPtycho model and dataset +recon = cdtools.optimizer.Adam(model, dataset) + +# The learning rate parameter sets the alpha for Adam. +# The beta parameters are (0.9, 0.999) by default +# The batch size sets the minibatch size +for loss in recon.optimize(50, lr=0.02, batch_size=10): + print(model.report()) + # Plotting is expensive, so we only do it every tenth epoch + if model.epoch % 10 == 0: + model.inspect(dataset) + +# It's common to chain several different reconstruction loops. 
Here, we +# started with an aggressive refinement to find the probe, and now we +# polish the reconstruction with a lower learning rate and larger minibatch +for loss in recon.optimize(50, lr=0.005, batch_size=50): + print(model.report()) + if model.epoch % 10 == 0: + model.inspect(dataset) + +# This orthogonalizes the recovered probe modes +model.tidy_probes() + +model.inspect(dataset) +model.compare(dataset) +plt.show() diff --git a/src/cdtools/optimizer/__init__.py b/src/cdtools/optimizer/__init__.py new file mode 100644 index 00000000..4777830a --- /dev/null +++ b/src/cdtools/optimizer/__init__.py @@ -0,0 +1,18 @@ +"""This module contains optimizers for performing reconstructions + +""" + +# We define __all__ to be sure that import * only imports what we want +__all__ = [ + 'CDIModel', + 'SimplePtycho', + 'FancyPtycho', + 'Bragg2DPtycho', + 'Multislice2DPtycho', + 'MultislicePtycho', + 'RPI', +] + +from cdtools.optimizer.base import Reconstructor +from cdtools.optimizer.adam import Adam + diff --git a/src/cdtools/optimizer/adam.py b/src/cdtools/optimizer/adam.py new file mode 100644 index 00000000..d0046bf6 --- /dev/null +++ b/src/cdtools/optimizer/adam.py @@ -0,0 +1,142 @@ +"""Adam optimizer class + +""" +import torch as t +from torch.utils import data as torchdata +from torch.utils.data.distributed import DistributedSampler +import torch.distributed as dist +from matplotlib import pyplot as plt +from matplotlib.widgets import Slider +from matplotlib import ticker +import numpy as np +import threading +import queue +import time +from scipy import io +from contextlib import contextmanager +from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch +from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset +from cdtools.models import CDIModel +from typing import Tuple, List +from cdtools.optimizer import Reconstructor + +__all__ = ['Adam'] + +class Adam(Reconstructor): + def __init__(self, + model: 
CDIModel, + dataset: Ptycho2DDataset, + schedule: bool = False, + subset: List[int] = None, + thread: bool = True): + + super().__init__(model, + dataset, + subset, + thread) + + # Define the optimizer + self.optimizer = t.optim.Adam(self.model.parameters()) + + + def setup_dataloader(self, + batch_size): + # Make a dataloader suited for either single-GPU use or cases + # where a process group (i.e., multiple GPUs) has been initialized + if self.multi_gpu_used: + # First, create a sampler to load subsets of dataset to the GPUs + self.sampler = DistributedSampler(self.dataset, + num_replicas=self.world_size, + rank=self.rank, + shuffle=True, + drop_last=False) + # Now create the dataloader + self.data_loader = torchdata.DataLoader(self.dataset, + batch_size=batch_size//self.world_size, + num_workers=0, # Creating extra threads in children processes may cause problems. Leave this at 0. + drop_last=False, + pin_memory=False,# I'm not 100% sure what this does, but apparently making this True can cause bugs + sampler=self.sampler) + else: + self.data_loader = torchdata.DataLoader(self.dataset, + batch_size=batch_size, + shuffle=True) + # Store the optimizer parameters + #self.hyperparameters['lr'] = lr + #self.hyperparameters['betas'] = betas + #self.hyperparameters['amsgrad'] = amsgrad + + def adjust_optimizer(self, + lr=0.005, + betas=(0.9, 0.999), + amsgrad=False): + + for param_group in self.optimizer.param_groups: + param_group['lr'] = lr + param_group['betas'] = betas + param_group['amsgrad'] = amsgrad + + + def optimize(self, + iterations, + batch_size=15, + lr=0.005, + betas=(0.9, 0.999), + schedule=False, + amsgrad=False, + subset=None, + regularization_factor = None, + thread=True, + calculation_width=10): + """Runs a round of reconstruction using the Adam optimizer + + Formerly CDIModel.Adam_optimize + + This calls the base Reconstructor.optimize method + (formerly CDIModel.AD_optimize) to run a round of reconstruction + + NOTE: A decision should be made 
regarding whether self.optimize + should only be allowed to adjust reconstruction + hyperparameters rather than initialize them + """ + self.model.training_history += ( + f'Planning {iterations} epochs of Adam, with a learning rate = ' + f'{lr}, batch size = {batch_size}, regularization_factor = ' + f'{regularization_factor}, and schedule = {schedule}.\n' + ) + + ############################################################# + # The subset statement is contained in Reconstructor.__init__ + ############################################################# + + ############################################################# + # The dataloader step is handled by self.dataloader + # TODO: Figure out a way to adjust the batch_size without + # creating a brand-spanking-new one each time + ############################################################# + self.setup_dataloader(batch_size) + + ############################################################# + # The optimizer is created in self.__init__, but the + # hyperparameters need to be set up with self.adjust_optimizer + ############################################################# + self.adjust_optimizer(lr, + betas, + amsgrad) + + ############################################################# + # Define the scheduler + # NOTE: We may want to define this in __init__ and simply + # + if schedule: + self.scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, + factor=0.2, + threshold=1e-9) + else: + self.scheduler = None + + + return super(Adam, self).optimize(iterations, + regularization_factor, + thread, + calculation_width) \ No newline at end of file diff --git a/src/cdtools/optimizer/base.py b/src/cdtools/optimizer/base.py new file mode 100644 index 00000000..e8116e6e --- /dev/null +++ b/src/cdtools/optimizer/base.py @@ -0,0 +1,305 @@ +"""This module contains the base Reconstructor class for performing +optimization ('reconstructions') on ptychographic/CDI models. 
+
+The Reconstructor class is designed to resemble so-called
+'Trainer' classes that (in the language of the AI/ML folks)
+handle the 'training' of a model given some dataset and optimizer.
+
+The subclasses of the Reconstructor class are required to implement
+their own data loaders and optimizer adjusters.
+"""
+
+import torch as t
+from torch.utils import data as torchdata
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn.parallel import DistributedDataParallel as DDP
+import torch.distributed as dist
+import threading
+import queue
+import time
+from contextlib import contextmanager
+from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch
+from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset
+from cdtools.models import CDIModel
+import torch.distributed as dist
+from typing import Tuple, List
+
+__all__ = ['Reconstructor']
+
+class Reconstructor:
+    def __init__(self,
+                 model: CDIModel,
+                 dataset: Ptycho2DDataset,
+                 subset: List[int] = None,
+                 thread: bool = True):
+
+        # Store parameters as attributes
+        self.multi_gpu_used = model.multi_gpu_used
+        self.world_size = model.world_size
+        self.rank = model.rank
+        self.subset = subset
+        self.thread = thread
+
+        # Initialize some attributes that must be defined by other methods
+        self.optimizer = None
+        self.scheduler = None
+        self.data_loader = None
+        #self.epoch = model.epoch
+
+        # Store either the original or DDP-wrapped model, along with
+        # references to model attributes/methods
+        """ For now, don't do DDP-wrapping; check if porting the bits
+            and pieces from CDIModel works before doing DDP.
+ if self.multi_gpu_used: + self.model = DDP(model, + device_ids=[0]) # This is used if CUDA_VISIBLE_DEVICES is manually set + store_references(self.model.module) + + else: + """ + self.model = model + #store_model_attributes(self.model) + + # Store the dataset + if subset is not None: + # if subset is just one pattern, turn into a list for convenience + if type(subset) == type(1): + subset = [subset] + dataset = torchdata.Subset(dataset, subset) + + self.dataset = dataset + + + def setup_dataloader(self, **kwargs): + """Sets up the dataloader + + The dataloader needs to be defined manually for each subclass. + While each subclass will likely use similar calls to + """ + raise NotImplementedError() + + + def adjust_optimizer(self, **kwargs): + """This is to allow us to set up parameters for whatever optimizer we're + interested in using. + + The different optimization schemes (Adam, LBFGS, SGD) seem to take in + different hyperparameters. This function is intended to modify the + parameters + + This is not defined here. For each optimizer, the keyword + arguments should be manually defined as parameters + """ + raise NotImplementedError() + + + def optimize(self, + iterations, + regularization_factor=None, + thread=True, + calculation_width=10): + """Runs a round of reconstruction using the provided optimizer + + Formerly CDIModel.AD_optimize + + This is the basic automatic differentiation reconstruction tool + which all the other, algorithm-specific tools, use. It is a + generator which yields the average loss each epoch, ending after + the specified number of iterations. + + By default, the computation will be run in a separate thread. This + is done to enable live plotting with matplotlib during a reconstruction. + If the computation was done in the main thread, this would freeze + the plots. This behavior can be turned off by setting the keyword + argument 'thread' to False. 
+ + Parameters + ---------- + iterations : int + How many epochs of the algorithm to run + data_loader : torch.utils.data.DataLoader + A data loader loading the CDataset to reconstruct + optimizer : torch.optim.Optimizer + The optimizer to run the reconstruction with + scheduler : torch.optim.lr_scheduler._LRScheduler + Optional, a learning rate scheduler to use + regularization_factor : float or list(float) + Optional, if the model has a regularizer defined, the set of parameters to pass the regularizer method + thread : bool + Default True, whether to run the computation in a separate thread to allow interaction with plots during computation + calculation_width : int + Default 10, how many translations to pass through at once for each round of gradient accumulation. This does not affect the result, but may affect the calculation speed. + + Yields + ------ + loss : float + The summed loss over the latest epoch, divided by the total diffraction pattern intensity + """ + + def run_epoch(stop_event=None): + """Runs one full epoch of the reconstruction.""" + # If we're using DistributedSampler (likely the case if you're using + # multiple GPUs), we need to tell it which epoch we're on. Otherwise + # data shuffling will not work properly + if self.multi_gpu_used: + self.data_loader.sampler.set_epoch(self.model.epoch) + + # First, initialize some tracking variables + normalization = 0 + loss = 0 + N = 0 + t0 = time.time() + + # The data loader is responsible for setting the minibatch + # size, so each set is a minibatch + for inputs, patterns in self.data_loader: + normalization += t.sum(patterns).cpu().numpy() + N += 1 + def closure(): + self.optimizer.zero_grad() + + # We further break up the minibatch into a set of chunks. 
+ # This lets us use larger minibatches than can fit + # on the GPU at once, while still doing batch processing + # for efficiency + input_chunks = [[inp[i:i + calculation_width] + for inp in inputs] + for i in range(0, len(inputs[0]), + calculation_width)] + pattern_chunks = [patterns[i:i + calculation_width] + for i in range(0, len(inputs[0]), + calculation_width)] + + total_loss = 0 + for inp, pats in zip(input_chunks, pattern_chunks): + # This check allows for graceful exit when threading + if stop_event is not None and stop_event.is_set(): + exit() + + # Run the simulation + sim_patterns = self.model.forward(*inp) + + # Calculate the loss + if hasattr(self, 'mask'): + loss = self.model.loss(pats,sim_patterns, mask=self.model.mask) + else: + loss = self.model.loss(pats,sim_patterns) + + # And accumulate the gradients + loss.backward() + + # For multi-GPU optimization, we need to average and + # sync the gradients + losses across all participating + # GPUs with an all-reduce call. + if self.multi_gpu_used: + for param in self.model.parameters(): + if param.requires_grad: + dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) + param.grad.data /= self.model.world_size + + # Sum the loss value across all devices for reporting + dist.all_reduce(loss, op=dist.ReduceOp.SUM) + + # Normalize the accumulating total loss by the number of GPUs used + total_loss += loss.detach() // self.model.world_size + + + # If we have a regularizer, we can calculate it separately, + # and the gradients will add to the minibatch gradient + if regularization_factor is not None \ + and hasattr(self.model, 'regularizer'): + loss = self.model.regularizer(regularization_factor) + loss.backward() + + # For multi-GPU optimization, we need to average and + # sync the gradients + losses across all participating + # GPUs with an all-reduce call. 
+ if self.multi_gpu_used: + for param in self.model.parameters(): + if param.requires_grad: + dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) + param.grad.data /= self.model.world_size + + return total_loss + + # This takes the step for this minibatch + loss += self.optimizer.step(closure).detach().cpu().numpy() + + loss /= normalization + + # We step the scheduler after the full epoch + if self.scheduler is not None: + self.scheduler.step(loss) + + self.model.loss_history.append(loss) + self.model.epoch = len(self.model.loss_history) + self.model.latest_iteration_time = time.time() - t0 + self.model.training_history += self.model.report() + '\n' + return loss + + # We store the current optimizer as a model parameter so that + # it can be saved and loaded for checkpointing + self.current_optimizer = self.optimizer + + # If we don't want to run in a different thread, this is easy + if not thread: + for it in range(iterations): + if self.model.skip_computation(): + self.epoch = self.epoch + 1 + if len(self.model.loss_history) >= 1: + yield self.model.loss_history[-1] + else: + yield float('nan') + continue + + yield run_epoch() + + + # But if we do want to thread, it's annoying: + else: + # Here we set up the communication with the computation thread + result_queue = queue.Queue() + stop_event = threading.Event() + def target(): + try: + result_queue.put(run_epoch(stop_event)) + except Exception as e: + # If something bad happens, put the exception into the + # result queue + result_queue.put(e) + + # And this actually starts and monitors the thread + for it in range(iterations): + if self.model.skip_computation(): + self.model.epoch = self.model.epoch + 1 + if len(self.model.loss_history) >= 1: + yield self.model.loss_history[-1] + else: + yield float('nan') + continue + + calc = threading.Thread(target=target, name='calculator', daemon=True) + try: + calc.start() + while calc.is_alive(): + if hasattr(self.model, 'figs'): + 
self.model.figs[0].canvas.start_event_loop(0.01) + else: + calc.join() + + except KeyboardInterrupt as e: + stop_event.set() + print('\nAsking execution thread to stop cleanly - please be patient.') + calc.join() + raise e + + res = result_queue.get() + + # If something went wrong in the thead, we'll get an exception + if isinstance(res, Exception): + raise res + + yield res + + # And finally, we unset the current optimizer: + self.current_optimizer = None \ No newline at end of file From c610e875e8eebddcb4e788bb393b9cd60aff37b4 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 9 May 2025 22:02:21 +0000 Subject: [PATCH 045/115] Added comment to adam.py to highlight similarities to CDIModel.AD_optimize --- src/cdtools/optimizer/adam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cdtools/optimizer/adam.py b/src/cdtools/optimizer/adam.py index d0046bf6..89d620b1 100644 --- a/src/cdtools/optimizer/adam.py +++ b/src/cdtools/optimizer/adam.py @@ -135,7 +135,7 @@ def optimize(self, else: self.scheduler = None - + # This is analagous to making a call to CDIModel.AD_optimize return super(Adam, self).optimize(iterations, regularization_factor, thread, From 3650f3b38fc26beab7c44beb918bacba023ccab8 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 10 May 2025 02:12:18 +0000 Subject: [PATCH 046/115] Created a script to compare performance of the old and new methods for running reconstructions --- examples/fancy_ptycho_comparison.py | 111 ++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 examples/fancy_ptycho_comparison.py diff --git a/examples/fancy_ptycho_comparison.py b/examples/fancy_ptycho_comparison.py new file mode 100644 index 00000000..c8d4b71a --- /dev/null +++ b/examples/fancy_ptycho_comparison.py @@ -0,0 +1,111 @@ +"""This script runs reconstructions using both the old +method of cdtools reconstruction (model.Adam_optimize) +and the new method based on the creation of a Reconstructor +class + +""" + + 
+import cdtools +import cdtools.optimizer +import torch as t +import numpy as np +import time +import copy +from matplotlib import pyplot as plt + +t.manual_seed(0) + +filename = 'examples/example_data/AuBalls_700ms_30nmStep_3_6SS_filter.cxi' +dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + +# Create a dict to store loss values +losses = {} + +pad = 10 +dataset.pad(pad) +model_original = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + probe_support_radius=50, + propagation_distance=2e-6, + units='um', + probe_fourier_crop=pad) +model_original.translation_offsets.data += 0.7 * t.randn_like(model_original.translation_offsets) +model_original.weights.requires_grad = False + +def reload_model(): + return copy.deepcopy(model_original) + + +# For running the optimizer class +numiter = 5 + +# Set stuff up for plots +fig, (ax1,ax2) = plt.subplots(1,2) + +for option in ('old_method', 'optimizer'): + time_list = [] + loss_hist_list = [] + + # Iterate n-number of times for statistics + for i in range(numiter): + t.cuda.empty_cache() + model = reload_model() + device = 'cuda' + model.to(device=device) + dataset.get_as(device=device) + # Construct a local time list + local_time_list = [] + t_start = time.time() + + def report_n_record(): + print(model.report()) + local_time_list.append(time.time() - t_start) + + if option == 'optimizer': + recon = cdtools.optimizer.Adam(model, dataset) + for loss in recon.optimize(20, lr=0.005, batch_size=50): + report_n_record() + for loss in recon.optimize(20, lr=0.002, batch_size=100): + report_n_record() + for loss in recon.optimize(20, lr=0.001, batch_size=100): + report_n_record() + + elif option == 'old_method': + for loss in model.Adam_optimize(20, dataset, lr=0.005, batch_size=50): + report_n_record() + for loss in model.Adam_optimize(20, dataset, lr=0.002, batch_size=100): + report_n_record() + for loss in model.Adam_optimize(20, dataset, lr=0.001, batch_size=100): + report_n_record() + + # After 
reconstructing, store the loss history and time values + loss_hist_list.append(model.loss_history) + time_list.append(local_time_list) + + # After testing either the new or old method, calculate the statistics and plot + time_mean = np.array(time_list).mean(axis=0)/60 + time_std = np.array(time_list).std(axis=0)/60 + loss_mean = np.array(loss_hist_list).mean(axis=0) + loss_std = np.array(loss_hist_list).std(axis=0) + + ax1.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, + label=option) + ax2.errorbar(np.arange(0,loss_mean.shape[0]), loss_mean, yerr=loss_std, + label=option) + +# Plot +fig.suptitle(f'Comparing old and new optimization refactor | {numiter} runs performed') +ax1.set_yscale('log') +ax1.set_xscale('linear') +ax2.set_yscale('log') +ax2.set_xscale('linear') +ax1.legend() +ax2.legend() +ax1.set_xlabel('Time (min)') +ax1.set_ylabel('Loss') +ax2.set_xlabel('Epochs') +plt.show() + + From 30d17b350d7b3ffcbf8d99ec45c203f315f0a854 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 10 May 2025 02:21:59 +0000 Subject: [PATCH 047/115] MultiGPU fancy_ptycho example now works with the Reconstructor class --- examples/fancy_ptycho_multi_gpu_ddp.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu_ddp.py index 6f7a0212..21ad2c89 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu_ddp.py @@ -1,4 +1,5 @@ import cdtools +import cdtools.optimizer from matplotlib import pyplot as plt # We need to import 2 additional functions @@ -16,7 +17,7 @@ propagation_distance=5e-3, units='mm', obj_view_crop=-50) - +model.background.requires_grad=True # Remove or comment out lines moving the dataset and model to GPU. # This process will be handled by the cdtools.tools.distributed methods. @@ -36,7 +37,7 @@ # GPU, and world_size is the total number of GPUs used. 
def multi_gpu_reconstruct(model, dataset, rank, world_size): - + """ for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=10): # We can still perform model.report, but we want only 1 GPU printing stuff. @@ -54,8 +55,17 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): if model.epoch % 20 == 0: if rank == 0: model.inspect(dataset) + """ + recon = cdtools.optimizer.Adam(model,dataset) + if rank == 0: + model.inspect(dataset) - model.tidy_probes() + for loss in recon.optimize(50, lr=0.02, batch_size=50): + if rank == 0: print(model.report()) + # Plotting is expensive, so we only do it every tenth epoch + if model.epoch % 10 == 0 and rank == 0: + model.inspect(dataset) + #model.tidy_probes() model.inspect(dataset) # You don't need to add the `if rank == 0` here either... @@ -81,7 +91,7 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): distributed.spawn(multi_gpu_reconstruct, model=model, dataset=dataset, - device_ids = [1,3,6,7], + device_ids = [0,1,2,3], master_addr='localhost', master_port='8888', - timeout=6000) + timeout=30) From 30c1f77f644b333d85d2257083213a9a6da2733b Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 10 May 2025 02:49:20 +0000 Subject: [PATCH 048/115] Distributed speed test now works with the Reconstructor class --- examples/distributed_speed_test.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/examples/distributed_speed_test.py b/examples/distributed_speed_test.py index 67b2539f..ccba2843 100644 --- a/examples/distributed_speed_test.py +++ b/examples/distributed_speed_test.py @@ -11,6 +11,7 @@ import cdtools from cdtools.models import CDIModel +import cdtools.optimizer from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.tools.distributed import distributed import torch.multiprocessing as mp @@ -73,36 +74,38 @@ def reconstruct(model: CDIModel, model.to(device=device) dataset.get_as(device=device) + # Set up the Reconstructor with the Adam 
optimizer + recon = cdtools.optimizer.Adam(model,dataset) # Perform reconstructions on either single or multi-GPU workflows. if TEST == 'fancy_ptycho': - for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=40): + for loss in recon.optimize(50, lr=0.02, batch_size=40): if rank == 0: print(model.report()) t_list.append(time.time() - t_start) - for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=40): + for loss in recon.optimize(25, lr=0.005, batch_size=40): if rank == 0: print(model.report()) t_list.append(time.time() - t_start) - for loss in model.Adam_optimize(25, dataset, lr=0.001, batch_size=40): + for loss in recon.optimize(25, lr=0.001, batch_size=40): if rank == 0: print(model.report()) t_list.append(time.time() - t_start) elif TEST == 'gold_balls': - for loss in model.Adam_optimize(20, dataset, lr=0.005, batch_size=50): + for loss in recon.optimize(20, lr=0.005, batch_size=50): if rank == 0: print(model.report()) t_list.append(time.time() - t_start) - for loss in model.Adam_optimize(50, dataset, lr=0.002, batch_size=100): + for loss in recon.optimize(50, lr=0.002, batch_size=100): if rank == 0: print(model.report()) t_list.append(time.time() - t_start) - for loss in model.Adam_optimize(100, dataset, lr=0.001, batch_size=100): + for loss in recon.optimize(100, lr=0.001, batch_size=100): if rank == 0: print(model.report()) t_list.append(time.time() - t_start) @@ -255,10 +258,10 @@ def run_test(world_sizes: int, # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': # Define the number of GPUs to use. 
- world_sizes = [1, 2, 4, 6] + world_sizes = [1, 2, 4] # Define which GPU IDs to use - device_ids = [7, 6, 5, 4, 3, 2, 1] + device_ids = [1, 2, 5, 7] # How many reconstruction runs to perform for statistics runs = 3 From 854c907cf1d55d44f0b7d94086ac8c0fe77ae16e Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 12 May 2025 22:14:16 +0000 Subject: [PATCH 049/115] Cleaned up the descriptions and removed unneccessary attributes for the Reconstructor class and subclass. --- src/cdtools/optimizer/adam.py | 116 ++++++++++++++++++++++------------ src/cdtools/optimizer/base.py | 97 ++++++++++++++-------------- 2 files changed, 123 insertions(+), 90 deletions(-) diff --git a/src/cdtools/optimizer/adam.py b/src/cdtools/optimizer/adam.py index 89d620b1..810ba7c0 100644 --- a/src/cdtools/optimizer/adam.py +++ b/src/cdtools/optimizer/adam.py @@ -1,17 +1,14 @@ -"""Adam optimizer class +"""This module contains the Adam Reconstructor subclass for performing +optimization ('reconstructions') on ptychographic/CDI models using +the Adam optimizer. +The Reconstructor class is designed to resemble so-called +'Trainer' classes that (in the language of the AI/ML folks) handles +the 'training' of a model given some dataset and optimizer. """ import torch as t from torch.utils import data as torchdata from torch.utils.data.distributed import DistributedSampler -import torch.distributed as dist -from matplotlib import pyplot as plt -from matplotlib.widgets import Slider -from matplotlib import ticker -import numpy as np -import threading -import queue -import time from scipy import io from contextlib import contextmanager from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch @@ -23,44 +20,76 @@ __all__ = ['Adam'] class Adam(Reconstructor): + """ + The Adam Reconstructor subclass handles the optimization ('reconstruction') of + ptychographic models and datasets using the Adam optimizer. 
+ + Parameters + ---------- + model: CDIModel + Model for CDI/ptychography reconstruction + dataset: Ptycho2DDataset + The dataset to reconstruct against + subset : list(int) or int + Optional, a pattern index or list of pattern indices to use + + Important attributes: + - **model** -- Always points to the core model used. + - **multi_gpu_used** -- Whether or not multi-GPU computation will be performed + using a distributed data approach. This attribute will be pulled from the + CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). + - **optimizer** -- This class by default uses `torch.optim.Adam` to perform + optimizations. + - **scheduler** -- A `torch.optim.lr_scheduler` that must be defined when creating the + `Reconstructor` subclass through the `setup_scheduler` method. + - **data_loader** -- A torch.utils.data.DataLoader that must be defined when creating + the Reconstructor subclass through the `setup_dataloader` method. + """ def __init__(self, model: CDIModel, dataset: Ptycho2DDataset, - schedule: bool = False, - subset: List[int] = None, - thread: bool = True): - - super().__init__(model, - dataset, - subset, - thread) + subset: List[int] = None): + + super().__init__(model, dataset, subset) - # Define the optimizer + # Define the optimizer for use in this subclass self.optimizer = t.optim.Adam(self.model.parameters()) def setup_dataloader(self, - batch_size): + batch_size: int = 15, + shuffle: bool = True): + """ + Sets up the dataloader. + + Parameters + ---------- + batch_size : int + Optional, the size of the minibatches to use + shuffle : bool + Optional, enable/disable shuffling of the dataset. This option + is intended for diagnostic purposes and should be left as True. 
+ """ # Make a dataloader suited for either single-GPU use or cases # where a process group (i.e., multiple GPUs) has been initialized if self.multi_gpu_used: # First, create a sampler to load subsets of dataset to the GPUs self.sampler = DistributedSampler(self.dataset, - num_replicas=self.world_size, - rank=self.rank, - shuffle=True, - drop_last=False) + num_replicas=self.world_size, + rank=self.rank, + shuffle=shuffle, + drop_last=False) # Now create the dataloader self.data_loader = torchdata.DataLoader(self.dataset, batch_size=batch_size//self.world_size, num_workers=0, # Creating extra threads in children processes may cause problems. Leave this at 0. drop_last=False, - pin_memory=False,# I'm not 100% sure what this does, but apparently making this True can cause bugs + pin_memory=False, sampler=self.sampler) else: self.data_loader = torchdata.DataLoader(self.dataset, batch_size=batch_size, - shuffle=True) + shuffle=shuffle) # Store the optimizer parameters #self.hyperparameters['lr'] = lr #self.hyperparameters['betas'] = betas @@ -70,7 +99,19 @@ def adjust_optimizer(self, lr=0.005, betas=(0.9, 0.999), amsgrad=False): + """ + Change hyperparameters for the utilized optimizer. + Parameters + ---------- + lr : float + Optional, The learning rate (alpha) to use. Default is 0.005. 0.05 is + typically the highest possible value with any chance of being stable + betas : tuple + Optional, the beta_1 and beta_2 to use. Default is (0.9, 0.999). 
+ amsgrad: bool + Optional, whether to use the AMSGrad variant of this algorithm + """ for param_group in self.optimizer.param_groups: param_group['lr'] = lr param_group['betas'] = betas @@ -88,46 +129,37 @@ def optimize(self, regularization_factor = None, thread=True, calculation_width=10): - """Runs a round of reconstruction using the Adam optimizer + """ + Runs a round of reconstruction using the Adam optimizer - Formerly CDIModel.Adam_optimize + Formerly `CDIModel.Adam_optimize` - This calls the base Reconstructor.optimize method - (formerly CDIModel.AD_optimize) to run a round of reconstruction - - NOTE: A decision should be made regarding whether self.optimize - should only be allowed to adjust reconstruction - hyperparameters rather than initialize them + This calls the Reconstructor.optimize superclass method + (formerly `CDIModel.AD_optimize`) to run a round of reconstruction + once the dataloader and optimizer hyperparameters have been + set up. """ + # Update the training history self.model.training_history += ( f'Planning {iterations} epochs of Adam, with a learning rate = ' f'{lr}, batch size = {batch_size}, regularization_factor = ' f'{regularization_factor}, and schedule = {schedule}.\n' ) - ############################################################# # The subset statement is contained in Reconstructor.__init__ - ############################################################# - ############################################################# # The dataloader step is handled by self.dataloader # TODO: Figure out a way to adjust the batch_size without # creating a brand-spanking-new one each time - ############################################################# self.setup_dataloader(batch_size) - ############################################################# # The optimizer is created in self.__init__, but the # hyperparameters need to be set up with self.adjust_optimizer - ############################################################# 
self.adjust_optimizer(lr, betas, amsgrad) - ############################################################# # Define the scheduler - # NOTE: We may want to define this in __init__ and simply - # if schedule: self.scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=0.2, diff --git a/src/cdtools/optimizer/base.py b/src/cdtools/optimizer/base.py index e8116e6e..0c8761d2 100644 --- a/src/cdtools/optimizer/base.py +++ b/src/cdtools/optimizer/base.py @@ -2,17 +2,15 @@ optimization ('reconstructions') on ptychographic/CDI models. The Reconstructor class is designed to resemble so-called -'Trainer' classes that (in the language of the AI/ML folks) that -handles the 'training' of a model given some dataset and optimizer. +'Trainer' classes that (in the language of the AI/ML folks) handles +the 'training' of a model given some dataset and optimizer. -The subclasses of the Reconstructor class are required to implement +The subclasses of Reconstructor are required to implement their own data loaders and optimizer adjusters """ import torch as t from torch.utils import data as torchdata -from torch.utils.data.distributed import DistributedSampler -from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist import threading import queue @@ -27,38 +25,52 @@ __all__ = ['Reconstructor'] class Reconstructor: + """ + Reconstructor handles the optimization ('reconstruction') of ptychographic + models given a CDIModel (or subclass) and corresponding Ptycho2DDataset. + + This is a base model that defines all functions Reconstructor subclasses + must implement. + + Parameters + ---------- + model: CDIModel + Model for CDI/ptychography reconstruction + dataset: Ptycho2DDataset + The dataset to reconstruct against + subset : list(int) or int + Optional, a pattern index or list of pattern indices to use + + Important attributes: + - **model** -- Always points to the core model used. 
+ - **multi_gpu_used** -- Whether or not multi-GPU computation will be performed + using a distributed data approach. This attribute will be pulled from the + CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). + - **optimizer** -- A `torch.optim.Optimizer` that must be defined when initializing the + Reconstructor subclass. + - **scheduler** -- A `torch.optim.lr_scheduler` that must be defined when creating the + `Reconstructor` subclass through the `setup_scheduler` method. + - **data_loader** -- A torch.utils.data.DataLoader that must be defined when creating + the Reconstructor subclass through the `setup_dataloader` method. + """ def __init__(self, model: CDIModel, dataset: Ptycho2DDataset, - subset: List[int] = None, - thread: bool = True): - - # Store parameters as attributes + subset: List[int] = None): + # Store parameters as attributes of Reconstructor + self.subset = subset self.multi_gpu_used = model.multi_gpu_used self.world_size = model.world_size self.rank = model.rank - self.subset = subset - self.thread = thread - # Initialize some attributes that must be defined by other methods - self.optimizer = None - self.scheduler = None - self.data_loader = None - #self.epoch = model.epoch + # Initialize attributes that must be defined by the subclasses + self.optimizer = None # Defined in the __init__ of the subclass as a torch.optim.Optimizer + self.scheduler = None # Defined in the setup_scheduler method + self.data_loader = None # Defined in the setup_dataloader method - # Store either the original or DDP-wrapped model, along with - # references to model attributes/methods - """ For now, don't do DDP-wrapping; check if porting the bits - and pieces from CDIModel works before doing DDP. 
- if self.multi_gpu_used: - self.model = DDP(model, - device_ids=[0]) # This is used if CUDA_VISIBLE_DEVICES is manually set - store_references(self.model.module) - - else: - """ + # Store the original model + # TODO: Include DDP support + wrapping self.model = model - #store_model_attributes(self.model) # Store the dataset if subset is not None: @@ -71,24 +83,19 @@ def __init__(self, def setup_dataloader(self, **kwargs): - """Sets up the dataloader + """ + Sets up the dataloader. The dataloader needs to be defined manually for each subclass. - While each subclass will likely use similar calls to """ raise NotImplementedError() def adjust_optimizer(self, **kwargs): - """This is to allow us to set up parameters for whatever optimizer we're - interested in using. - - The different optimization schemes (Adam, LBFGS, SGD) seem to take in - different hyperparameters. This function is intended to modify the - parameters + """ + Change hyperparameters for the utilized optimizer. - This is not defined here. For each optimizer, the keyword - arguments should be manually defined as parameters + For each optimizer, the keyword arguments should be manually defined as parameters. 
""" raise NotImplementedError() @@ -98,7 +105,8 @@ def optimize(self, regularization_factor=None, thread=True, calculation_width=10): - """Runs a round of reconstruction using the provided optimizer + """ + Runs a round of reconstruction using the provided optimizer Formerly CDIModel.AD_optimize @@ -117,12 +125,6 @@ def optimize(self, ---------- iterations : int How many epochs of the algorithm to run - data_loader : torch.utils.data.DataLoader - A data loader loading the CDataset to reconstruct - optimizer : torch.optim.Optimizer - The optimizer to run the reconstruction with - scheduler : torch.optim.lr_scheduler._LRScheduler - Optional, a learning rate scheduler to use regularization_factor : float or list(float) Optional, if the model has a regularizer defined, the set of parameters to pass the regularizer method thread : bool @@ -190,7 +192,7 @@ def closure(): # For multi-GPU optimization, we need to average and # sync the gradients + losses across all participating - # GPUs with an all-reduce call. + # GPUs with an all-reduce call. 
             if self.multi_gpu_used:
                 for param in self.model.parameters():
                     if param.requires_grad:
@@ -199,8 +201,7 @@ def closure():
 
             # Sum the loss value across all devices for reporting
             dist.all_reduce(loss, op=dist.ReduceOp.SUM)
-
-            # Normalize the accumulating total loss by the number of GPUs used
+            # Normalize the accumulating total loss by the number of GPUs used
             total_loss += loss.detach() // self.model.world_size
 

From 64f986f440ef35dd0954878f779af48df95d12ae Mon Sep 17 00:00:00 2001
From: yoshikisd
Date: Mon, 12 May 2025 22:22:18 +0000
Subject: [PATCH 050/115] Scheduler is now defined in __init__

---
 src/cdtools/optimizer/adam.py | 25 +++++++++++++------------
 src/cdtools/optimizer/base.py |  6 +++---
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/cdtools/optimizer/adam.py b/src/cdtools/optimizer/adam.py
index 810ba7c0..089ecc3b 100644
--- a/src/cdtools/optimizer/adam.py
+++ b/src/cdtools/optimizer/adam.py
@@ -32,6 +32,8 @@ class Adam(Reconstructor):
         The dataset to reconstruct against
     subset : list(int) or int
         Optional, a pattern index or list of pattern indices to use
+    schedule : bool
+        Optional, create a learning rate scheduler (torch.optim.lr_scheduler._LRScheduler)
 
     Important attributes:
     - **model** -- Always points to the core model used.
@@ -48,12 +50,21 @@ class Adam(Reconstructor): def __init__(self, model: CDIModel, dataset: Ptycho2DDataset, - subset: List[int] = None): + subset: List[int] = None, + schedule: bool = False): super().__init__(model, dataset, subset) # Define the optimizer for use in this subclass self.optimizer = t.optim.Adam(self.model.parameters()) + + # Define the scheduler + if schedule: + self.scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, + factor=0.2, + threshold=1e-9) + else: + self.scheduler = None def setup_dataloader(self, @@ -155,17 +166,7 @@ def optimize(self, # The optimizer is created in self.__init__, but the # hyperparameters need to be set up with self.adjust_optimizer - self.adjust_optimizer(lr, - betas, - amsgrad) - - # Define the scheduler - if schedule: - self.scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, - factor=0.2, - threshold=1e-9) - else: - self.scheduler = None + self.adjust_optimizer(lr, betas, amsgrad) # This is analagous to making a call to CDIModel.AD_optimize return super(Adam, self).optimize(iterations, diff --git a/src/cdtools/optimizer/base.py b/src/cdtools/optimizer/base.py index 0c8761d2..e02d18fd 100644 --- a/src/cdtools/optimizer/base.py +++ b/src/cdtools/optimizer/base.py @@ -48,8 +48,8 @@ class Reconstructor: CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). - **optimizer** -- A `torch.optim.Optimizer` that must be defined when initializing the Reconstructor subclass. - - **scheduler** -- A `torch.optim.lr_scheduler` that must be defined when creating the - `Reconstructor` subclass through the `setup_scheduler` method. + - **scheduler** -- A `torch.optim.lr_scheduler` that may be defined when initializing the + `Reconstructor` subclass. - **data_loader** -- A torch.utils.data.DataLoader that must be defined when creating the Reconstructor subclass through the `setup_dataloader` method. 
""" @@ -65,7 +65,7 @@ def __init__(self, # Initialize attributes that must be defined by the subclasses self.optimizer = None # Defined in the __init__ of the subclass as a torch.optim.Optimizer - self.scheduler = None # Defined in the setup_scheduler method + self.scheduler = None # Defined in the __init__ of the subclass as a torch.optim.lr_scheduler self.data_loader = None # Defined in the setup_dataloader method # Store the original model From 1b7d094abc7e572e9072ba1e6765e7408def1e66 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 12 May 2025 23:16:30 +0000 Subject: [PATCH 051/115] Moved scheduler back to optimize. Moved setup_dataloader to the Reconstructor superclass --- src/cdtools/optimizer/adam.py | 144 +++++++++++++++------------------- src/cdtools/optimizer/base.py | 53 ++++++++++--- 2 files changed, 106 insertions(+), 91 deletions(-) diff --git a/src/cdtools/optimizer/adam.py b/src/cdtools/optimizer/adam.py index 089ecc3b..6ee54dc3 100644 --- a/src/cdtools/optimizer/adam.py +++ b/src/cdtools/optimizer/adam.py @@ -14,7 +14,7 @@ from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.models import CDIModel -from typing import Tuple, List +from typing import Tuple, List, Union from cdtools.optimizer import Reconstructor __all__ = ['Adam'] @@ -42,74 +42,24 @@ class Adam(Reconstructor): CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). - **optimizer** -- This class by default uses `torch.optim.Adam` to perform optimizations. - - **scheduler** -- A `torch.optim.lr_scheduler` that must be defined when creating the - `Reconstructor` subclass through the `setup_scheduler` method. - - **data_loader** -- A torch.utils.data.DataLoader that must be defined when creating - the Reconstructor subclass through the `setup_dataloader` method. 
+ - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the `optimize` method. + - **data_loader** -- A torch.utils.data.DataLoader that is defined by calling the + `setup_dataloader` method. """ def __init__(self, model: CDIModel, dataset: Ptycho2DDataset, - subset: List[int] = None, - schedule: bool = False): + subset: List[int] = None): super().__init__(model, dataset, subset) # Define the optimizer for use in this subclass self.optimizer = t.optim.Adam(self.model.parameters()) - - # Define the scheduler - if schedule: - self.scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, - factor=0.2, - threshold=1e-9) - else: - self.scheduler = None - - - def setup_dataloader(self, - batch_size: int = 15, - shuffle: bool = True): - """ - Sets up the dataloader. - - Parameters - ---------- - batch_size : int - Optional, the size of the minibatches to use - shuffle : bool - Optional, enable/disable shuffling of the dataset. This option - is intended for diagnostic purposes and should be left as True. - """ - # Make a dataloader suited for either single-GPU use or cases - # where a process group (i.e., multiple GPUs) has been initialized - if self.multi_gpu_used: - # First, create a sampler to load subsets of dataset to the GPUs - self.sampler = DistributedSampler(self.dataset, - num_replicas=self.world_size, - rank=self.rank, - shuffle=shuffle, - drop_last=False) - # Now create the dataloader - self.data_loader = torchdata.DataLoader(self.dataset, - batch_size=batch_size//self.world_size, - num_workers=0, # Creating extra threads in children processes may cause problems. Leave this at 0. 
- drop_last=False, - pin_memory=False, - sampler=self.sampler) - else: - self.data_loader = torchdata.DataLoader(self.dataset, - batch_size=batch_size, - shuffle=shuffle) - # Store the optimizer parameters - #self.hyperparameters['lr'] = lr - #self.hyperparameters['betas'] = betas - #self.hyperparameters['amsgrad'] = amsgrad def adjust_optimizer(self, - lr=0.005, - betas=(0.9, 0.999), - amsgrad=False): + lr: int = 0.005, + betas: Tuple[float] = (0.9, 0.999), + amsgrad: bool = False): """ Change hyperparameters for the utilized optimizer. @@ -120,7 +70,7 @@ def adjust_optimizer(self, typically the highest possible value with any chance of being stable betas : tuple Optional, the beta_1 and beta_2 to use. Default is (0.9, 0.999). - amsgrad: bool + amsgrad : bool Optional, whether to use the AMSGrad variant of this algorithm """ for param_group in self.optimizer.param_groups: @@ -130,16 +80,16 @@ def adjust_optimizer(self, def optimize(self, - iterations, - batch_size=15, - lr=0.005, - betas=(0.9, 0.999), - schedule=False, - amsgrad=False, - subset=None, - regularization_factor = None, - thread=True, - calculation_width=10): + iterations: int, + batch_size: int = 15, + lr: float = 0.005, + betas: Tuple[float] = (0.9, 0.999), + schedule: bool = False, + amsgrad: bool = False, + regularization_factor: Union[float, List[float]] = None, + thread: bool = True, + calculation_width: int = 10, + shuffle: bool = True): """ Runs a round of reconstruction using the Adam optimizer @@ -149,6 +99,34 @@ def optimize(self, (formerly `CDIModel.AD_optimize`) to run a round of reconstruction once the dataloader and optimizer hyperparameters have been set up. + + Parameters + ---------- + iterations : int + How many epochs of the algorithm to run + batch_size : int + Optional, the size of the minibatches to use + lr : float + Optional, The learning rate (alpha) to use. Default is 0.005. 
0.05 is
+        typically the highest possible value with any chance of being stable
+    betas : tuple
+        Optional, the beta_1 and beta_2 to use. Default is (0.9, 0.999).
+    schedule : bool
+        Optional, create a learning rate scheduler (torch.optim.lr_scheduler._LRScheduler)
+    amsgrad : bool
+        Optional, whether to use the AMSGrad variant of this algorithm
+    regularization_factor : float or list(float)
+        Optional, if the model has a regularizer defined, the set of parameters to pass
+        the regularizer method
+    thread : bool
+        Default True, whether to run the computation in a separate thread to allow
+        interaction with plots during computation
+    calculation_width : int
+        Default 10, how many translations to pass through at once for each round of
+        gradient accumulation. Does not affect the result, only the calculation speed
+    shuffle : bool
+        Optional, enable/disable shuffling of the dataset. This option
+        is intended for diagnostic purposes and should be left as True.
         """
         # Update the training history
         self.model.training_history += (
@@ -157,19 +135,25 @@ def optimize(self,
             f'{regularization_factor}, and schedule = {schedule}.\n'
         )
 
-        # The subset statement is contained in Reconstructor.__init__
+        # 1) The subset statement is contained in Reconstructor.__init__
 
-        # The dataloader step is handled by self.dataloader
-        # TODO: Figure out a way to adjust the batch_size without
-        #       creating a brand-spanking-new one each time
-        self.setup_dataloader(batch_size)
+        # 2) Set up / re-initialize the data loader
+        self.setup_dataloader(batch_size=batch_size, shuffle=shuffle)
 
-        # The optimizer is created in self.__init__, but the
-        # hyperparameters need to be set up with self.adjust_optimizer
+        # 3) The optimizer is created in self.__init__, but the
+        #    hyperparameters need to be set up with self.adjust_optimizer
         self.adjust_optimizer(lr, betas, amsgrad)
 
+        # 4) Set up the scheduler
+        if schedule:
+            self.scheduler = 
t.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, + factor=0.2, + threshold=1e-9) + else: + self.scheduler = None + + # 5) This is analagous to making a call to CDIModel.AD_optimize return super(Adam, self).optimize(iterations, - regularization_factor, - thread, - calculation_width) \ No newline at end of file + regularization_factor, + thread, + calculation_width) \ No newline at end of file diff --git a/src/cdtools/optimizer/base.py b/src/cdtools/optimizer/base.py index e02d18fd..8fceb0b7 100644 --- a/src/cdtools/optimizer/base.py +++ b/src/cdtools/optimizer/base.py @@ -11,6 +11,7 @@ import torch as t from torch.utils import data as torchdata +from torch.utils.data.distributed import DistributedSampler import torch.distributed as dist import threading import queue @@ -48,10 +49,9 @@ class Reconstructor: CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). - **optimizer** -- A `torch.optim.Optimizer` that must be defined when initializing the Reconstructor subclass. - - **scheduler** -- A `torch.optim.lr_scheduler` that may be defined when initializing the - `Reconstructor` subclass. - - **data_loader** -- A torch.utils.data.DataLoader that must be defined when creating - the Reconstructor subclass through the `setup_dataloader` method. + - **scheduler** -- A `torch.optim.lr_scheduler` that may be defined during the `optimize` method. + - **data_loader** -- A torch.utils.data.DataLoader that is defined by calling the + `setup_dataloader` method. 
""" def __init__(self, model: CDIModel, @@ -65,8 +65,8 @@ def __init__(self, # Initialize attributes that must be defined by the subclasses self.optimizer = None # Defined in the __init__ of the subclass as a torch.optim.Optimizer - self.scheduler = None # Defined in the __init__ of the subclass as a torch.optim.lr_scheduler - self.data_loader = None # Defined in the setup_dataloader method + self.scheduler = None # Defined as a torch.optim.lr_scheduler + self.data_loader = None # Defined as a torch.utils.data.DataLoader in the setup_dataloader method # Store the original model # TODO: Include DDP support + wrapping @@ -82,14 +82,45 @@ def __init__(self, self.dataset = dataset - def setup_dataloader(self, **kwargs): + def setup_dataloader(self, + batch_size: int = None, + shuffle: bool = True): """ - Sets up the dataloader. + Sets up / re-initializes the dataloader. - The dataloader needs to be defined manually for each subclass. + Parameters + ---------- + batch_size : int + Optional, the size of the minibatches to use + shuffle : bool + Optional, enable/disable shuffling of the dataset. This option + is intended for diagnostic purposes and should be left as True. """ - raise NotImplementedError() - + if self.multi_gpu_used: + # NOTE: Multi-GPU implementation is intended for use with Adam. All other subclasses + # and optimizers have not been tested yet. + + # First, create a sampler to load subsets of dataset to the GPUs + self.sampler = DistributedSampler(self.dataset, + num_replicas=self.world_size, + rank=self.rank, + shuffle=shuffle, + drop_last=False) + # Now create the dataloader + self.data_loader = torchdata.DataLoader(self.dataset, + batch_size=batch_size//self.world_size, + num_workers=0, # Creating extra threads in children processes may cause problems. Leave this at 0. 
+                                                    drop_last=False,
+                                                    pin_memory=False,
+                                                    sampler=self.sampler)
+        else:
+            if batch_size is not None:
+                self.data_loader = torchdata.DataLoader(self.dataset,
+                                                        batch_size=batch_size,
+                                                        shuffle=shuffle)
+            else:
+                self.data_loader = torchdata.DataLoader(self.dataset)
+        
     def adjust_optimizer(self, **kwargs):
         """

From 140c65dae94d1188193cef0ab725b57f0f46b623 Mon Sep 17 00:00:00 2001
From: yoshikisd
Date: Mon, 12 May 2025 23:28:21 +0000
Subject: [PATCH 052/115] Added type annotations to Reconstructor.optimize

---
 src/cdtools/optimizer/base.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/cdtools/optimizer/base.py b/src/cdtools/optimizer/base.py
index 8fceb0b7..da05636d 100644
--- a/src/cdtools/optimizer/base.py
+++ b/src/cdtools/optimizer/base.py
@@ -21,7 +21,7 @@
 from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset
 from cdtools.models import CDIModel
 import torch.distributed as dist
-from typing import Tuple, List
+from typing import Tuple, List, Union
 
 __all__ = ['Reconstructor']
 
@@ -132,10 +132,10 @@ def adjust_optimizer(self, **kwargs):
 
 
     def optimize(self,
-                 iterations,
-                 regularization_factor=None,
-                 thread=True,
-                 calculation_width=10):
+                 iterations: int,
+                 regularization_factor: Union[float, List[float]] = None,
+                 thread: bool = True,
+                 calculation_width: int = 10):
         """
         Runs a round of reconstruction using the provided optimizer
 
         Formerly CDIModel.AD_optimize

From 81f9f5f4e24686e040fe627eb4322bb0095eb20b Mon Sep 17 00:00:00 2001
From: yoshikisd
Date: Tue, 10 Jun 2025 03:13:51 +0000
Subject: [PATCH 053/115] Renamed the Reconstructor class to the Optimizer
 class

---
 src/cdtools/optimizer/__init__.py | 11 +++--------
 src/cdtools/optimizer/adam.py     | 14 +++++++-------
 src/cdtools/optimizer/base.py     | 18 +++++++++---------
 3 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/src/cdtools/optimizer/__init__.py b/src/cdtools/optimizer/__init__.py
index 4777830a..5def263a 100644
--- a/src/cdtools/optimizer/__init__.py
+++ 
b/src/cdtools/optimizer/__init__.py @@ -4,15 +4,10 @@ # We define __all__ to be sure that import * only imports what we want __all__ = [ - 'CDIModel', - 'SimplePtycho', - 'FancyPtycho', - 'Bragg2DPtycho', - 'Multislice2DPtycho', - 'MultislicePtycho', - 'RPI', + 'Optimizer', + 'Adam' ] -from cdtools.optimizer.base import Reconstructor +from cdtools.optimizer.base import Optimizer from cdtools.optimizer.adam import Adam diff --git a/src/cdtools/optimizer/adam.py b/src/cdtools/optimizer/adam.py index 6ee54dc3..b4c73e7a 100644 --- a/src/cdtools/optimizer/adam.py +++ b/src/cdtools/optimizer/adam.py @@ -1,8 +1,8 @@ -"""This module contains the Adam Reconstructor subclass for performing +"""This module contains the Adam Optimizer subclass for performing optimization ('reconstructions') on ptychographic/CDI models using the Adam optimizer. -The Reconstructor class is designed to resemble so-called +The Optimizer class is designed to resemble so-called 'Trainer' classes that (in the language of the AI/ML folks) handles the 'training' of a model given some dataset and optimizer. """ @@ -15,13 +15,13 @@ from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.models import CDIModel from typing import Tuple, List, Union -from cdtools.optimizer import Reconstructor +from cdtools.optimizer import Optimizer __all__ = ['Adam'] -class Adam(Reconstructor): +class Adam(Optimizer): """ - The Adam Reconstructor subclass handles the optimization ('reconstruction') of + The Adam Optimizer subclass handles the optimization ('reconstruction') of ptychographic models and datasets using the Adam optimizer. Parameters @@ -95,7 +95,7 @@ def optimize(self, Formerly `CDIModel.Adam_optimize` - This calls the Reconstructor.optimize superclass method + This calls the Optimizer.optimize superclass method (formerly `CDIModel.AD_optimize`) to run a round of reconstruction once the dataloader and optimizer hyperparameters have been set up. 
@@ -135,7 +135,7 @@ def optimize(self, f'{regularization_factor}, and schedule = {schedule}.\n' ) - # 1) The subset statement is contained in Reconstructor.__init__ + # 1) The subset statement is contained in Optimizer.__init__ # 2) Set up / re-initialize the data laoder self.setup_dataloader(batch_size=batch_size, shuffle=shuffle) diff --git a/src/cdtools/optimizer/base.py b/src/cdtools/optimizer/base.py index da05636d..8101455b 100644 --- a/src/cdtools/optimizer/base.py +++ b/src/cdtools/optimizer/base.py @@ -1,11 +1,11 @@ -"""This module contains the base Reconstructor class for performing +"""This module contains the base Optimizer class for performing optimization ('reconstructions') on ptychographic/CDI models. -The Reconstructor class is designed to resemble so-called +The Optimizer class is designed to resemble so-called 'Trainer' classes that (in the language of the AI/ML folks) handles the 'training' of a model given some dataset and optimizer. -The subclasses of Reconstructor are required to implement +The subclasses of Optimizer are required to implement their own data loaders and optimizer adjusters """ @@ -23,14 +23,14 @@ import torch.distributed as dist from typing import Tuple, List, Union -__all__ = ['Reconstructor'] +__all__ = ['Optimizer'] -class Reconstructor: +class Optimizer: """ - Reconstructor handles the optimization ('reconstruction') of ptychographic + Optimizer handles the optimization ('reconstruction') of ptychographic models given a CDIModel (or subclass) and corresponding Ptycho2DDataset. - This is a base model that defines all functions Reconstructor subclasses + This is a base model that defines all functions Optimizer subclasses must implement. Parameters @@ -48,7 +48,7 @@ class Reconstructor: using a distributed data approach. This attribute will be pulled from the CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). 
- **optimizer** -- A `torch.optim.Optimizer` that must be defined when initializing the - Reconstructor subclass. + Optimizer subclass. - **scheduler** -- A `torch.optim.lr_scheduler` that may be defined during the `optimize` method. - **data_loader** -- A torch.utils.data.DataLoader that is defined by calling the `setup_dataloader` method. @@ -57,7 +57,7 @@ def __init__(self, model: CDIModel, dataset: Ptycho2DDataset, subset: List[int] = None): - # Store parameters as attributes of Reconstructor + # Store parameters as attributes of Optimizer self.subset = subset self.multi_gpu_used = model.multi_gpu_used self.world_size = model.world_size From 60737e61a1c9ff198f6cc044ba71bbd77b6e4174 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 10 Jun 2025 04:08:45 +0000 Subject: [PATCH 054/115] Refactor Optimizer to separate run_epoch from optimize. Also created sync_and_avg_gradients in distributed --- src/cdtools/optimizer/base.py | 224 ++++++++++--------- src/cdtools/tools/distributed/distributed.py | 22 +- 2 files changed, 134 insertions(+), 112 deletions(-) diff --git a/src/cdtools/optimizer/base.py b/src/cdtools/optimizer/base.py index 8101455b..6aebd3b8 100644 --- a/src/cdtools/optimizer/base.py +++ b/src/cdtools/optimizer/base.py @@ -20,7 +20,7 @@ from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.models import CDIModel -import torch.distributed as dist +import cdtools.tools.distributed as cdtdist from typing import Tuple, List, Union __all__ = ['Optimizer'] @@ -69,7 +69,6 @@ def __init__(self, self.data_loader = None # Defined as a torch.utils.data.DataLoader in the setup_dataloader method # Store the original model - # TODO: Include DDP support + wrapping self.model = model # Store the dataset @@ -78,7 +77,6 @@ def __init__(self, if type(subset) == type(1): subset = [subset] dataset = torchdata.Subset(dataset, subset) - 
self.dataset = dataset @@ -97,9 +95,6 @@ def setup_dataloader(self, is intended for diagnostic purposes and should be left as True. """ if self.multi_gpu_used: - # NOTE: Multi-GPU implementation is intended for use with Adam. All other subclasses - # and optimizers have not been tested yet. - # First, create a sampler to load subsets of dataset to the GPUs self.sampler = DistributedSampler(self.dataset, num_replicas=self.world_size, @@ -130,6 +125,118 @@ def adjust_optimizer(self, **kwargs): """ raise NotImplementedError() + def _run_epoch(self, + stop_event: threading.Event = None, + regularization_factor: Union[float, List[float]] = None, + calculation_width: int = 10): + """ + Runs one full epoch of the reconstruction. Intended to be called + by Optimizer.optimize. + + Parameters + ---------- + stop_event : threading.Event + Default None, causes the reconstruction to stop when an exception + occurs in Optimizer.optimize. + regularization_factor : float or list(float) + Optional, if the model has a regularizer defined, the set of + parameters to pass the regularizer method + calculation_width : int + Default 10, how many translations to pass through at once for each + round of gradient accumulation. This does not affect the result, but + may affect the calculation speed. + + Yields + ------ + loss : float + The summed loss over the latest epoch, divided by the total diffraction + pattern intensity + """ + # If we're using DistributedSampler (i.e., multi-GPU useage), we need to + # tell it which epoch we're on. 
Otherwise data shuffling will not work properly + if self.multi_gpu_used: + self.data_loader.sampler.set_epoch(self.model.epoch) + + # Initialize some tracking variables + normalization = 0 + loss = 0 + N = 0 + t0 = time.time() + + # The data loader is responsible for setting the minibatch + # size, so each set is a minibatch + for inputs, patterns in self.data_loader: + normalization += t.sum(patterns).cpu().numpy() + N += 1 + def closure(): + self.optimizer.zero_grad() + + # We further break up the minibatch into a set of chunks. + # This lets us use larger minibatches than can fit + # on the GPU at once, while still doing batch processing + # for efficiency + input_chunks = [[inp[i:i + calculation_width] + for inp in inputs] + for i in range(0, len(inputs[0]), + calculation_width)] + pattern_chunks = [patterns[i:i + calculation_width] + for i in range(0, len(inputs[0]), + calculation_width)] + + total_loss = 0 + for inp, pats in zip(input_chunks, pattern_chunks): + # This check allows for graceful exit when threading + if stop_event is not None and stop_event.is_set(): + exit() + + # Run the simulation + sim_patterns = self.model.forward(*inp) + + # Calculate the loss + if hasattr(self, 'mask'): + loss = self.model.loss(pats,sim_patterns, mask=self.model.mask) + else: + loss = self.model.loss(pats,sim_patterns) + + # And accumulate the gradients + loss.backward() + + # For multi-GPU, average and sync the gradients + losses across all + # participating GPUs with an all-reduce call. Also sum the losses. 
+ if self.multi_gpu_used: + cdtdist.sync_and_avg_gradients(self.model) + dist.all_reduce(loss, op=dist.ReduceOp.SUM) + + # Normalize the accumulating total loss by the numer of GPUs used + total_loss += loss.detach() // self.model.world_size + + # If we have a regularizer, we can calculate it separately, + # and the gradients will add to the minibatch gradient + if regularization_factor is not None and hasattr(self.model, 'regularizer'): + loss = self.model.regularizer(regularization_factor) + loss.backward() + + # For multi-GPU optimization, average and sync the gradients + + # losses across all participating GPUs with an all-reduce call. + if self.multi_gpu_used: + cdtdist.sync_and_avg_gradients(self.model) + + return total_loss + + # This takes the step for this minibatch + loss += self.optimizer.step(closure).detach().cpu().numpy() + + loss /= normalization + + # We step the scheduler after the full epoch + if self.scheduler is not None: + self.scheduler.step(loss) + + self.model.loss_history.append(loss) + self.model.epoch = len(self.model.loss_history) + self.model.latest_iteration_time = time.time() - t0 + self.model.training_history += self.model.report() + '\n' + return loss def optimize(self, iterations: int, @@ -169,106 +276,6 @@ def optimize(self, The summed loss over the latest epoch, divided by the total diffraction pattern intensity """ - def run_epoch(stop_event=None): - """Runs one full epoch of the reconstruction.""" - # If we're using DistributedSampler (likely the case if you're using - # multiple GPUs), we need to tell it which epoch we're on. 
Otherwise - # data shuffling will not work properly - if self.multi_gpu_used: - self.data_loader.sampler.set_epoch(self.model.epoch) - - # First, initialize some tracking variables - normalization = 0 - loss = 0 - N = 0 - t0 = time.time() - - # The data loader is responsible for setting the minibatch - # size, so each set is a minibatch - for inputs, patterns in self.data_loader: - normalization += t.sum(patterns).cpu().numpy() - N += 1 - def closure(): - self.optimizer.zero_grad() - - # We further break up the minibatch into a set of chunks. - # This lets us use larger minibatches than can fit - # on the GPU at once, while still doing batch processing - # for efficiency - input_chunks = [[inp[i:i + calculation_width] - for inp in inputs] - for i in range(0, len(inputs[0]), - calculation_width)] - pattern_chunks = [patterns[i:i + calculation_width] - for i in range(0, len(inputs[0]), - calculation_width)] - - total_loss = 0 - for inp, pats in zip(input_chunks, pattern_chunks): - # This check allows for graceful exit when threading - if stop_event is not None and stop_event.is_set(): - exit() - - # Run the simulation - sim_patterns = self.model.forward(*inp) - - # Calculate the loss - if hasattr(self, 'mask'): - loss = self.model.loss(pats,sim_patterns, mask=self.model.mask) - else: - loss = self.model.loss(pats,sim_patterns) - - # And accumulate the gradients - loss.backward() - - # For multi-GPU optimization, we need to average and - # sync the gradients + losses across all participating - # GPUs with an all-reduce call. 
- if self.multi_gpu_used: - for param in self.model.parameters(): - if param.requires_grad: - dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) - param.grad.data /= self.model.world_size - - # Sum the loss value across all devices for reporting - dist.all_reduce(loss, op=dist.ReduceOp.SUM) - # Normalize the accumulating total loss by the numer of GPUs used - total_loss += loss.detach() // self.model.world_size - - - # If we have a regularizer, we can calculate it separately, - # and the gradients will add to the minibatch gradient - if regularization_factor is not None \ - and hasattr(self.model, 'regularizer'): - loss = self.model.regularizer(regularization_factor) - loss.backward() - - # For multi-GPU optimization, we need to average and - # sync the gradients + losses across all participating - # GPUs with an all-reduce call. - if self.multi_gpu_used: - for param in self.model.parameters(): - if param.requires_grad: - dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) - param.grad.data /= self.model.world_size - - return total_loss - - # This takes the step for this minibatch - loss += self.optimizer.step(closure).detach().cpu().numpy() - - loss /= normalization - - # We step the scheduler after the full epoch - if self.scheduler is not None: - self.scheduler.step(loss) - - self.model.loss_history.append(loss) - self.model.epoch = len(self.model.loss_history) - self.model.latest_iteration_time = time.time() - t0 - self.model.training_history += self.model.report() + '\n' - return loss - # We store the current optimizer as a model parameter so that # it can be saved and loaded for checkpointing self.current_optimizer = self.optimizer @@ -284,8 +291,7 @@ def closure(): yield float('nan') continue - yield run_epoch() - + yield self._run_epoch() # But if we do want to thread, it's annoying: else: @@ -294,7 +300,7 @@ def closure(): stop_event = threading.Event() def target(): try: - result_queue.put(run_epoch(stop_event)) + 
result_queue.put(self._run_epoch(stop_event)) except Exception as e: # If something bad happens, put the exception into the # result queue diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 191ffab2..16d734e7 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -20,6 +20,7 @@ import torch as t from torch.distributed import init_process_group, destroy_process_group, barrier from torch.nn.parallel import DistributedDataParallel as DDP +import torch.distributed as dist import torch.multiprocessing as mp from multiprocessing.connection import Connection from cdtools.models import CDIModel @@ -28,8 +29,21 @@ import os from typing import Callable, List -__all__ = ['distributed_wrapper', 'spawn'] +__all__ = ['sync_and_avg_gradients', 'distributed_wrapper', 'spawn'] +def sync_and_avg_gradients(model): + """ + Synchronizes the average of the model parameter gradients across all + participating GPUs. + + Parameters: + model: CDIModel + Model for CDI/ptychography reconstruction + """ + for param in model.parameters(): + if param.requires_grad: + dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) + param.grad.data /= model.world_size def distributed_wrapper(rank: int, func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], @@ -39,7 +53,8 @@ def distributed_wrapper(rank: int, backend: str = 'nccl', timeout: int = 600, pipe: Connection = None): - """Wraps functions containing reconstruction loops (i.e., `for loss in + """ + Wraps functions containing reconstruction loops (i.e., `for loss in model.Adam_optimize`) to enable multi-GPU operations to be set up. 
The wrapped function needs to passed to `torch.multiprocessing.spawn` or `cdtools.tools.distributed.distributed.spawn` @@ -118,7 +133,8 @@ def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], timeout: int = 600, nccl_p2p_disable: bool = True, pipe: Connection = None): - """Spawns subprocesses on `world_size` GPUs that runs reconstruction + """ + Spawns subprocesses on `world_size` GPUs that runs reconstruction loops wrapped around a function `func`. This is a wrapper around `torch.multiprocessing.spawn` which includes From cfa86b168552b3a7a531ed50b4c95aa82b7ca3e8 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 10 Jun 2025 04:11:27 +0000 Subject: [PATCH 055/115] Refactor Optimizer to separate run_epoch from optimize. Also created sync_and_avg_gradients in distributed --- src/cdtools/optimizer/base.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cdtools/optimizer/base.py b/src/cdtools/optimizer/base.py index 6aebd3b8..90ba37bb 100644 --- a/src/cdtools/optimizer/base.py +++ b/src/cdtools/optimizer/base.py @@ -238,6 +238,7 @@ def closure(): self.model.training_history += self.model.report() + '\n' return loss + def optimize(self, iterations: int, regularization_factor: Union[float, List[float]] = None, @@ -291,7 +292,8 @@ def optimize(self, yield float('nan') continue - yield self._run_epoch() + yield self._run_epoch(regularization_factor=regularization_factor, + calculation_width=calculation_width) # But if we do want to thread, it's annoying: else: @@ -300,7 +302,9 @@ def optimize(self, stop_event = threading.Event() def target(): try: - result_queue.put(self._run_epoch(stop_event)) + result_queue.put(self._run_epoch(stop_event=stop_event, + regularization_factor=regularization_factor, + calculation_width=calculation_width)) except Exception as e: # If something bad happens, put the exception into the # result queue From 81e5cd978bfd16101a815862c981e26fd2ef4980 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 
10 Jun 2025 04:24:31 +0000 Subject: [PATCH 056/115] Removed DDP dependency from distributed.py. Tidied up the docs. --- src/cdtools/tools/distributed/distributed.py | 30 ++++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 16d734e7..9c1ad9e9 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -1,32 +1,32 @@ """Contains functions to make reconstruction scripts compatible with multi-GPU distributive approaches in PyTorch. -The functions in this module require parts of the user-written +Multi-GPU computing here is based on distributed data parallelism, where +each GPU is given identical copies of a model and performs optimization +using different parts of the dataset. After the parameter gradients +are calculated (`loss.backwards()`) on each GPU, the gradients need to be +synchronized and averaged across all participating GPUs. + +The functions in this module assist with both gradient synchronization and +setting up conditions necessary to perform distributive computing. Some +functions in this module require parts of the user-written reconstruction script to be first wrapped in a function (as shown in examples/fancy_ptycho_multi_gpu_ddp.py). The functions in this module are designed to wrap around/call these user-defined functions, enabling reconstructions to be performed across several GPUs. -As of 20250302, the methods here are based on -torch.nn.parallel.DistributedDataParallel, which implements distributed -data parallelism. In this scheme, replicas of the CDI/ptychography model -are given to each device. These devices will synchronize gradients across -each model replica. 
These methods however do not define how the Dataset is +NOTE: These methods however do not define how the Dataset is distributed across each device; this process can be handled by using DistributedSampler with the DataLoader. """ -import numpy as np -import torch as t -from torch.distributed import init_process_group, destroy_process_group, barrier -from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist import torch.multiprocessing as mp +import datetime +import os from multiprocessing.connection import Connection from cdtools.models import CDIModel from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset -import datetime -import os from typing import Callable, List __all__ = ['sync_and_avg_gradients', 'distributed_wrapper', 'spawn'] @@ -103,8 +103,8 @@ def distributed_wrapper(rank: int, model.multi_gpu_used = True # Initialize the process group - init_process_group(backend=backend, rank=rank, - world_size=model.world_size, timeout=timeout) + dist.init_process_group(backend=backend, rank=rank, + world_size=model.world_size, timeout=timeout) # Load the model to the appropriate GPU rank the process is using device='cuda' @@ -120,7 +120,7 @@ def distributed_wrapper(rank: int, func(model, dataset, rank, model.world_size, pipe) # Destroy process group - destroy_process_group() + dist.destroy_process_group() def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], From dfb9d14d9ef17ba22ab03cd8dac6bd4829e4b9f7 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 10 Jun 2025 04:59:12 +0000 Subject: [PATCH 057/115] CDIModel.Adam_optimize refactored to only use cdtools.optimizer.Adam --- src/cdtools/models/base.py | 96 ++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 57 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 7aaa8ea2..7994b9e2 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -43,6 +43,7 @@ from contextlib import 
contextmanager from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch + __all__ = ['CDIModel'] @@ -71,6 +72,7 @@ def __init__(self): self.world_size = 1 # Total number of GPUs being used. self.multi_gpu_used = False # Self explanatory + def from_dataset(self, dataset): raise NotImplementedError() @@ -550,7 +552,8 @@ def Adam_optimize( thread=True, calculation_width=10 ): - """Runs a round of reconstruction using the Adam optimizer + """Runs a round of reconstruction using the Adam optimizer from + cdtools.optimizer.Adam. This is generally accepted to be the most robust algorithm for use with ptychography. Like all the other optimization routines, @@ -566,7 +569,9 @@ def Adam_optimize( batch_size : int Optional, the size of the minibatches to use lr : float - Optional, The learning rate (alpha) to use. Defaultis 0.005. 0.05 is typically the highest possible value with any chance of being stable + Optional, The learning rate (alpha) to use. Defaultis 0.005. + 0.05 is typically the highest possible value with any chance + of being stable betas : tuple Optional, the beta_1 and beta_2 to use. Default is (0.9, 0.999). schedule : float @@ -574,66 +579,43 @@ def Adam_optimize( subset : list(int) or int Optional, a pattern index or list of pattern indices to use regularization_factor : float or list(float) - Optional, if the model has a regularizer defined, the set of parameters to pass the regularizer method + Optional, if the model has a regularizer defined, the set of + parameters to pass the regularizer method thread : bool - Default True, whether to run the computation in a separate thread to allow interaction with plots during computation + Default True, whether to run the computation in a separate thread + to allow interaction with plots during computation calculation_width : int - Default 10, how many translations to pass through at once for each round of gradient accumulation. 
Does not affect the result, only the calculation speed + Default 10, how many translations to pass through at once for + each round of gradient accumulation. Does not affect the result, + only the calculation speed """ - - self.training_history += ( - f'Planning {iterations} epochs of Adam, with a learning rate = ' - f'{lr}, batch size = {batch_size}, regularization_factor = ' - f'{regularization_factor}, and schedule = {schedule}.\n' - ) + # We want to have model.Adam_optimize call AND store cdtoptim.Adam.optimize + # to be able to perform reconstructions without creating a new + # optimizer each time we update the hyperparameters. + # + # The only way to do this is to make cdtoptim.Adam an attribute + # of the model. But since cdtoptim.Adam also depends on CDIModel, + # this seems to give rise to a circular import error unless + # we import cdtools.optimizer within this method: + import cdtools.optimizer as cdtoptim + + # Next, we want to create an Optimizer.Adam if one does not already exist. + if not hasattr(self, 'optimizer'): + self.optimizer = cdtoptim.Adam(model=self, + dataset=dataset, + subset=subset) - - if subset is not None: - # if subset is just one pattern, turn into a list for convenience - if type(subset) == type(1): - subset = [subset] - dataset = torchdata.Subset(dataset, subset) - - # Make a dataloader suited for either single-GPU use or cases - # where a process group (i.e., multiple GPUs) has been initialized - if self.multi_gpu_used: - # First, create a sampler to load subsets of dataset to the GPUs - sampler = DistributedSampler(dataset, - num_replicas=self.world_size, - rank=self.rank, - shuffle=True, - drop_last=False) - # Now create the dataloader - data_loader = torchdata.DataLoader(dataset, - batch_size=batch_size//self.world_size, - num_workers=0, # Creating extra threads in children processes may cause problems. Leave this at 0. 
- drop_last=False, - pin_memory=False,# I'm not 100% sure what this does, but apparently making this True can cause bugs - sampler=sampler) - else: - data_loader = torchdata.DataLoader(dataset, - batch_size=batch_size, - shuffle=True) - - # Define the optimizer - optimizer = t.optim.Adam( - self.parameters(), - lr = lr, - betas=betas, - amsgrad=amsgrad) - - # Define the scheduler - if schedule: - scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2,threshold=1e-9) - else: - scheduler = None - - return self.AD_optimize(iterations, data_loader, optimizer, - scheduler=scheduler, - regularization_factor=regularization_factor, - thread=thread, - calculation_width=calculation_width) + # Run some reconstructions + return self.optimizer.optimize(iterations=iterations, + batch_size=batch_size, + lr=lr, + betas=betas, + schedule=schedule, + amsgrad=amsgrad, + regularization_factor=regularization_factor, + thread=thread, + calculation_width=calculation_width) def LBFGS_optimize(self, iterations, dataset, From ac3373ca07c854f0747f5a1869ae4bb063b48171 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 10 Jun 2025 15:52:33 +0000 Subject: [PATCH 058/115] Revert "Renamed the Reconstructor class to the Optimizer class" This reverts commit 81f9f5f4e24686e040fe627eb4322bb0095eb20b. 
--- src/cdtools/optimizer/__init__.py | 11 ++++++++--- src/cdtools/optimizer/adam.py | 14 +++++++------- src/cdtools/optimizer/base.py | 18 +++++++++--------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/src/cdtools/optimizer/__init__.py b/src/cdtools/optimizer/__init__.py index 5def263a..4777830a 100644 --- a/src/cdtools/optimizer/__init__.py +++ b/src/cdtools/optimizer/__init__.py @@ -4,10 +4,15 @@ # We define __all__ to be sure that import * only imports what we want __all__ = [ - 'Optimizer', - 'Adam' + 'CDIModel', + 'SimplePtycho', + 'FancyPtycho', + 'Bragg2DPtycho', + 'Multislice2DPtycho', + 'MultislicePtycho', + 'RPI', ] -from cdtools.optimizer.base import Optimizer +from cdtools.optimizer.base import Reconstructor from cdtools.optimizer.adam import Adam diff --git a/src/cdtools/optimizer/adam.py b/src/cdtools/optimizer/adam.py index b4c73e7a..6ee54dc3 100644 --- a/src/cdtools/optimizer/adam.py +++ b/src/cdtools/optimizer/adam.py @@ -1,8 +1,8 @@ -"""This module contains the Adam Optimizer subclass for performing +"""This module contains the Adam Reconstructor subclass for performing optimization ('reconstructions') on ptychographic/CDI models using the Adam optimizer. -The Optimizer class is designed to resemble so-called +The Reconstructor class is designed to resemble so-called 'Trainer' classes that (in the language of the AI/ML folks) handles the 'training' of a model given some dataset and optimizer. """ @@ -15,13 +15,13 @@ from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.models import CDIModel from typing import Tuple, List, Union -from cdtools.optimizer import Optimizer +from cdtools.optimizer import Reconstructor __all__ = ['Adam'] -class Adam(Optimizer): +class Adam(Reconstructor): """ - The Adam Optimizer subclass handles the optimization ('reconstruction') of + The Adam Reconstructor subclass handles the optimization ('reconstruction') of ptychographic models and datasets using the Adam optimizer. 
Parameters @@ -95,7 +95,7 @@ def optimize(self, Formerly `CDIModel.Adam_optimize` - This calls the Optimizer.optimize superclass method + This calls the Reconstructor.optimize superclass method (formerly `CDIModel.AD_optimize`) to run a round of reconstruction once the dataloader and optimizer hyperparameters have been set up. @@ -135,7 +135,7 @@ def optimize(self, f'{regularization_factor}, and schedule = {schedule}.\n' ) - # 1) The subset statement is contained in Optimizer.__init__ + # 1) The subset statement is contained in Reconstructor.__init__ # 2) Set up / re-initialize the data laoder self.setup_dataloader(batch_size=batch_size, shuffle=shuffle) diff --git a/src/cdtools/optimizer/base.py b/src/cdtools/optimizer/base.py index 90ba37bb..0ca0d519 100644 --- a/src/cdtools/optimizer/base.py +++ b/src/cdtools/optimizer/base.py @@ -1,11 +1,11 @@ -"""This module contains the base Optimizer class for performing +"""This module contains the base Reconstructor class for performing optimization ('reconstructions') on ptychographic/CDI models. -The Optimizer class is designed to resemble so-called +The Reconstructor class is designed to resemble so-called 'Trainer' classes that (in the language of the AI/ML folks) handles the 'training' of a model given some dataset and optimizer. -The subclasses of Optimizer are required to implement +The subclasses of Reconstructor are required to implement their own data loaders and optimizer adjusters """ @@ -23,14 +23,14 @@ import cdtools.tools.distributed as cdtdist from typing import Tuple, List, Union -__all__ = ['Optimizer'] +__all__ = ['Reconstructor'] -class Optimizer: +class Reconstructor: """ - Optimizer handles the optimization ('reconstruction') of ptychographic + Reconstructor handles the optimization ('reconstruction') of ptychographic models given a CDIModel (or subclass) and corresponding Ptycho2DDataset. 
- This is a base model that defines all functions Optimizer subclasses + This is a base model that defines all functions Reconstructor subclasses must implement. Parameters @@ -48,7 +48,7 @@ class Optimizer: using a distributed data approach. This attribute will be pulled from the CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). - **optimizer** -- A `torch.optim.Optimizer` that must be defined when initializing the - Optimizer subclass. + Reconstructor subclass. - **scheduler** -- A `torch.optim.lr_scheduler` that may be defined during the `optimize` method. - **data_loader** -- A torch.utils.data.DataLoader that is defined by calling the `setup_dataloader` method. @@ -57,7 +57,7 @@ def __init__(self, model: CDIModel, dataset: Ptycho2DDataset, subset: List[int] = None): - # Store parameters as attributes of Optimizer + # Store parameters as attributes of Reconstructor self.subset = subset self.multi_gpu_used = model.multi_gpu_used self.world_size = model.world_size From 7fb2b966a16629990cd961eabefd96c58abaf040 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 10 Jun 2025 15:57:07 +0000 Subject: [PATCH 059/115] Updated __init__.py in optimizer --- src/cdtools/optimizer/__init__.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/cdtools/optimizer/__init__.py b/src/cdtools/optimizer/__init__.py index 4777830a..d5072d70 100644 --- a/src/cdtools/optimizer/__init__.py +++ b/src/cdtools/optimizer/__init__.py @@ -4,13 +4,8 @@ # We define __all__ to be sure that import * only imports what we want __all__ = [ - 'CDIModel', - 'SimplePtycho', - 'FancyPtycho', - 'Bragg2DPtycho', - 'Multislice2DPtycho', - 'MultislicePtycho', - 'RPI', + 'Reconstructor', + 'Adam' ] from cdtools.optimizer.base import Reconstructor From 2958aff5a84e25e1c2fb0414c378665e27436042 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 10 Jun 2025 16:23:41 +0000 Subject: [PATCH 060/115] Renamed the optimizer module to reconstructors --- 
examples/distributed_speed_test.py | 11 +++++------ examples/fancy_ptycho_comparison.py | 9 ++++----- ...cho_multi_gpu_ddp.py => fancy_ptycho_multi_gpu.py} | 3 +-- examples/fancy_ptycho_optimizer.py | 3 +-- src/cdtools/__init__.py | 4 ++-- src/cdtools/models/base.py | 8 ++++---- src/cdtools/{optimizer => reconstructors}/__init__.py | 4 ++-- src/cdtools/{optimizer => reconstructors}/adam.py | 2 +- src/cdtools/{optimizer => reconstructors}/base.py | 0 9 files changed, 20 insertions(+), 24 deletions(-) rename examples/{fancy_ptycho_multi_gpu_ddp.py => fancy_ptycho_multi_gpu.py} (97%) rename src/cdtools/{optimizer => reconstructors}/__init__.py (65%) rename src/cdtools/{optimizer => reconstructors}/adam.py (99%) rename src/cdtools/{optimizer => reconstructors}/base.py (100%) diff --git a/examples/distributed_speed_test.py b/examples/distributed_speed_test.py index ccba2843..6a4928b4 100644 --- a/examples/distributed_speed_test.py +++ b/examples/distributed_speed_test.py @@ -11,7 +11,6 @@ import cdtools from cdtools.models import CDIModel -import cdtools.optimizer from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.tools.distributed import distributed import torch.multiprocessing as mp @@ -74,8 +73,8 @@ def reconstruct(model: CDIModel, model.to(device=device) dataset.get_as(device=device) - # Set up the Reconstructor with the Adam optimizer - recon = cdtools.optimizer.Adam(model,dataset) + # Set up the Reconstructor with the Adam reconstructor + recon = cdtools.reconstructors.Adam(model,dataset) # Perform reconstructions on either single or multi-GPU workflows. 
if TEST == 'fancy_ptycho': @@ -140,9 +139,9 @@ def run_test(world_sizes: int, """ # Load the dataset and model if TEST == 'fancy_ptycho': - filename = 'cdtools/examples/example_data/lab_ptycho_data.cxi' + filename = 'examples/example_data/lab_ptycho_data.cxi' elif TEST == 'gold_balls': - filename = 'cdtools/examples/example_data/AuBalls_700ms_30nmStep_3_6SS_filter.cxi' + filename = 'examples/example_data/AuBalls_700ms_30nmStep_3_6SS_filter.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) @@ -264,7 +263,7 @@ def run_test(world_sizes: int, device_ids = [1, 2, 5, 7] # How many reconstruction runs to perform for statistics - runs = 3 + runs = 1 # Run the test run_test(world_sizes, device_ids, runs) diff --git a/examples/fancy_ptycho_comparison.py b/examples/fancy_ptycho_comparison.py index c8d4b71a..df952423 100644 --- a/examples/fancy_ptycho_comparison.py +++ b/examples/fancy_ptycho_comparison.py @@ -7,7 +7,6 @@ import cdtools -import cdtools.optimizer import torch as t import numpy as np import time @@ -38,13 +37,13 @@ def reload_model(): return copy.deepcopy(model_original) -# For running the optimizer class +# For running the reconstructors class numiter = 5 # Set stuff up for plots fig, (ax1,ax2) = plt.subplots(1,2) -for option in ('old_method', 'optimizer'): +for option in ('old_method', 'reconstructors'): time_list = [] loss_hist_list = [] @@ -63,8 +62,8 @@ def report_n_record(): print(model.report()) local_time_list.append(time.time() - t_start) - if option == 'optimizer': - recon = cdtools.optimizer.Adam(model, dataset) + if option == 'reconstructors': + recon = cdtools.reconstructors.Adam(model, dataset) for loss in recon.optimize(20, lr=0.005, batch_size=50): report_n_record() for loss in recon.optimize(20, lr=0.002, batch_size=100): diff --git a/examples/fancy_ptycho_multi_gpu_ddp.py b/examples/fancy_ptycho_multi_gpu.py similarity index 97% rename from examples/fancy_ptycho_multi_gpu_ddp.py rename to examples/fancy_ptycho_multi_gpu.py 
index 21ad2c89..9c1533f8 100644 --- a/examples/fancy_ptycho_multi_gpu_ddp.py +++ b/examples/fancy_ptycho_multi_gpu.py @@ -1,5 +1,4 @@ import cdtools -import cdtools.optimizer from matplotlib import pyplot as plt # We need to import 2 additional functions @@ -56,7 +55,7 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): if rank == 0: model.inspect(dataset) """ - recon = cdtools.optimizer.Adam(model,dataset) + recon = cdtools.reconstructors.Adam(model,dataset) if rank == 0: model.inspect(dataset) diff --git a/examples/fancy_ptycho_optimizer.py b/examples/fancy_ptycho_optimizer.py index 3011a53b..79f9c4ed 100644 --- a/examples/fancy_ptycho_optimizer.py +++ b/examples/fancy_ptycho_optimizer.py @@ -1,5 +1,4 @@ import cdtools -import cdtools.optimizer from matplotlib import pyplot as plt filename = 'examples/example_data/lab_ptycho_data.cxi' @@ -22,7 +21,7 @@ # An Adam Reconstructor object is created to perform Adam # optimization on the FancyPtycho model and dataset -recon = cdtools.optimizer.Adam(model, dataset) +recon = cdtools.reconstructors.Adam(model, dataset) # The learning rate parameter sets the alpha for Adam. # The beta parameters are (0.9, 0.999) by default diff --git a/src/cdtools/__init__.py b/src/cdtools/__init__.py index 96842075..91322091 100644 --- a/src/cdtools/__init__.py +++ b/src/cdtools/__init__.py @@ -4,9 +4,9 @@ warnings.filterwarnings("ignore", message='To copy construct from a tensor, ') -__all__ = ['tools', 'datasets', 'models'] +__all__ = ['tools', 'datasets', 'models', 'reconstructors'] from cdtools import tools from cdtools import datasets from cdtools import models - +from cdtools import reconstructors diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 7994b9e2..49ad8f5b 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -598,13 +598,13 @@ def Adam_optimize( # of the model. 
But since cdtoptim.Adam also depends on CDIModel, # this seems to give rise to a circular import error unless # we import cdtools.optimizer within this method: - import cdtools.optimizer as cdtoptim + from cdtools.reconstructors import Adam # Next, we want to create an Optimizer.Adam if one does not already exist. if not hasattr(self, 'optimizer'): - self.optimizer = cdtoptim.Adam(model=self, - dataset=dataset, - subset=subset) + self.optimizer = Adam(model=self, + dataset=dataset, + subset=subset) # Run some reconstructions return self.optimizer.optimize(iterations=iterations, diff --git a/src/cdtools/optimizer/__init__.py b/src/cdtools/reconstructors/__init__.py similarity index 65% rename from src/cdtools/optimizer/__init__.py rename to src/cdtools/reconstructors/__init__.py index d5072d70..07ac638d 100644 --- a/src/cdtools/optimizer/__init__.py +++ b/src/cdtools/reconstructors/__init__.py @@ -8,6 +8,6 @@ 'Adam' ] -from cdtools.optimizer.base import Reconstructor -from cdtools.optimizer.adam import Adam +from cdtools.reconstructors.base import Reconstructor +from cdtools.reconstructors.adam import Adam diff --git a/src/cdtools/optimizer/adam.py b/src/cdtools/reconstructors/adam.py similarity index 99% rename from src/cdtools/optimizer/adam.py rename to src/cdtools/reconstructors/adam.py index 6ee54dc3..388e9ba3 100644 --- a/src/cdtools/optimizer/adam.py +++ b/src/cdtools/reconstructors/adam.py @@ -15,7 +15,7 @@ from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.models import CDIModel from typing import Tuple, List, Union -from cdtools.optimizer import Reconstructor +from cdtools.reconstructors import Reconstructor __all__ = ['Adam'] diff --git a/src/cdtools/optimizer/base.py b/src/cdtools/reconstructors/base.py similarity index 100% rename from src/cdtools/optimizer/base.py rename to src/cdtools/reconstructors/base.py From 934135524a23eab22d5051d214a9abaa7c8fb33e Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 12 Jun 2025 04:09:49 
+0000 Subject: [PATCH 061/115] Separated LBFGS from CDIModel into a Reconstructor subclass --- src/cdtools/models/base.py | 49 ++++----- src/cdtools/reconstructors/__init__.py | 5 +- src/cdtools/reconstructors/lbfgs.py | 133 +++++++++++++++++++++++++ 3 files changed, 162 insertions(+), 25 deletions(-) create mode 100644 src/cdtools/reconstructors/lbfgs.py diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 49ad8f5b..e3fb0d24 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -590,14 +590,14 @@ def Adam_optimize( only the calculation speed """ - # We want to have model.Adam_optimize call AND store cdtoptim.Adam.optimize + # We want to have model.Adam_optimize call AND store cdtools.reconstructors.Adam # to be able to perform reconstructions without creating a new # optimizer each time we update the hyperparameters. # - # The only way to do this is to make cdtoptim.Adam an attribute - # of the model. But since cdtoptim.Adam also depends on CDIModel, + # The only way to do this is to make the Adam reconstructor an attribute + # of the model. But since the Adam reconstructor also depends on CDIModel, # this seems to give rise to a circular import error unless - # we import cdtools.optimizer within this method: + # we import cdtools.reconstructors within this method: from cdtools.reconstructors import Adam # Next, we want to create an Optimizer.Adam if one does not already exist. @@ -650,26 +650,29 @@ def LBFGS_optimize(self, iterations, dataset, Default True, whether to run the computation in a separate thread to allow interaction with plots during computation. """ - if subset is not None: - # if just one pattern, turn into a list for convenience - if type(subset) == type(1): - subset = [subset] - dataset = torchdata.Subset(dataset, subset) - - # Make a dataloader. 
This basically does nothing but load all the - # data at once - data_loader = torchdata.DataLoader(dataset, batch_size=len(dataset)) - - - # Define the optimizer - optimizer = t.optim.LBFGS(self.parameters(), - lr = lr, history_size=history_size, - line_search_fn=line_search_fn) + # We want to have model.LBFGS_optimize call AND store cdtools.reconstructors.LBFGS + # to be able to perform reconstructions without creating a new + # optimizer each time we update the hyperparameters. + # + # The only way to do this is to make the LBFGS reconstructor an attribute + # of the model. But since the LBFGS reconstructor also depends on CDIModel, + # this seems to give rise to a circular import error unless + # we import cdtools.reconstructors within this method: + from cdtools.reconstructors import LBFGS - return self.AD_optimize(iterations, data_loader, optimizer, - regularization_factor=regularization_factor, - thread=thread, - calculation_width=calculation_width) + # Next, we want to create an Optimizer.Adam if one does not already exist. 
+ if not hasattr(self, 'optimizer'): + self.optimizer = LBFGS(model=self, + dataset=dataset, + subset=subset) + + # Run some reconstructions + return self.optimizer.optimize(iterations=iterations, + lr=lr, + history_size=history_size, + regularization_factor=regularization_factor, + thread=thread, + calculation_width=calculation_width) def SGD_optimize(self, iterations, dataset, batch_size=None, diff --git a/src/cdtools/reconstructors/__init__.py b/src/cdtools/reconstructors/__init__.py index 07ac638d..c2a3bb54 100644 --- a/src/cdtools/reconstructors/__init__.py +++ b/src/cdtools/reconstructors/__init__.py @@ -5,9 +5,10 @@ # We define __all__ to be sure that import * only imports what we want __all__ = [ 'Reconstructor', - 'Adam' + 'Adam', + 'LBFGS' ] from cdtools.reconstructors.base import Reconstructor from cdtools.reconstructors.adam import Adam - +from cdtools.reconstructors.lbfgs import LBFGS diff --git a/src/cdtools/reconstructors/lbfgs.py b/src/cdtools/reconstructors/lbfgs.py new file mode 100644 index 00000000..c4097180 --- /dev/null +++ b/src/cdtools/reconstructors/lbfgs.py @@ -0,0 +1,133 @@ +"""This module contains the LBFGS Reconstructor subclass for performing +optimization ('reconstructions') on ptychographic/CDI models using +the LBFGS optimizer. + +The Reconstructor class is designed to resemble so-called +'Trainer' classes that (in the language of the AI/ML folks) handles +the 'training' of a model given some dataset and optimizer. +""" +import torch as t +from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset +from cdtools.models import CDIModel +from typing import Tuple, List, Union +from cdtools.reconstructors import Reconstructor + +__all__ = ['LBFGS'] + +class LBFGS(Reconstructor): + """ + The LBFGS Reconstructor subclass handles the optimization ('reconstruction') of + ptychographic models and datasets using the LBFGS optimizer. 
+ + Parameters + ---------- + model: CDIModel + Model for CDI/ptychography reconstruction + dataset: Ptycho2DDataset + The dataset to reconstruct against + subset : list(int) or int + Optional, a pattern index or list of pattern indices to use + schedule : bool + Optional, create a learning rate scheduler (torch.optim.lr_scheduler._LRScheduler) + + Important attributes: + - **model** -- Always points to the core model used. + - **multi_gpu_used** -- Whether or not multi-GPU computation will be performed + using a distributed data approach. This attribute will be pulled from the + CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). + - **optimizer** -- This class by default uses `torch.optim.LBFGS` to perform + optimizations. + - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the `optimize` method. + - **data_loader** -- A torch.utils.data.DataLoader that is defined by calling the + `setup_dataloader` method. + """ + def __init__(self, + model: CDIModel, + dataset: Ptycho2DDataset, + subset: List[int] = None): + + super().__init__(model, dataset, subset) + + # Define the optimizer for use in this subclass + self.optimizer = t.optim.LBFGS(self.model.parameters()) + + def adjust_optimizer(self, + lr: int = 0.005, + history_size: int = 2, + line_search_fn: str = None): + """ + Change hyperparameters for the utilized optimizer. + + Parameters + ---------- + lr : float + Optional, The learning rate (alpha) to use. Default is 0.005. 0.05 is + typically the highest possible value with any chance of being stable + history_size : int + Optional, the length of the history to use. 
+ line_search_fn : str + Optional, either `strong_wolfe` or None + """ + for param_group in self.optimizer.param_groups: + param_group['lr'] = lr + param_group['history_size'] = history_size + param_group['line_search_fn'] = line_search_fn + + + def optimize(self, + iterations: int, + lr: float = 0.1, + history_size: int = 2, + regularization_factor: Union[float, List[float]] = None, + thread: bool = True, + calculation_width: int = 10, + line_search_fn: str = None): + """ + Runs a round of reconstruction using the LBFGS optimizer + + Formerly `CDIModel.LBFGS_optimize` + + This algorithm is often less stable than Adam, however in certain + situations or geometries it can be shockingly efficient. Like all + the other optimization routines, it is defined as a generator + function which yields the average loss each epoch. + + NOTE: There is no batch size, because it is usually a bad idea to use + LBFGS on anything but all the data at once + + Parameters + ---------- + iterations : int + How many epochs of the algorithm to run + lr : float + Optional, The learning rate (alpha) to use. Default is 0.1. + history_size : int + Optional, the length of the history to use. + regularization_factor : float or list(float) + Optional, if the model has a regularizer defined, the set of parameters to pass + the regularizer method + thread : bool + Default True, whether to run the computation in a separate thread to allow + interaction with plots during computation + calculation_width : int + Default 10, how many translations to pass through at once for each round of + gradient accumulation. Does not affect the result, only the calculation speed + shuffle : bool + Optional, enable/disable shuffling of the dataset. This option + is intended for diagnostic purposes and should be left as True. + """ + # 1) The subset statement is contained in Reconstructor.__init__ + + # 2) Set up / re-initialize the data loader. For LBFGS, we load + # all the data at once. 
+ self.setup_dataloader(batch_size=len(self.dataset)) + + # 3) The optimizer is created in self.__init__, but the + # hyperparameters need to be set up with self.adjust_optimizer + self.adjust_optimizer(lr=lr, history_size=history_size, line_search_fn=line_search_fn) + + # 4) This is analagous to making a call to CDIModel.AD_optimize + return super(LBFGS, self).optimize(iterations, + regularization_factor, + thread, + calculation_width) \ No newline at end of file From 4232906022b6d3f1dd8169e7e497571b9e6002be Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 12 Jun 2025 21:38:47 +0000 Subject: [PATCH 062/115] Separated SGD from CDIModel into a Reconstructor subclass --- src/cdtools/models/base.py | 55 +++++---- src/cdtools/reconstructors/__init__.py | 4 +- src/cdtools/reconstructors/sgd.py | 158 +++++++++++++++++++++++++ 3 files changed, 196 insertions(+), 21 deletions(-) create mode 100644 src/cdtools/reconstructors/sgd.py diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index e3fb0d24..429d0910 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -696,6 +696,13 @@ def SGD_optimize(self, iterations, dataset, batch_size=None, Optional, the learning rate to use momentum : float Optional, the length of the history to use. + dampening : float + Optional, dampening for the momentum + weight_decay : float + Optional, weight decay (L2 penalty) + nesterov : bool + Optional, enables Nesterov momentum. Only applicable when momentum + is non-zero. 
subset : list(int) or int Optional, a pattern index or list of pattern indices to use regularization_factor : float or list(float) @@ -703,23 +710,24 @@ def SGD_optimize(self, iterations, dataset, batch_size=None, thread : bool Default True, whether to run the computation in a separate thread to allow interaction with plots during computation calculation_width : int - Default 1, how many translations to pass through at once for each round of gradient accumulation + Default 10, how many translations to pass through at once for each round of gradient accumulation """ - - if subset is not None: - # if just one pattern, turn into a list for convenience - if type(subset) == type(1): - subset = [subset] - dataset = torchdata.Subset(dataset, subset) - - # Make a dataloader - if batch_size is not None: - data_loader = torchdata.DataLoader(dataset, batch_size=batch_size, - shuffle=True) - else: - data_loader = torchdata.DataLoader(dataset) - + # We want to have model.SGD_optimize call AND store cdtools.reconstructors.SGD + # to be able to perform reconstructions without creating a new + # optimizer each time we update the hyperparameters. + # + # The only way to do this is to make the SGD reconstructor an attribute + # of the model. But since the SGD reconstructor also depends on CDIModel, + # this seems to give rise to a circular import error unless + # we import cdtools.reconstructors within this method: + from cdtools.reconstructors import SGD + + # Next, we want to create an SGD reconstructor if one does not already exist. 
+ if not hasattr(self, 'optimizer'): + self.optimizer = SGD(model=self, + dataset=dataset, + subset=subset) # Define the optimizer optimizer = t.optim.SGD(self.parameters(), @@ -727,11 +735,18 @@ def SGD_optimize(self, iterations, dataset, batch_size=None, dampening=dampening, weight_decay=weight_decay, nesterov=nesterov) - - return self.AD_optimize(iterations, data_loader, optimizer, - regularization_factor=regularization_factor, - thread=thread, - calculation_width=calculation_width) + + # Run some reconstructions + return self.optimizer.optimize(iterations=iterations, + batch_size=batch_size, + lr=lr, + momentum=momentum, + dampening=dampening, + weight_decay=weight_decay, + nesterov=nesterov, + regularization_factor=regularization_factor, + thread=thread, + calculation_width=calculation_width) def report(self): diff --git a/src/cdtools/reconstructors/__init__.py b/src/cdtools/reconstructors/__init__.py index c2a3bb54..84b96ab4 100644 --- a/src/cdtools/reconstructors/__init__.py +++ b/src/cdtools/reconstructors/__init__.py @@ -6,9 +6,11 @@ __all__ = [ 'Reconstructor', 'Adam', - 'LBFGS' + 'LBFGS', + 'SGD' ] from cdtools.reconstructors.base import Reconstructor from cdtools.reconstructors.adam import Adam from cdtools.reconstructors.lbfgs import LBFGS +from cdtools.reconstructors.sgd import SGD diff --git a/src/cdtools/reconstructors/sgd.py b/src/cdtools/reconstructors/sgd.py new file mode 100644 index 00000000..266db20b --- /dev/null +++ b/src/cdtools/reconstructors/sgd.py @@ -0,0 +1,158 @@ +"""This module contains the SGD Reconstructor subclass for performing +optimization ('reconstructions') on ptychographic/CDI models using +stochastic gradient descent. + +The Reconstructor class is designed to resemble so-called +'Trainer' classes that (in the language of the AI/ML folks) handles +the 'training' of a model given some dataset and optimizer. 
+""" +import torch as t +from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset +from cdtools.models import CDIModel +from typing import Tuple, List, Union +from cdtools.reconstructors import Reconstructor + +__all__ = ['SGD'] + +class SGD(Reconstructor): + """ + The SGD Reconstructor subclass handles the optimization ('reconstruction') of + ptychographic models and datasets using the SGD optimizer. + + Parameters + ---------- + model: CDIModel + Model for CDI/ptychography reconstruction + dataset: Ptycho2DDataset + The dataset to reconstruct against + subset : list(int) or int + Optional, a pattern index or list of pattern indices to use + + Important attributes: + - **model** -- Always points to the core model used. + - **multi_gpu_used** -- Whether or not multi-GPU computation will be performed + using a distributed data approach. This attribute will be pulled from the + CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). + - **optimizer** -- This class by default uses `torch.optim.SGD` to perform + optimizations. + - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the `optimize` method. + - **data_loader** -- A torch.utils.data.DataLoader that is defined by calling the + `setup_dataloader` method. + """ + def __init__(self, + model: CDIModel, + dataset: Ptycho2DDataset, + subset: List[int] = None): + + super().__init__(model, dataset, subset) + + # Define the optimizer for use in this subclass + self.optimizer = t.optim.SGD(self.model.parameters()) + + def adjust_optimizer(self, + lr: int = 0.005, + momentum: float = 0, + dampening: float = 0, + weight_decay: float = 0, + nesterov: bool = False): + """ + Change hyperparameters for the utilized optimizer. + + Parameters + ---------- + lr : float + Optional, The learning rate (alpha) to use. Default is 0.005. 
0.05 is + typically the highest possible value with any chance of being stable + momentum : float + Optional, the momentum factor to use. + dampening : float + Optional, dampening for the momentum + weight_decay : float + Optional, weight decay (L2 penalty) + nesterov : bool + Optional, enables Nesterov momentum. Only applicable when momentum + is non-zero. + """ + for param_group in self.optimizer.param_groups: + param_group['lr'] = lr + param_group['momentum'] = momentum + param_group['dampening'] = dampening + param_group['weight_decay'] = weight_decay + param_group['nesterov'] = nesterov + + + def optimize(self, + iterations: int, + batch_size: int = None, + lr: float = 0.01, + momentum: float = 0, + dampening: float = 0, + weight_decay: float = 0, + nesterov: bool = False, + regularization_factor: Union[float, List[float]] = None, + thread: bool = True, + calculation_width: int = 10, + shuffle: bool = True): + """ + Runs a round of reconstruction using the SGD optimizer + + Formerly `CDIModel.SGD_optimize` + + This calls the Reconstructor.optimize superclass method + (formerly `CDIModel.AD_optimize`) to run a round of reconstruction + once the dataloader and optimizer hyperparameters have been + set up. + + Parameters + ---------- + iterations : int + How many epochs of the algorithm to run + batch_size : int + Optional, the size of the minibatches to use. Default is None, which uses a batch size of 1 without shuffling. + lr : float + Optional, The learning rate (alpha) to use. Default is 0.01. 0.05 is + typically the highest possible value with any chance of being stable + momentum : float + Optional, the momentum factor to use. + dampening : float + Optional, dampening for the momentum + weight_decay : float + Optional, weight decay (L2 penalty) + nesterov : bool + Optional, enables Nesterov momentum. Only applicable when momentum + is non-zero. 
+ regularization_factor : float or list(float) + Optional, if the model has a regularizer defined, the set of parameters to pass + the regularizer method + thread : bool + Default True, whether to run the computation in a separate thread to allow + interaction with plots during computation + calculation_width : int + Default 10, how many translations to pass through at once for each round of + gradient accumulation. Does not affect the result, only the calculation speed + shuffle : bool + Optional, enable/disable shuffling of the dataset. This option + is intended for diagnostic purposes and should be left as True. + """ + # 1) The subset statement is contained in Reconstructor.__init__ + + # 2) Set up / re-initialize the data loader + if batch_size is not None: + self.setup_dataloader(batch_size=batch_size, shuffle=shuffle) + else: + # Use default torch dataloader parameters + self.setup_dataloader(batch_size=1, shuffle=False) + + # 3) The optimizer is created in self.__init__, but the + # hyperparameters need to be set up with self.adjust_optimizer + self.adjust_optimizer(lr=lr, + momentum=momentum, + dampening=dampening, + weight_decay=weight_decay, + nesterov=nesterov) + + # 4) This is analogous to making a call to CDIModel.AD_optimize + return super(SGD, self).optimize(iterations, + regularization_factor, + thread, + calculation_width) \ No newline at end of file From ce678004a33f4db59ba965a7667af9b57248a16d Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 12 Jun 2025 21:54:55 +0000 Subject: [PATCH 063/115] Removed CDIModel.AD_optimize --- src/cdtools/models/base.py | 209 ------------------------------------- 1 file changed, 209 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 429d0910..17c36d46 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -328,215 +328,6 @@ def checkpoint(self, *args): self.current_checkpoint_id += 1 - - - def AD_optimize(self, iterations, data_loader, optimizer,\ - 
scheduler=None, regularization_factor=None, thread=True, - calculation_width=10): - """Runs a round of reconstruction using the provided optimizer - - This is the basic automatic differentiation reconstruction tool - which all the other, algorithm-specific tools, use. It is a - generator which yields the average loss each epoch, ending after - the specified number of iterations. - - By default, the computation will be run in a separate thread. This - is done to enable live plotting with matplotlib during a reconstruction. - If the computation was done in the main thread, this would freeze - the plots. This behavior can be turned off by setting the keyword - argument 'thread' to False. - - Parameters - ---------- - iterations : int - How many epochs of the algorithm to run - data_loader : torch.utils.data.DataLoader - A data loader loading the CDataset to reconstruct - optimizer : torch.optim.Optimizer - The optimizer to run the reconstruction with - scheduler : torch.optim.lr_scheduler._LRScheduler - Optional, a learning rate scheduler to use - regularization_factor : float or list(float) - Optional, if the model has a regularizer defined, the set of parameters to pass the regularizer method - thread : bool - Default True, whether to run the computation in a separate thread to allow interaction with plots during computation - calculation_width : int - Default 10, how many translations to pass through at once for each round of gradient accumulation. This does not affect the result, but may affect the calculation speed. - - Yields - ------ - loss : float - The summed loss over the latest epoch, divided by the total diffraction pattern intensity - """ - - def run_epoch(stop_event=None): - """Runs one full epoch of the reconstruction.""" - # If we're using DistributedSampler (likely the case if you're using - # multiple GPUs), we need to tell it which epoch we're on. 
Otherwise - # data shuffling will not work properly - if self.multi_gpu_used: - data_loader.sampler.set_epoch(self.epoch) - - # First, initialize some tracking variables - normalization = 0 - loss = 0 - N = 0 - t0 = time.time() - - # The data loader is responsible for setting the minibatch - # size, so each set is a minibatch - for inputs, patterns in data_loader: - normalization += t.sum(patterns).cpu().numpy() - N += 1 - def closure(): - optimizer.zero_grad() - - # We further break up the minibatch into a set of chunks. - # This lets us use larger minibatches than can fit - # on the GPU at once, while still doing batch processing - # for efficiency - input_chunks = [[inp[i:i + calculation_width] - for inp in inputs] - for i in range(0, len(inputs[0]), - calculation_width)] - pattern_chunks = [patterns[i:i + calculation_width] - for i in range(0, len(inputs[0]), - calculation_width)] - - total_loss = 0 - for inp, pats in zip(input_chunks, pattern_chunks): - # This check allows for graceful exit when threading - if stop_event is not None and stop_event.is_set(): - exit() - - # Run the simulation - sim_patterns = self.forward(*inp) ## TODO: Do a deep dive plotting-per-iteration of this - - # Calculate the loss - if hasattr(self, 'mask'): - loss = self.loss(pats,sim_patterns, mask=self.mask) - else: - loss = self.loss(pats,sim_patterns) - - # And accumulate the gradients - loss.backward() - - # For multi-GPU optimization, we need to average and - # sync the gradients + losses across all participating - # GPUs with an all-reduce call. 
- if self.multi_gpu_used: - for param in self.parameters(): - if param.requires_grad: - dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) - param.grad.data /= self.world_size - - # Sum the loss value across all devices for reporting - dist.all_reduce(loss, op=dist.ReduceOp.SUM) - - # Normalize the accumulating total loss by the number of GPUs used - total_loss += loss.detach() // self.world_size - - - # If we have a regularizer, we can calculate it separately, - # and the gradients will add to the minibatch gradient - if regularization_factor is not None \ - and hasattr(self, 'regularizer'): - loss = self.regularizer(regularization_factor) - loss.backward() - - # For multi-GPU optimization, we need to average and - # sync the gradients + losses across all participating - # GPUs with an all-reduce call. - if self.multi_gpu_used: - for param in self.parameters(): - if param.requires_grad: - dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) - param.grad.data /= self.world_size - - return total_loss - - # This takes the step for this minibatch - loss += optimizer.step(closure).detach().cpu().numpy() - - loss /= normalization - - # We step the scheduler after the full epoch - if scheduler is not None: - scheduler.step(loss) - - self.loss_history.append(loss) - self.epoch = len(self.loss_history) - self.latest_iteration_time = time.time() - t0 - self.training_history += self.report() + '\n' - return loss - - # We store the current optimizer as a model parameter so that - # it can be saved and loaded for checkpointing - self.current_optimizer = optimizer - - # If we don't want to run in a different thread, this is easy - if not thread: - for it in range(iterations): - if self.skip_computation(): - self.epoch = self.epoch + 1 - if len(self.loss_history) >= 1: - yield self.loss_history[-1] - else: - yield float('nan') - continue - - yield run_epoch() - - - # But if we do want to thread, it's annoying: - else: - # Here we set up the communication with the 
computation thread - result_queue = queue.Queue() - stop_event = threading.Event() - def target(): - try: - result_queue.put(run_epoch(stop_event)) - except Exception as e: - # If something bad happens, put the exception into the - # result queue - result_queue.put(e) - - # And this actually starts and monitors the thread - for it in range(iterations): - if self.skip_computation(): - self.epoch = self.epoch + 1 - if len(self.loss_history) >= 1: - yield self.loss_history[-1] - else: - yield float('nan') - continue - - calc = threading.Thread(target=target, name='calculator', daemon=True) - try: - calc.start() - while calc.is_alive(): - if hasattr(self, 'figs'): - self.figs[0].canvas.start_event_loop(0.01) - else: - calc.join() - - except KeyboardInterrupt as e: - stop_event.set() - print('\nAsking execution thread to stop cleanly - please be patient.') - calc.join() - raise e - - res = result_queue.get() - - # If something went wrong in the thead, we'll get an exception - if isinstance(res, Exception): - raise res - - yield res - - # And finally, we unset the current optimizer: - self.current_optimizer = None - def Adam_optimize( self, From a2967d46468cd7a09aafaddc2060f6eaeb689002 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 13 Jun 2025 22:06:23 +0000 Subject: [PATCH 064/115] Cleaned up model and reconstructor docs --- src/cdtools/models/base.py | 163 ++++++++++++++++------------ src/cdtools/reconstructors/adam.py | 2 +- src/cdtools/reconstructors/base.py | 10 +- src/cdtools/reconstructors/lbfgs.py | 3 - src/cdtools/reconstructors/sgd.py | 19 ++-- 5 files changed, 110 insertions(+), 87 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index fc952459..3b8ab161 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -42,7 +42,8 @@ from scipy import io from contextlib import contextmanager from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch - +from 
cdtools.datasets import CDataset +from typing import List, Union, Tuple __all__ = ['CDIModel'] @@ -331,20 +332,21 @@ def checkpoint(self, *args): def Adam_optimize( self, - iterations, - dataset, - batch_size=15, - lr=0.005, - betas=(0.9, 0.999), - schedule=False, - amsgrad=False, - subset=None, - regularization_factor=None, + iterations: int, + dataset: CDataset, + batch_size: int = 15, + lr: float = 0.005, + betas: Tuple[float] = (0.9, 0.999), + schedule: bool = False, + amsgrad: bool = False, + subset: Union[int, List[int]] = None, + regularization_factor: Union[float, List[float]] = None, thread=True, calculation_width=10 ): - """Runs a round of reconstruction using the Adam optimizer from - cdtools.optimizer.Adam. + """ + Runs a round of reconstruction using the Adam optimizer from + cdtools.reconstructors.Adam. This is generally accepted to be the most robust algorithm for use with ptychography. Like all the other optimization routines, @@ -354,31 +356,33 @@ def Adam_optimize( Parameters ---------- iterations : int - How many epochs of the algorithm to run + How many epochs of the algorithm to run. dataset : CDataset - The dataset to reconstruct against + The dataset to reconstruct against. batch_size : int - Optional, the size of the minibatches to use + Optional, the size of the minibatches to use. lr : float Optional, The learning rate (alpha) to use. Defaultis 0.005. 0.05 is typically the highest possible value with any chance - of being stable - betas : tuple + of being stable. + betas : tuple(float) Optional, the beta_1 and beta_2 to use. Default is (0.9, 0.999). - schedule : float - Optional, whether to use the ReduceLROnPlateau scheduler + schedule : bool + Optional, whether to use the ReduceLROnPlateau scheduler. + amsgrad : bool + Optional, whether to use the AMSGrad variant of this algorithm. 
subset : list(int) or int Optional, a pattern index or list of pattern indices to use - regularization_factor : float or list(float) + regularization_factor : float or list(float). Optional, if the model has a regularizer defined, the set of - parameters to pass the regularizer method + parameters to pass the regularizer method. thread : bool Default True, whether to run the computation in a separate thread - to allow interaction with plots during computation + to allow interaction with plots during computation. calculation_width : int Default 10, how many translations to pass through at once for each round of gradient accumulation. Does not affect the result, - only the calculation speed + only the calculation speed. """ # We want to have model.Adam_optimize call AND store cdtools.reconstructors.Adam @@ -409,46 +413,56 @@ def Adam_optimize( calculation_width=calculation_width) - def LBFGS_optimize(self, iterations, dataset, - lr=0.1,history_size=2, subset=None, - regularization_factor=None, thread=True, - calculation_width=10, line_search_fn=None): - """Runs a round of reconstruction using the L-BFGS optimizer + def LBFGS_optimize(self, + iterations: int, + dataset: CDataset, + lr: float = 0.1, + history_size: int = 2, + subset: Union[int, List[int]] = None, + regularization_factor: Union[float, List[float]] =None, + thread: bool = True, + calculation_width: int = 10, + line_search_fn: str = None): + """ + Runs a round of reconstruction using the L-BFGS optimizer from + cdtools.reconstructors.LBFGS. This algorithm is often less stable that Adam, however in certain situations or geometries it can be shockingly efficient. Like all the other optimization routines, it is defined as a generator function which yields the average loss each epoch. 
- Note: There is no batch size, because it is a usually a bad idea to use + NOTE: There is no batch size, because it is a usually a bad idea to use LBFGS on anything but all the data at onece Parameters ---------- iterations : int - How many epochs of the algorithm to run + How many epochs of the algorithm to run. dataset : CDataset - The dataset to reconstruct against + The dataset to reconstruct against. lr : float - Optional, the learning rate to use + Optional, the learning rate to use. history_size : int Optional, the length of the history to use. subset : list(int) or int - Optional, a pattern index or list of pattern indices to ues + Optional, a pattern index or list of pattern indices to use. regularization_factor : float or list(float) - Optional, if the model has a regularizer defined, the set of parameters to pass the regularizer method + Optional, if the model has a regularizer defined, the set of parameters + to pass the regularizer method. thread : bool - Default True, whether to run the computation in a separate thread to allow interaction with plots during computation. - + Default True, whether to run the computation in a separate thread to allow + interaction with plots during computation. + calculation_width : int + Default 10, how many translations to pass through at once for each round of + gradient accumulation. Does not affect the result, only the calculation speed """ - # We want to have model.LBFGS_optimize call AND store cdtools.reconstructors.LBFGS - # to be able to perform reconstructions without creating a new - # optimizer each time we update the hyperparameters. + # We want to have model.LBFGS_optimize store cdtools.reconstructors.LBFGS + # as an attribute to run reconstructions without generating new reconstructors + # each time CDIModel.LBFGS_optimize is called. # - # The only way to do this is to make the LBFGS reconstructor an attribute - # of the model. 
But since the LBFGS reconstructor also depends on CDIModel, - # this seems to give rise to a circular import error unless - # we import cdtools.reconstructors within this method: + # Since the LBFGS reconstructor also depends on CDIModel, a circular import error + # arises unless we import cdtools.reconstructors within this method: from cdtools.reconstructors import LBFGS # Next, we want to create an Optimizer.Adam if one does not already exist. @@ -463,14 +477,26 @@ def LBFGS_optimize(self, iterations, dataset, history_size=history_size, regularization_factor=regularization_factor, thread=thread, - calculation_width=calculation_width) - - - def SGD_optimize(self, iterations, dataset, batch_size=None, - lr=0.01, momentum=0, dampening=0, weight_decay=0, - nesterov=False, subset=None, regularization_factor=None, - thread=True, calculation_width=10): - """Runs a round of reconstruction using the SGD optimizer + calculation_width=calculation_width, + line_search_fn = line_search_fn) + + + def SGD_optimize(self, + iterations: int, + dataset: CDataset, + batch_size: int = None, + lr: float = 2e-7, + momentum: float = 0, + dampening: float = 0, + weight_decay: float = 0, + nesterov: bool = False, + subset: Union[int, List[int]] = None, + regularization_factor: Union[float, List[float]] = None, + thread: bool = True, + calculation_width: int = 10): + """ + Runs a round of reconstruction using the SGD optimizer from + cdtools.reconstructors.SGD. This algorithm is often less stable that Adam, but it is simpler and is the basic workhorse of gradience descent. @@ -478,40 +504,41 @@ def SGD_optimize(self, iterations, dataset, batch_size=None, Parameters ---------- iterations : int - How many epochs of the algorithm to run + How many epochs of the algorithm to run. dataset : CDataset - The dataset to reconstruct against + The dataset to reconstruct against. batch_size : int - Optional, the size of the minibatches to use + Optional, the size of the minibatches to use. 
lr : float - Optional, the learning rate to use + Optional, the learning rate to use. momentum : float Optional, the length of the history to use. dampening : float - Optional, dampening for the momentum + Optional, dampening for the momentum. weight_decay : float - Optional, weight decay (L2 penalty) + Optional, weight decay (L2 penalty). nesterov : bool Optional, enables Nesterov momentum. Only applicable when momentum is non-zero. subset : list(int) or int - Optional, a pattern index or list of pattern indices to use + Optional, a pattern index or list of pattern indices to use. regularization_factor : float or list(float) - Optional, if the model has a regularizer defined, the set of parameters to pass the regularizer method + Optional, if the model has a regularizer defined, the set of + parameters to pass the regularizer method. thread : bool - Default True, whether to run the computation in a separate thread to allow interaction with plots during computation + Default True, whether to run the computation in a separate thread + to allow interaction with plots during computation. calculation_width : int - Default 10, how many translations to pass through at once for each round of gradient accumulation + Default 10, how many translations to pass through at once for each + round of gradient accumulation. """ - # We want to have model.SGD_optimize call AND store cdtools.reconstructors.SGD - # to be able to perform reconstructions without creating a new - # optimizer each time we update the hyperparameters. + # We want to have model.SGD_optimize store cdtools.reconstructors.SGD + # as an attribute to run reconstructions without generating new reconstructors + # each time CDIModel.SGD_optimize is called. # - # The only way to do this is to make the SGD reconstructor an attribute - # of the model. 
But since the SGD reconstructor also depends on CDIModel, - # this seems to give rise to a circular import error unless - # we import cdtools.reconstructors within this method: + # Since the SGD reconstructor also depends on CDIModel, a circular import error + # arises unless we import cdtools.reconstructors within this method: from cdtools.reconstructors import SGD # Next, we want to create an Optimizer.Adam if one does not already exist. diff --git a/src/cdtools/reconstructors/adam.py b/src/cdtools/reconstructors/adam.py index 388e9ba3..cbf4378d 100644 --- a/src/cdtools/reconstructors/adam.py +++ b/src/cdtools/reconstructors/adam.py @@ -113,7 +113,7 @@ def optimize(self, Optional, the beta_1 and beta_2 to use. Default is (0.9, 0.999). schedule : bool Optional, create a learning rate scheduler (torch.optim.lr_scheduler._LRScheduler) - amsgra : bool + amsgrad : bool Optional, whether to use the AMSGrad variant of this algorithm regularization_factor : float or list(float) Optional, if the model has a regularizer defined, the set of parameters to pass diff --git a/src/cdtools/reconstructors/base.py b/src/cdtools/reconstructors/base.py index 0ca0d519..4b6ded77 100644 --- a/src/cdtools/reconstructors/base.py +++ b/src/cdtools/reconstructors/base.py @@ -18,7 +18,7 @@ import time from contextlib import contextmanager from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch -from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset +from cdtools.datasets import CDataset from cdtools.models import CDIModel import cdtools.tools.distributed as cdtdist from typing import Tuple, List, Union @@ -28,7 +28,7 @@ class Reconstructor: """ Reconstructor handles the optimization ('reconstruction') of ptychographic - models given a CDIModel (or subclass) and corresponding Ptycho2DDataset. + models given a CDIModel (or subclass) and corresponding CDataset. 
This is a base model that defines all functions Reconstructor subclasses must implement. @@ -37,7 +37,7 @@ class Reconstructor: ---------- model: CDIModel Model for CDI/ptychography reconstruction - dataset: Ptycho2DDataset + dataset: CDataset The dataset to reconstruct against subset : list(int) or int Optional, a pattern index or list of pattern indices to use @@ -55,8 +55,8 @@ class Reconstructor: """ def __init__(self, model: CDIModel, - dataset: Ptycho2DDataset, - subset: List[int] = None): + dataset: CDataset, + subset: Union[int, List[int]] = None): # Store parameters as attributes of Reconstructor self.subset = subset self.multi_gpu_used = model.multi_gpu_used diff --git a/src/cdtools/reconstructors/lbfgs.py b/src/cdtools/reconstructors/lbfgs.py index c4097180..ad1a3a77 100644 --- a/src/cdtools/reconstructors/lbfgs.py +++ b/src/cdtools/reconstructors/lbfgs.py @@ -112,9 +112,6 @@ def optimize(self, calculation_width : int Default 10, how many translations to pass through at once for each round of gradient accumulation. Does not affect the result, only the calculation speed - shuffle : bool - Optional, enable/disable shuffling of the dataset. This option - is intended for diagnostic purposes and should be left as True. """ # 1) The subset statement is contained in Reconstructor.__init__ diff --git a/src/cdtools/reconstructors/sgd.py b/src/cdtools/reconstructors/sgd.py index 266db20b..fdcfe89f 100644 --- a/src/cdtools/reconstructors/sgd.py +++ b/src/cdtools/reconstructors/sgd.py @@ -84,7 +84,7 @@ def adjust_optimizer(self, def optimize(self, iterations: int, batch_size: int = None, - lr: float = 0.01, + lr: float = 2e-7, momentum: float = 0, dampening: float = 0, weight_decay: float = 0, @@ -106,30 +106,29 @@ def optimize(self, Parameters ---------- iterations : int - How many epochs of the algorithm to run + How many epochs of the algorithm to run. batch_size : int - Optional, the size of the minibatches to use. 
Default is + Optional, the size of the minibatches to use. lr : float - Optional, The learning rate (alpha) to use. Default is 0.005. 0.05 is - typically the highest possible value with any chance of being stable + Optional, The learning rate to use. The default is 2e-7. momentum : float Optional, the length of the history to use. dampening : float - Optional, dampening for the momentum + Optional, dampening for the momentum. weight_decay : float - Optional, weight decay (L2 penalty) + Optional, weight decay (L2 penalty). nesterov : bool Optional, enables Nesterov momentum. Only applicable when momentum is non-zero. regularization_factor : float or list(float) Optional, if the model has a regularizer defined, the set of parameters to pass - the regularizer method + the regularizer method. thread : bool Default True, whether to run the computation in a separate thread to allow - interaction with plots during computation + interaction with plots during computation. calculation_width : int Default 10, how many translations to pass through at once for each round of - gradient accumulation. Does not affect the result, only the calculation speed + gradient accumulation. Does not affect the result, only the calculation speed. shuffle : bool Optional, enable/disable shuffling of the dataset. This option is intended for diagnostic purposes and should be left as True. 
From 4ec8399775f59b05f3b3fe4df1e9e8f60a082cda Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 14 Jun 2025 21:52:17 +0000 Subject: [PATCH 065/115] Removed model and dataset dependencies from distributed --- examples/fancy_ptycho_multi_gpu.py | 76 +++++++------------- src/cdtools/models/base.py | 17 ++--- src/cdtools/tools/distributed/distributed.py | 67 ++++++++--------- 3 files changed, 67 insertions(+), 93 deletions(-) diff --git a/examples/fancy_ptycho_multi_gpu.py b/examples/fancy_ptycho_multi_gpu.py index 9c1533f8..07eee3aa 100644 --- a/examples/fancy_ptycho_multi_gpu.py +++ b/examples/fancy_ptycho_multi_gpu.py @@ -1,46 +1,35 @@ import cdtools from matplotlib import pyplot as plt -# We need to import 2 additional functions -from torch.distributed import barrier +# We need to import the distributed package from CDTools from cdtools.tools.distributed import distributed -filename = r'examples/example_data/lab_ptycho_data.cxi' -dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) +filename = r'example_data/lab_ptycho_data.cxi' -model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, - oversampling=2, - probe_support_radius=120, - propagation_distance=5e-3, - units='mm', - obj_view_crop=-50) -model.background.requires_grad=True -# Remove or comment out lines moving the dataset and model to GPU. -# This process will be handled by the cdtools.tools.distributed methods. +# Wrap the rest of the script inside of a function. This function will be +# distributed across several GPUs for multiprocessing at the end. -#device = 'cuda' -#model.to(device=device) -#dataset.get_as(device=device) +def multi_gpu_reconstruct(): + dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + oversampling=2, + probe_support_radius=120, + propagation_distance=5e-3, + units='mm', + obj_view_crop=-50) -# Wrap the rest of the script inside of a function. 
This function will be -# distributed across several GPUs for multiprocessing at the end. -# -# CDTools multi-GPU methods expects the function to be declared as... -# -# def func(model, dataset, rank, world_size): -# -# ...where rank is an integer from [0, world_size-1] assigned to each -# GPU, and world_size is the total number of GPUs used. + model.background.requires_grad=True -def multi_gpu_reconstruct(model, dataset, rank, world_size): - """ - for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=10): - + device= f'cuda' + model.to(device=device) + dataset.get_as(device=device) + + for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=50): # We can still perform model.report, but we want only 1 GPU printing stuff. - if rank == 0: + if model.rank == 0: print(model.report()) # You don't need to add the `if rank == 0` here. @@ -48,31 +37,22 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): model.inspect(dataset) for loss in model.Adam_optimize(50, dataset, lr=0.005, batch_size=50): - if rank == 0: + if model.rank == 0: print(model.report()) if model.epoch % 20 == 0: - if rank == 0: - model.inspect(dataset) - """ - recon = cdtools.reconstructors.Adam(model,dataset) - if rank == 0: - model.inspect(dataset) - - for loss in recon.optimize(50, lr=0.02, batch_size=50): - if rank == 0: print(model.report()) - # Plotting is expensive, so we only do it every tenth epoch - if model.epoch % 10 == 0 and rank == 0: model.inspect(dataset) + #model.tidy_probes() model.inspect(dataset) # You don't need to add the `if rank == 0` here either... model.compare(dataset) - + # ...but you do have to add it here. - if rank == 0: plt.show() + if model.rank == 0: plt.show() + # This will execute the multi_gpu_reconstruct upon running this file # Here, we're... 
# - ...setting up `world_size=4` GPUs to run @@ -80,7 +60,7 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): # the "rank 0 node/machine") is on address `master_addr` # - ...telling CDTools we have a free port on `master_port` on the machine # with rank 0. -# - ...going to wait 60 seconds for the GPUs to do something before +# - ...going to wait 30 seconds for the GPUs to do something before # we terminate the reconstruction. If you want to inspect/compare # the model after reconstruction, consider increasing the timeout. # @@ -88,8 +68,6 @@ def multi_gpu_reconstruct(model, dataset, rank, world_size): # master_addr = 'localhost'. if __name__ == '__main__': distributed.spawn(multi_gpu_reconstruct, - model=model, - dataset=dataset, device_ids = [0,1,2,3], master_addr='localhost', master_port='8888', diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 3b8ab161..1c3e8e6c 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -44,6 +44,7 @@ from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch from cdtools.datasets import CDataset from typing import List, Union, Tuple +import os __all__ = ['CDIModel'] @@ -65,14 +66,14 @@ def __init__(self): self.training_history = '' self.epoch = 0 - # These properties indicate to the CDIModel methods whether or not - # multiple GPUs will be used. The purpose is to allow only 1 GPU to call - # certain methods to prevent the creation of redundant plots/reports/saves - self.rank = None # Rank of the subprocess running the GPU - self.device_id = None # ID of the GPU being used in multi-GPU - self.world_size = 1 # Total number of GPUs being used. - self.multi_gpu_used = False # Self explanatory - + # These attributes indicate to the CDIModel methods whether or not + # multi-GPU calculations are being performed. 
These flags help + # trigger multi-GPU-specific function calls (i.e., all_reduce) and + # prevent redundant plots/reports/saves during multi-GPU use. + self.rank = int(os.environ.get('RANK')) # Rank of the subprocess running the GPU + self.world_size = int(os.environ.get('WORLD_SIZE')) # Total number of GPUs being used. + self.multi_gpu_used = int(self.world_size) > 1 # Self explanatory + def from_dataset(self, dataset): raise NotImplementedError() diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 9c1ad9e9..d089c0c4 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -25,8 +25,6 @@ import datetime import os from multiprocessing.connection import Connection -from cdtools.models import CDIModel -from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from typing import Callable, List __all__ = ['sync_and_avg_gradients', 'distributed_wrapper', 'spawn'] @@ -45,10 +43,9 @@ def sync_and_avg_gradients(model): dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) param.grad.data /= model.world_size + def distributed_wrapper(rank: int, - func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], - model: CDIModel, - dataset: Ptycho2DDataset, + func: Callable[[int, int], None], device_ids: List[int], backend: str = 'nccl', timeout: int = 600, @@ -66,10 +63,6 @@ def distributed_wrapper(rank: int, func: Callable[[CDIModel, Ptycho2DDataset, int, int]] Function wrapping user-defined reconstruction loops. The function must have the following format: func(model, dataset, rank, world_size). 
- model: CDIModel - Model for CDI/ptychography reconstruction - dataset: Ptycho2DDataset - The dataset to reconstruct against device_ids: list[int] List of GPU IDs to use backend: str @@ -91,41 +84,46 @@ def distributed_wrapper(rank: int, # Convert timeout from int to datetime timeout = datetime.timedelta(seconds=timeout) + # Define the world_size + world_size = len(device_ids) + # Update the rank in the model and indicate we're using multiple GPUs - model.rank = rank - model.device_id = device_ids[model.rank] - model.world_size = len(device_ids) + #model.rank = rank + #model.device_id = device_ids[model.rank] + #model.world_size = len(device_ids) # Allow the process to only see the GPU is has been assigned - os.environ['CUDA_VISIBLE_DEVICES'] = str(model.device_id) - - if model.world_size > 1: # In case we need to use 1 GPU for testing - model.multi_gpu_used = True - + os.environ['CUDA_VISIBLE_DEVICES'] = str(rank) + + # Within the called reconstruction function/script, we need to somehow + # set up the multi-GPU model flags (model.rank, model.world_size, + # and model.multi_gpu_used). + # + # One way to do this (without having to modify CDIModel here or explicitly + # setting up the CDIModel attributes in the reconstruction script) is to + # create environment variables for each subprocess. Then, when a model + # is created within each subprocess, it can loop up its own local environment + # variable and set the rank/world_size/multi_gpu_used flags accordingly. 
+ os.environ['WORLD_SIZE'] = str(world_size) + os.environ['RANK'] = str(rank) + os.environ['NCCL_P2P_DISABLE'] = str(int(True)) + # Initialize the process group dist.init_process_group(backend=backend, rank=rank, - world_size=model.world_size, timeout=timeout) + world_size=world_size, timeout=timeout) - # Load the model to the appropriate GPU rank the process is using - device='cuda' - model.to(device=device) - dataset.get_as(device=device) - - # Start the reconstruction loop, but feed in model_DDP.module so we don't - # have to change `model._` to `model.module._` in the CDTools script + # Run the reconstruction script # We also need to check if we want to pass a pipe to the function if pipe is None: - func(model, dataset, rank, model.world_size) + func() else: - func(model, dataset, rank, model.world_size, pipe) + func(pipe) # Destroy process group dist.destroy_process_group() -def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], - model: CDIModel, - dataset: Ptycho2DDataset, +def spawn(func: Callable[[int, int], None], device_ids: List[int], master_addr: str, master_port: str, @@ -145,10 +143,6 @@ def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], func: Callable[[CDIModel, Ptycho2DDataset, int, int]] Function wrapping user-defined reconstruction loops. The function must have the following format: func(model, dataset, rank, world_size). - model: CDIModel - Model for CDI/ptychography reconstruction - dataset: Ptycho2DDataset - The dataset to reconstruct against device_ids: list[int] List of GPU IDs to use master_addr: str @@ -179,9 +173,10 @@ def spawn(func: Callable[[CDIModel, Ptycho2DDataset, int, int], None], os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) # Ensure a "graceful" termination of subprocesses if something goes wrong. 
- print('\nStarting up multi-GPU reconstructions...') + print('\nStarting up multi-GPU reconstructions...\n') mp.spawn(distributed_wrapper, - args=(func, model, dataset, device_ids, backend, timeout, pipe), + args=(func, device_ids, backend, timeout, pipe), nprocs=len(device_ids), join=True) print('Reconstructions complete...') + \ No newline at end of file From bc2669e397313c85fce6d4f2bfc8b20c5130bfa6 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 14 Jun 2025 22:05:06 +0000 Subject: [PATCH 066/115] Fixed CDIModel rank and world_size assignment for single GPU use --- src/cdtools/models/base.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 1c3e8e6c..3fa86910 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -70,10 +70,15 @@ def __init__(self): # multi-GPU calculations are being performed. These flags help # trigger multi-GPU-specific function calls (i.e., all_reduce) and # prevent redundant plots/reports/saves during multi-GPU use. - self.rank = int(os.environ.get('RANK')) # Rank of the subprocess running the GPU - self.world_size = int(os.environ.get('WORLD_SIZE')) # Total number of GPUs being used. - self.multi_gpu_used = int(self.world_size) > 1 # Self explanatory - + rank = os.environ.get('RANK') + world_size = os.environ.get('WORLD_SIZE') + + # Rank of the subprocess running the GPU (defauly rank 0) + self.rank = int(rank) if rank is not None else 0 + # Total number of GPUs being used. 
+ self.world_size = int(world_size) if world_size is not None else 1 + self.multi_gpu_used = int(self.world_size) > 1 + def from_dataset(self, dataset): raise NotImplementedError() From ca2890061282eef62b8fd79a9cb202d8f007f3fe Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 14 Jun 2025 22:16:36 +0000 Subject: [PATCH 067/115] Created working implementation of distributing single-GPU scripts to multi-GPU with torchrun --- examples/fancy_ptycho_torchrun.py | 44 ++++++++++++++++++++++++++++++ examples/single_to_multi_gpu.py | 45 +++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 examples/fancy_ptycho_torchrun.py create mode 100644 examples/single_to_multi_gpu.py diff --git a/examples/fancy_ptycho_torchrun.py b/examples/fancy_ptycho_torchrun.py new file mode 100644 index 00000000..819ea0d4 --- /dev/null +++ b/examples/fancy_ptycho_torchrun.py @@ -0,0 +1,44 @@ +import cdtools +from matplotlib import pyplot as plt + +filename = 'example_data/lab_ptycho_data.cxi' +dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + +# FancyPtycho is the workhorse model +model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, # Use 3 incoherently mixing probe modes + oversampling=2, # Simulate the probe on a 2xlarger real-space array + probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix + propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm + units='mm', # Set the units for the live plots + obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix +) + +device = f'cuda:{model.rank}' +model.to(device=device) +dataset.get_as(device=device) + +# The learning rate parameter sets the alpha for Adam. 
+# The beta parameters are (0.9, 0.999) by default +# The batch size sets the minibatch size +for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=50): + if model.rank == 0: print(model.report()) + # Plotting is expensive, so we only do it every tenth epoch + if model.epoch % 10 == 0: + model.inspect(dataset) + +# It's common to chain several different reconstruction loops. Here, we +# started with an aggressive refinement to find the probe, and now we +# polish the reconstruction with a lower learning rate and larger minibatch +for loss in model.Adam_optimize(50, dataset, lr=0.005, batch_size=50): + if model.rank == 0: print(model.report()) + if model.epoch % 10 == 0: + model.inspect(dataset) + +# This orthogonalizes the recovered probe modes +model.tidy_probes() + +model.inspect(dataset) +model.compare(dataset) +plt.show() diff --git a/examples/single_to_multi_gpu.py b/examples/single_to_multi_gpu.py new file mode 100644 index 00000000..ff48386c --- /dev/null +++ b/examples/single_to_multi_gpu.py @@ -0,0 +1,45 @@ +""" +A wrapper script intended to run single-GPU scripts as +a multi-GPU job + +This script is intended to be called by torchrun. It is set +up so that the group process handling (init and destroy) are +handled here. The core of the function calls the reconstruction +script of interest. + +Currently, this is the torchrun command I'm calling to use +4 GPUs: + +torchrun --nnodes=1 --nproc_per_node=4 single_to_multi_gpu.py +""" +import os +import datetime +import torch as t +import torch.distributed as dist + + +if __name__ == '__main__': + # If this script is called by torchrun, several environment + # variables should be visible that are needed to initiate the + # process group. 
+ rank = int(os.environ.get('RANK')) + world_size = int(os.environ.get('WORLD_SIZE')) + os.environ['NCCL_P2P_DISABLE'] = str(int(True)) + os.environ['CUDA_VISIBLE_DEVICE'] = str(rank) + + timeout = datetime.timedelta(seconds=30) + + # Start up the process group (needed so the different + # subprocesses can talk with each other) + dist.init_process_group(backend='nccl', + rank=rank, + world_size=world_size, + timeout=timeout) + + try: + # Run the single-GPU reconstruction script + import fancy_ptycho_torchrun + + finally: + # Kill the process group + dist.destroy_process_group() \ No newline at end of file From 3b17f10a862e374b6a6903e3cae23b77b9eb979b Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 15 Jun 2025 05:29:38 +0000 Subject: [PATCH 068/115] Fixed bug in CDatasets which uses cuda:0 when t.cuda.set_device is utilized --- examples/fancy_ptycho_torchrun.py | 7 +++++-- examples/single_to_multi_gpu.py | 14 ++++++-------- src/cdtools/datasets/base.py | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/examples/fancy_ptycho_torchrun.py b/examples/fancy_ptycho_torchrun.py index 819ea0d4..585b7ded 100644 --- a/examples/fancy_ptycho_torchrun.py +++ b/examples/fancy_ptycho_torchrun.py @@ -1,7 +1,10 @@ import cdtools from matplotlib import pyplot as plt +import os +import torch as t filename = 'example_data/lab_ptycho_data.cxi' +#filename = 'cdtools/examples/example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) # FancyPtycho is the workhorse model @@ -14,8 +17,8 @@ units='mm', # Set the units for the live plots obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix ) - -device = f'cuda:{model.rank}' +#t.cuda.set_device(5) +device = 'cuda' model.to(device=device) dataset.get_as(device=device) diff --git a/examples/single_to_multi_gpu.py b/examples/single_to_multi_gpu.py index ff48386c..00cfb11a 100644 --- a/examples/single_to_multi_gpu.py +++ 
b/examples/single_to_multi_gpu.py @@ -14,26 +14,24 @@ """ import os import datetime -import torch as t import torch.distributed as dist +import torch as t + if __name__ == '__main__': # If this script is called by torchrun, several environment - # variables should be visible that are needed to initiate the - # process group. + # variables are created that we need to store as variables rank = int(os.environ.get('RANK')) world_size = int(os.environ.get('WORLD_SIZE')) - os.environ['NCCL_P2P_DISABLE'] = str(int(True)) - os.environ['CUDA_VISIBLE_DEVICE'] = str(rank) - timeout = datetime.timedelta(seconds=30) + os.environ['NCCL_P2P_DISABLE'] = str(int(True)) + t.cuda.set_device(rank) + timeout = datetime.timedelta(seconds=300) # Start up the process group (needed so the different # subprocesses can talk with each other) dist.init_process_group(backend='nccl', - rank=rank, - world_size=world_size, timeout=timeout) try: diff --git a/src/cdtools/datasets/base.py b/src/cdtools/datasets/base.py index 3f8ec8c3..e1df0e3e 100644 --- a/src/cdtools/datasets/base.py +++ b/src/cdtools/datasets/base.py @@ -131,6 +131,20 @@ def get_as(self, *args, **kwargs): ---------- Accepts the same parameters as torch.Tensor.to """ + # When running a single-GPU script with single_to_multi_gpu.py, + # each subprocess (running the single-GPU script on one of several + # GPUs) is assigned their own default GPU using `t.cuda.set_device(rank)`. + # This is done to allow the single GPU script to use `device='cuda'` and not + # `device=f'cuda:{rank}'` for ease of use. + # + # Say we set `t.cuda.set_device(3)` and use `device='cuda'` in the single GPU + # script. When `model.to(device=device)` is called, all torch parameters have + # `device='cuda:6'` automatically set. However, when `dataset.get_as` is called, + # we must explicitly define here the GPU ID/rank the dataset will live on or + # else its torch parameters will have `device='cuda'` (i.e., the device is 'cuda:0'). 
+ if ('device', 'cuda') in kwargs.items(): + kwargs['device'] = f'cuda:{t.cuda.current_device()}' + self.get_as_args = (args, kwargs) def __len__(self): From 588b1504b11a8e56b13b2dbd5d1dc7ccaba11ee1 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 15 Jun 2025 05:55:35 +0000 Subject: [PATCH 069/115] Revert "Fixed bug in CDatasets which uses cuda:0 when t.cuda.set_device is utilized" This reverts commit 3b17f10a862e374b6a6903e3cae23b77b9eb979b. --- examples/fancy_ptycho_torchrun.py | 7 ++----- examples/single_to_multi_gpu.py | 14 ++++++++------ src/cdtools/datasets/base.py | 14 -------------- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/examples/fancy_ptycho_torchrun.py b/examples/fancy_ptycho_torchrun.py index 585b7ded..819ea0d4 100644 --- a/examples/fancy_ptycho_torchrun.py +++ b/examples/fancy_ptycho_torchrun.py @@ -1,10 +1,7 @@ import cdtools from matplotlib import pyplot as plt -import os -import torch as t filename = 'example_data/lab_ptycho_data.cxi' -#filename = 'cdtools/examples/example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) # FancyPtycho is the workhorse model @@ -17,8 +14,8 @@ units='mm', # Set the units for the live plots obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix ) -#t.cuda.set_device(5) -device = 'cuda' + +device = f'cuda:{model.rank}' model.to(device=device) dataset.get_as(device=device) diff --git a/examples/single_to_multi_gpu.py b/examples/single_to_multi_gpu.py index 00cfb11a..ff48386c 100644 --- a/examples/single_to_multi_gpu.py +++ b/examples/single_to_multi_gpu.py @@ -14,24 +14,26 @@ """ import os import datetime -import torch.distributed as dist import torch as t - +import torch.distributed as dist if __name__ == '__main__': # If this script is called by torchrun, several environment - # variables are created that we need to store as variables + # variables should be visible that are needed to initiate the + # process group. 
rank = int(os.environ.get('RANK')) world_size = int(os.environ.get('WORLD_SIZE')) - os.environ['NCCL_P2P_DISABLE'] = str(int(True)) - t.cuda.set_device(rank) - timeout = datetime.timedelta(seconds=300) + os.environ['CUDA_VISIBLE_DEVICE'] = str(rank) + + timeout = datetime.timedelta(seconds=30) # Start up the process group (needed so the different # subprocesses can talk with each other) dist.init_process_group(backend='nccl', + rank=rank, + world_size=world_size, timeout=timeout) try: diff --git a/src/cdtools/datasets/base.py b/src/cdtools/datasets/base.py index e1df0e3e..3f8ec8c3 100644 --- a/src/cdtools/datasets/base.py +++ b/src/cdtools/datasets/base.py @@ -131,20 +131,6 @@ def get_as(self, *args, **kwargs): ---------- Accepts the same parameters as torch.Tensor.to """ - # When running a single-GPU script with single_to_multi_gpu.py, - # each subprocess (running the single-GPU script on one of several - # GPUs) is assigned their own default GPU using `t.cuda.set_device(rank)`. - # This is done to allow the single GPU script to use `device='cuda'` and not - # `device=f'cuda:{rank}'` for ease of use. - # - # Say we set `t.cuda.set_device(3)` and use `device='cuda'` in the single GPU - # script. When `model.to(device=device)` is called, all torch parameters have - # `device='cuda:6'` automatically set. However, when `dataset.get_as` is called, - # we must explicitly define here the GPU ID/rank the dataset will live on or - # else its torch parameters will have `device='cuda'` (i.e., the device is 'cuda:0'). 
- if ('device', 'cuda') in kwargs.items(): - kwargs['device'] = f'cuda:{t.cuda.current_device()}' - self.get_as_args = (args, kwargs) def __len__(self): From 50a8b01c629292cd49114977642b8eddb59dbd9c Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sun, 15 Jun 2025 06:37:57 +0000 Subject: [PATCH 070/115] fancy_ptycho example can now be run on several GPUs with no modifications --- examples/fancy_ptycho.py | 2 +- examples/fancy_ptycho_torchrun.py | 44 ------------------------------- examples/single_to_multi_gpu.py | 40 ++++++++++++++++------------ 3 files changed, 24 insertions(+), 62 deletions(-) delete mode 100644 examples/fancy_ptycho_torchrun.py diff --git a/examples/fancy_ptycho.py b/examples/fancy_ptycho.py index 1cf58550..feb63a1f 100644 --- a/examples/fancy_ptycho.py +++ b/examples/fancy_ptycho.py @@ -22,7 +22,7 @@ # The learning rate parameter sets the alpha for Adam. # The beta parameters are (0.9, 0.999) by default # The batch size sets the minibatch size -for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=10): +for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=20): print(model.report()) # Plotting is expensive, so we only do it every tenth epoch if model.epoch % 10 == 0: diff --git a/examples/fancy_ptycho_torchrun.py b/examples/fancy_ptycho_torchrun.py deleted file mode 100644 index 819ea0d4..00000000 --- a/examples/fancy_ptycho_torchrun.py +++ /dev/null @@ -1,44 +0,0 @@ -import cdtools -from matplotlib import pyplot as plt - -filename = 'example_data/lab_ptycho_data.cxi' -dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) - -# FancyPtycho is the workhorse model -model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, # Use 3 incoherently mixing probe modes - oversampling=2, # Simulate the probe on a 2xlarger real-space array - probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix - propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm - units='mm', # Set the 
units for the live plots - obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix -) - -device = f'cuda:{model.rank}' -model.to(device=device) -dataset.get_as(device=device) - -# The learning rate parameter sets the alpha for Adam. -# The beta parameters are (0.9, 0.999) by default -# The batch size sets the minibatch size -for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=50): - if model.rank == 0: print(model.report()) - # Plotting is expensive, so we only do it every tenth epoch - if model.epoch % 10 == 0: - model.inspect(dataset) - -# It's common to chain several different reconstruction loops. Here, we -# started with an aggressive refinement to find the probe, and now we -# polish the reconstruction with a lower learning rate and larger minibatch -for loss in model.Adam_optimize(50, dataset, lr=0.005, batch_size=50): - if model.rank == 0: print(model.report()) - if model.epoch % 10 == 0: - model.inspect(dataset) - -# This orthogonalizes the recovered probe modes -model.tidy_probes() - -model.inspect(dataset) -model.compare(dataset) -plt.show() diff --git a/examples/single_to_multi_gpu.py b/examples/single_to_multi_gpu.py index ff48386c..3578d02b 100644 --- a/examples/single_to_multi_gpu.py +++ b/examples/single_to_multi_gpu.py @@ -1,11 +1,12 @@ """ A wrapper script intended to run single-GPU scripts as -a multi-GPU job +a multi-GPU job when called by torchrun. This script is intended to be called by torchrun. It is set -up so that the group process handling (init and destroy) are -handled here. The core of the function calls the reconstruction -script of interest. +up so that the group process handling (init and destroy) and +definition of several environmental variables are handled here. +The reconstruction script of interest is called by simply +importing the name of the file (minus the .py extension). 
Currently, this is the torchrun command I'm calling to use 4 GPUs: @@ -14,31 +15,36 @@ """ import os import datetime -import torch as t import torch.distributed as dist if __name__ == '__main__': - # If this script is called by torchrun, several environment - # variables should be visible that are needed to initiate the - # process group. - rank = int(os.environ.get('RANK')) - world_size = int(os.environ.get('WORLD_SIZE')) + # Kill the process if it hangs/pauses for a certain amount + # of time. + timeout = datetime.timedelta(seconds=30) + + # Enable/disable NVidia Collective Communications Library (NCCL) + # peer-to-peer communication. If you find that all your GPUs + # are at 100% use but don't seem to be doing anything, try enabling + # this variable. os.environ['NCCL_P2P_DISABLE'] = str(int(True)) - os.environ['CUDA_VISIBLE_DEVICE'] = str(rank) - timeout = datetime.timedelta(seconds=30) + # If this script is called by torchrun, the GPU rank is + # visible as an environment variable. + rank = int(os.environ.get('RANK')) + + # We need to prevent each subprocess from seeing GPUs other + # than the one it has been assigned by torchrun. 
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(rank) # Start up the process group (needed so the different # subprocesses can talk with each other) dist.init_process_group(backend='nccl', - rank=rank, - world_size=world_size, timeout=timeout) - + try: - # Run the single-GPU reconstruction script - import fancy_ptycho_torchrun + # Run the single-GPU reconstruction script by importing it + import fancy_ptycho finally: # Kill the process group From f80989d8f214f68e125e939cbbe80f142f6fd057 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 16 Jun 2025 17:02:28 +0000 Subject: [PATCH 071/115] rank==0 check implemented for several CDIModel file and figure saving/plotting methods --- src/cdtools/models/base.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 3fa86910..5038095b 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -212,6 +212,10 @@ def save_to_h5(self, filename, *args): *args Accepts any additional args that model.save_results needs, for this model """ + # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU + if self.multi_gpu_used and self.rank != 0: + return + return nested_dict_to_h5(filename, self.save_results(*args)) @@ -232,6 +236,10 @@ def save_on_exit(self, filename, *args, exception_filename=None): exception_filename : str Optional, a separate filename to use if an exception is raised during execution. 
Default is equal to filename """ + # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU + if self.multi_gpu_used and self.rank != 0: + return + try: yield self.save_to_h5(filename, *args) @@ -257,6 +265,10 @@ def save_on_exception(self, filename, *args): *args Accepts any additional args that model.save_results needs, for this model """ + # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU + if self.multi_gpu_used and self.rank != 0: + return + try: yield except: @@ -581,6 +593,9 @@ def report(self): report : str A string with basic info on the latest iteration """ + # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU + if self.multi_gpu_used and self.rank != 0: + return if hasattr(self, 'latest_iteration_time'): epoch = len(self.loss_history) dt = self.latest_iteration_time @@ -708,7 +723,10 @@ def save_figures(self, prefix='', extension='.pdf'): extention : strategy Default is .eps, the file extension to save with. """ - + # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU + if self.multi_gpu_used and self.rank != 0: + return + if hasattr(self, 'figs') and self.figs: figs = self.figs else: From 3f4fe57f94ac2ac9f5da8ec70fff7f1be0df1753 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 16 Jun 2025 17:05:08 +0000 Subject: [PATCH 072/115] Removed rank==0 check for CDIModel.report --- src/cdtools/models/base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 5038095b..a704b92f 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -593,9 +593,6 @@ def report(self): report : str A string with basic info on the latest iteration """ - # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU - if self.multi_gpu_used and self.rank != 0: - return if hasattr(self, 'latest_iteration_time'): epoch = len(self.loss_history) dt = self.latest_iteration_time From 742d1ddacf1471b85e0fc3c60c08e2742e24621a 
Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 17 Jun 2025 21:17:38 +0000 Subject: [PATCH 073/115] rank==0 check implemented for CDataset inspect --- src/cdtools/datasets/base.py | 13 +++++++++++++ src/cdtools/datasets/ptycho_2d_dataset.py | 4 +++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/cdtools/datasets/base.py b/src/cdtools/datasets/base.py index 3f8ec8c3..62ad3e43 100644 --- a/src/cdtools/datasets/base.py +++ b/src/cdtools/datasets/base.py @@ -19,6 +19,7 @@ import pathlib from cdtools.tools import data as cdtdata from torch.utils import data as torchdata +import os __all__ = ['CDataset'] @@ -92,6 +93,18 @@ def __init__( self.get_as(device='cpu') + # These attributes indicate to the CDataset methods whether or not + # multi-GPU calculations are being performed. These flags are mostly + # used to prevent the production of duplicate plots when CDataset.inspect + # is called. + rank = os.environ.get('RANK') + world_size = os.environ.get('WORLD_SIZE') + # Rank of the subprocess running the GPU (defauly rank 0) + self.rank = int(rank) if rank is not None else 0 + # Total number of GPUs being used. + self.world_size = int(world_size) if world_size is not None else 1 + self.multi_gpu_used = int(self.world_size) > 1 + def to(self, *args, **kwargs): """Sends the relevant data to the given device and dtype diff --git a/src/cdtools/datasets/ptycho_2d_dataset.py b/src/cdtools/datasets/ptycho_2d_dataset.py index 6631cd68..f36b53f9 100644 --- a/src/cdtools/datasets/ptycho_2d_dataset.py +++ b/src/cdtools/datasets/ptycho_2d_dataset.py @@ -225,7 +225,9 @@ def inspect( can display a base-10 log plot of the detector readout at each position. 
""" - + # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU + if self.multi_gpu_used and self.rank != 0: + return def get_images(idx): inputs, output = self[idx] From efeaf2e3524102d5392a22e20270f8a53af38157 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 17 Jun 2025 21:20:31 +0000 Subject: [PATCH 074/115] Created custom console script cdt-torchrun to launch single-GPU scripts as multi-GPU jobs --- setup.py | 5 + src/cdtools/tools/distributed/distributed.py | 268 ++++++++++++++++-- .../tools/distributed/single_to_multi_gpu.py | 53 ++++ 3 files changed, 297 insertions(+), 29 deletions(-) create mode 100644 src/cdtools/tools/distributed/single_to_multi_gpu.py diff --git a/setup.py b/setup.py index 59d4600f..11855fe3 100644 --- a/setup.py +++ b/setup.py @@ -37,5 +37,10 @@ "Programming Language :: Python :: 3", "Operating System :: OS Independent", ], + entry_points={ + 'console_scripts': { + 'cdt-torchrun = cdtools.tools.distributed.distributed:run_single_to_multi_gpu' + } + } ) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index d089c0c4..3a8109b2 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -24,10 +24,24 @@ import torch.multiprocessing as mp import datetime import os +import sys +import importlib +import subprocess +import inspect +import argparse from multiprocessing.connection import Connection from typing import Callable, List +from pathlib import Path + +DISTRIBUTED_PATH = os.path.dirname(os.path.abspath(__file__)) + +__all__ = ['sync_and_avg_gradients', + 'torchrunner', + 'run_single_to_multi_gpu', + 'wrap_single_gpu_script', + '_spawn_wrapper', + 'spawn'] -__all__ = ['sync_and_avg_gradients', 'distributed_wrapper', 'spawn'] def sync_and_avg_gradients(model): """ @@ -44,12 +58,213 @@ def sync_and_avg_gradients(model): param.grad.data /= model.world_size -def distributed_wrapper(rank: int, - func: Callable[[int, 
int], None], - device_ids: List[int], - backend: str = 'nccl', - timeout: int = 600, - pipe: Connection = None): +def torchrunner(script_name: str, + n_gpus: int = 4): + """ + Executes a torchrun command in a python script or jupyter notebook. + + Parameters: + script_name: str + The file name of the target script + n_gpus: int + Number of GPUs to distribute the job over + """ + + # Perform the torchrun call of the wrapped function + subprocess.run(['torchrun', + '--nnodes=1', + f'--nproc_per_node={n_gpus}', + f'{script_name}']) + + +def run_single_to_multi_gpu(): + """ + Runs a single-GPU reconstruction script as a multi-GPU job via torchrun. + + This function can be executed as `cdt-torchrun` in the command line. + + This function is a wrapper over both the single-GPU wrapping sc + + For example, if we have the reconstruction script `reconstruct.py` and want to use + 4 GPUs, we can write the following: + + ``` + cdt-torchrun --nproc_per_node=4 s script_path=reconstruct.py + ``` + + Arguments: + script_path: str + Path of the single-GPU script (either full or partial path). + --ngpus: int + Number of GPUs to use. + --nnodes: int + Optional, number of nodes. Default 1; more than 1 nodes has not been tested. + --backend: str + Optional, communication backend for distributed computing (either `nccl` or `gloo`). + Default is `nccl` + --timeout: int + Optional, time in seconds before the distributed process is killed. + Default is 30 seconds. + --nccl_p2p_disable: int + Optional, disable (1) or enable (0) NCCL peer-to-peer communication. Default + is 1. 
+ + """ + # Define the arguments we need to pass to dist.script_wrapper + parser = argparse.ArgumentParser() + + parser.add_argument('--ngpus', + type=int, + help='Number of GPUs to use (called --nproc_per_node in torchrun)') + parser.add_argument('--nnodes', + type=str, + default=1, + help='Number of nodes participating in distributive computing.') + parser.add_argument('--backend', + type=str, + default='nccl', + choices=['nccl', 'gloo'], + help='Communication backend (nccl or gloo)') + parser.add_argument('--timeout', + type=int, + default=30, + help='Time before process is killed in seconds') + parser.add_argument('--nccl_p2p_disable', + type=int, + default=1, + choices=[0,1], + help='Disable (1) or enable (0) NCCL peer-to-peer communication') + parser.add_argument('script_path', + type=str, + help='Single GPU script file name (with or without .py extension)') + + # Get the arguments + args = parser.parse_args() + + # Don't let the user die in anticipation + print(f'\n[CDTools]: Starting up multi-GPU reconstructions with {args.ngpus} GPUs.\n') + + # Perform the torchrun call of the wrapped function + subprocess.run(['torchrun', # We set up the torchrun arguments first + '--nnodes=1', + f'--nproc_per_node={args.ngpus}', + os.path.join(DISTRIBUTED_PATH,'single_to_multi_gpu.py'), # Make the call to the single-to-multi-gpu wrapper script + f'--backend={args.backend}', + f'--timeout={args.timeout}', + f'--nccl_p2p_disable={args.nccl_p2p_disable}', + f'--script_path={args.script_path}']) + + # Let the user know the job is done + print(f'\n[CDTools]: Reconstructions complete.\n') + + +def wrap_single_gpu_script(script_path: str, + backend: str = 'nccl', + timeout: int = 30, + nccl_p2p_disable: bool = True): + """ + Wraps single-GPU reconstruction scripts to be ran as a multi-GPU job via + torchrun calls. 
+ + This function is intended to be called in a script (say, single_to_multi_gpu.py) + with the following form: + + ``` + import cdtools.tools.distributed as dist + if __name__ == '__main__': + dist.torchrun_single_to_multi_gpu(**kwargs) + ``` + + torchrun should then be used to run this script as a distributive job using, + for instance: + + ``` + torchrun --nnodes=1 --nproc_per_node=4 single_to_multi_gpu.py + ``` + + Parameters: + script_name: str + The file path of the single-GPU script (either full or relative). + If you're using a relative path, make sure the string doesn't start + with a backslash. + backend: str + Multi-gpu communication backend to use. Default is the 'nccl' backend, + which is the only supported backend for CDTools. + See https://pytorch.org/docs/stable/distributed.html for additional info + about PyTorch-supported backends. + timeout: int + Timeout for operations executed against the process group in seconds. + Default is 30 seconds. After timeout has been reached, all subprocesses + will be aborted and the process calling this method will crash. + nccl_p2p_disable: bool + Disable NCCL peer-2-peer communication + """ + # Check if the script path actually exists + if not os.path.exists(script_path): + raise FileNotFoundError( + f'Cannot open file: {os.path.join(os.getcwd(), script_path)}') + + # Make sure that the script_name doesn't contain `.py` and share + # the same name as any of the imported modules + script_name = Path(script_path).stem + if script_name in sys.modules: + raise NameError( + f'The file name {script_name} cannot share the same name as modules' + ' imported in CDTools. Please change the script file name.') + + # Kill the process if it hangs/pauses for a certain amount of time. + timeout = datetime.timedelta(seconds=timeout) + + # Enable/disable NVidia Collective Communications Library (NCCL) + # peer-to-peer communication. 
If you find that all your GPUs are at 100% use + # but don't seem to be doing anything, try enabling this variable. + os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) + + # If this script is called by torchrun, the GPU rank is visible as an + # environment variable. + rank = int(os.environ.get('RANK')) + + # We need to prevent each subprocess from seeing GPUs other than the one it has + # been assigned by torchrun. + os.environ['CUDA_VISIBLE_DEVICES'] = str(rank) + + # Start up the process group (needed so the different subprocesses can talk with + # each other) + dist.init_process_group(backend=backend, + timeout=timeout) + + try: + # Run the single-GPU reconstruction script by importing it using either full + # or partial paths to the script. + + # We need to create a specification for a module's import-system-related state + spec = importlib.util.spec_from_file_location(script_name, script_path) + + # Next, we need to import the module from spec + module = importlib.util.module_from_spec(spec) + sys.modules[script_name] = module + + # As a safeguard against opening something other than a reconstruction + # script, check if the script imports CDTools. + source_code = inspect.getsource(module) + if not ('import cdtools' in source_code or 'from cdtools' in source_code): + raise ValueError('Only CDTools reconstruction scripts can be used with this method.') + + # Execute the script + spec.loader.exec_module(module) + #importlib.import_module(script_path) + + finally: + # Kill the process group + dist.destroy_process_group() + + +def _spawn_wrapper(rank: int, + func: Callable[[int, int], None], + device_ids: List[int], + backend: str = 'nccl', + timeout: int = 30, + pipe: Connection = None): """ Wraps functions containing reconstruction loops (i.e., `for loss in model.Adam_optimize`) to enable multi-GPU operations to be set up. The @@ -72,7 +287,7 @@ def distributed_wrapper(rank: int, about PyTorch-supported backends. 
timeout: int Timeout for operations executed against the process group in seconds. - Default is 10 minutes. After timeout has been reached, all subprocesses + Default is 30 seconds. After timeout has been reached, all subprocesses will be aborted and the process calling this method will crash. pipe: Connection A Connection object representing one end of a communication pipe. This @@ -87,11 +302,6 @@ def distributed_wrapper(rank: int, # Define the world_size world_size = len(device_ids) - # Update the rank in the model and indicate we're using multiple GPUs - #model.rank = rank - #model.device_id = device_ids[model.rank] - #model.world_size = len(device_ids) - # Allow the process to only see the GPU is has been assigned os.environ['CUDA_VISIBLE_DEVICES'] = str(rank) @@ -106,21 +316,21 @@ def distributed_wrapper(rank: int, # variable and set the rank/world_size/multi_gpu_used flags accordingly. os.environ['WORLD_SIZE'] = str(world_size) os.environ['RANK'] = str(rank) - os.environ['NCCL_P2P_DISABLE'] = str(int(True)) # Initialize the process group dist.init_process_group(backend=backend, rank=rank, world_size=world_size, timeout=timeout) - # Run the reconstruction script - # We also need to check if we want to pass a pipe to the function - if pipe is None: - func() - else: - func(pipe) - - # Destroy process group - dist.destroy_process_group() + try: + # Run the reconstruction script + # We also need to check if we want to pass a pipe to the function + if pipe is None: + func() + else: + func(pipe) + finally: + # Destroy process group + dist.destroy_process_group() def spawn(func: Callable[[int, int], None], @@ -128,7 +338,7 @@ def spawn(func: Callable[[int, int], None], master_addr: str, master_port: str, backend: str = 'nccl', - timeout: int = 600, + timeout: int = 30, nccl_p2p_disable: bool = True, pipe: Connection = None): """ @@ -156,7 +366,7 @@ def spawn(func: Callable[[int, int], None], about PyTorch-supported backends. 
timeout: int Timeout for operations executed against the process group in seconds. - Default is 10 minutes. After timeout has been reached, all subprocesses + Default is 30 seconds. After timeout has been reached, all subprocesses will be aborted and the process calling this method will crash. nccl_p2p_disable: bool Disable NCCL peer-2-peer communication @@ -174,9 +384,9 @@ def spawn(func: Callable[[int, int], None], # Ensure a "graceful" termination of subprocesses if something goes wrong. print('\nStarting up multi-GPU reconstructions...\n') - mp.spawn(distributed_wrapper, - args=(func, device_ids, backend, timeout, pipe), - nprocs=len(device_ids), - join=True) + mp.spawn(_spawn_wrapper, + args=(func, device_ids, backend, timeout, pipe), + nprocs=len(device_ids), + join=True) print('Reconstructions complete...') \ No newline at end of file diff --git a/src/cdtools/tools/distributed/single_to_multi_gpu.py b/src/cdtools/tools/distributed/single_to_multi_gpu.py new file mode 100644 index 00000000..d1eef4e5 --- /dev/null +++ b/src/cdtools/tools/distributed/single_to_multi_gpu.py @@ -0,0 +1,53 @@ +""" +A wrapper script to run single-GPU reconstruction scripts as +a multi-GPU job when called by torchrun. + +This script is intended to be called by torchrun. It is set +up so that the group process handling (init and destroy), +definition of several environmental variables, and actual +execution of the single-GPU script are handled by a single +call to dist.script_wrapper. 
+ +For example, if we have the reconstruction script `reconstruct.py` and want to use +4 GPUs, we can write the following: + +``` +torchrun --nnodes=1 --nproc_per_node=4 single-to-multi-gpu.py --script_path=reconstruct.py +``` +""" +import cdtools.tools.distributed as dist +import argparse + +def get_args(): + # Define the arguments we need to pass to dist.script_wrapper + parser = argparse.ArgumentParser() + parser.add_argument('--script_path', + type=str, + help='Single GPU script file name (with or without .py extension)') + parser.add_argument('--backend', + type=str, + default='nccl', + choices=['nccl', 'gloo'], + help='Communication backend (nccl or gloo)') + parser.add_argument('--timeout', + type=int, + default=30, + help='Time before process is killed in seconds') + parser.add_argument('--nccl_p2p_disable', + type=int, + default=1, + choices=[0,1], + help='Disable (1) or enable (0) NCCL peer-to-peer communication') + + return parser.parse_args() + + +if __name__ == '__main__': + # Get args + args = get_args() + + # Pass arguments to dist.script_wrapper + dist.wrap_single_gpu_script(script_path=args.script_path, + backend=args.backend, + timeout=args.timeout, + nccl_p2p_disable=bool(args.nccl_p2p_disable)) \ No newline at end of file From 83aaa8029011ced38bbb533ed51c80fe15584da1 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Tue, 17 Jun 2025 21:31:16 +0000 Subject: [PATCH 075/115] Removed several multi-gpu example scripts using depracated methods --- examples/fancy_ptycho_comparison.py | 110 ---------------------------- examples/fancy_ptycho_multi_gpu.py | 74 ------------------- examples/fancy_ptycho_optimizer.py | 48 ------------ examples/single_to_multi_gpu.py | 51 ------------- 4 files changed, 283 deletions(-) delete mode 100644 examples/fancy_ptycho_comparison.py delete mode 100644 examples/fancy_ptycho_multi_gpu.py delete mode 100644 examples/fancy_ptycho_optimizer.py delete mode 100644 examples/single_to_multi_gpu.py diff --git 
a/examples/fancy_ptycho_comparison.py b/examples/fancy_ptycho_comparison.py deleted file mode 100644 index df952423..00000000 --- a/examples/fancy_ptycho_comparison.py +++ /dev/null @@ -1,110 +0,0 @@ -"""This script runs reconstructions using both the old -method of cdtools reconstruction (model.Adam_optimize) -and the new method based on the creation of a Reconstructor -class - -""" - - -import cdtools -import torch as t -import numpy as np -import time -import copy -from matplotlib import pyplot as plt - -t.manual_seed(0) - -filename = 'examples/example_data/AuBalls_700ms_30nmStep_3_6SS_filter.cxi' -dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) - -# Create a dict to store loss values -losses = {} - -pad = 10 -dataset.pad(pad) -model_original = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, - probe_support_radius=50, - propagation_distance=2e-6, - units='um', - probe_fourier_crop=pad) -model_original.translation_offsets.data += 0.7 * t.randn_like(model_original.translation_offsets) -model_original.weights.requires_grad = False - -def reload_model(): - return copy.deepcopy(model_original) - - -# For running the reconstructors class -numiter = 5 - -# Set stuff up for plots -fig, (ax1,ax2) = plt.subplots(1,2) - -for option in ('old_method', 'reconstructors'): - time_list = [] - loss_hist_list = [] - - # Iterate n-number of times for statistics - for i in range(numiter): - t.cuda.empty_cache() - model = reload_model() - device = 'cuda' - model.to(device=device) - dataset.get_as(device=device) - # Construct a local time list - local_time_list = [] - t_start = time.time() - - def report_n_record(): - print(model.report()) - local_time_list.append(time.time() - t_start) - - if option == 'reconstructors': - recon = cdtools.reconstructors.Adam(model, dataset) - for loss in recon.optimize(20, lr=0.005, batch_size=50): - report_n_record() - for loss in recon.optimize(20, lr=0.002, batch_size=100): - report_n_record() - for loss in 
recon.optimize(20, lr=0.001, batch_size=100): - report_n_record() - - elif option == 'old_method': - for loss in model.Adam_optimize(20, dataset, lr=0.005, batch_size=50): - report_n_record() - for loss in model.Adam_optimize(20, dataset, lr=0.002, batch_size=100): - report_n_record() - for loss in model.Adam_optimize(20, dataset, lr=0.001, batch_size=100): - report_n_record() - - # After reconstructing, store the loss history and time values - loss_hist_list.append(model.loss_history) - time_list.append(local_time_list) - - # After testing either the new or old method, calculate the statistics and plot - time_mean = np.array(time_list).mean(axis=0)/60 - time_std = np.array(time_list).std(axis=0)/60 - loss_mean = np.array(loss_hist_list).mean(axis=0) - loss_std = np.array(loss_hist_list).std(axis=0) - - ax1.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, - label=option) - ax2.errorbar(np.arange(0,loss_mean.shape[0]), loss_mean, yerr=loss_std, - label=option) - -# Plot -fig.suptitle(f'Comparing old and new optimization refactor | {numiter} runs performed') -ax1.set_yscale('log') -ax1.set_xscale('linear') -ax2.set_yscale('log') -ax2.set_xscale('linear') -ax1.legend() -ax2.legend() -ax1.set_xlabel('Time (min)') -ax1.set_ylabel('Loss') -ax2.set_xlabel('Epochs') -plt.show() - - diff --git a/examples/fancy_ptycho_multi_gpu.py b/examples/fancy_ptycho_multi_gpu.py deleted file mode 100644 index 07eee3aa..00000000 --- a/examples/fancy_ptycho_multi_gpu.py +++ /dev/null @@ -1,74 +0,0 @@ -import cdtools -from matplotlib import pyplot as plt - -# We need to import the distributed package from CDTools -from cdtools.tools.distributed import distributed - -filename = r'example_data/lab_ptycho_data.cxi' - -# Wrap the rest of the script inside of a function. This function will be -# distributed across several GPUs for multiprocessing at the end. 
- -def multi_gpu_reconstruct(): - dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) - - model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, - oversampling=2, - probe_support_radius=120, - propagation_distance=5e-3, - units='mm', - obj_view_crop=-50) - - model.background.requires_grad=True - - device= f'cuda' - model.to(device=device) - dataset.get_as(device=device) - - for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=50): - # We can still perform model.report, but we want only 1 GPU printing stuff. - if model.rank == 0: - print(model.report()) - - # You don't need to add the `if rank == 0` here. - if model.epoch % 20 == 0: - model.inspect(dataset) - - for loss in model.Adam_optimize(50, dataset, lr=0.005, batch_size=50): - if model.rank == 0: - print(model.report()) - - if model.epoch % 20 == 0: - model.inspect(dataset) - - #model.tidy_probes() - model.inspect(dataset) - - # You don't need to add the `if rank == 0` here either... - model.compare(dataset) - - # ...but you do have to add it here. - if model.rank == 0: plt.show() - - -# This will execute the multi_gpu_reconstruct upon running this file -# Here, we're... -# - ...setting up `world_size=4` GPUs to run -# - ...telling CDTools the machine setting up all the connections (called -# the "rank 0 node/machine") is on address `master_addr` -# - ...telling CDTools we have a free port on `master_port` on the machine -# with rank 0. -# - ...going to wait 30 seconds for the GPUs to do something before -# we terminate the reconstruction. If you want to inspect/compare -# the model after reconstruction, consider increasing the timeout. -# -# If you're using a single node (single machine/computer), you can try setting -# master_addr = 'localhost'. 
-if __name__ == '__main__': - distributed.spawn(multi_gpu_reconstruct, - device_ids = [0,1,2,3], - master_addr='localhost', - master_port='8888', - timeout=30) diff --git a/examples/fancy_ptycho_optimizer.py b/examples/fancy_ptycho_optimizer.py deleted file mode 100644 index 79f9c4ed..00000000 --- a/examples/fancy_ptycho_optimizer.py +++ /dev/null @@ -1,48 +0,0 @@ -import cdtools -from matplotlib import pyplot as plt - -filename = 'examples/example_data/lab_ptycho_data.cxi' -dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) - -# FancyPtycho is the workhorse model -model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, # Use 3 incoherently mixing probe modes - oversampling=2, # Simulate the probe on a 2xlarger real-space array - probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix - propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm - units='mm', # Set the units for the live plots - obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix -) - -device = 'cuda' -model.to(device=device) -dataset.get_as(device=device) - -# An Adam Reconstructor object is created to perform Adam -# optimization on the FancyPtycho model and dataset -recon = cdtools.reconstructors.Adam(model, dataset) - -# The learning rate parameter sets the alpha for Adam. -# The beta parameters are (0.9, 0.999) by default -# The batch size sets the minibatch size -for loss in recon.optimize(50, lr=0.02, batch_size=10): - print(model.report()) - # Plotting is expensive, so we only do it every tenth epoch - if model.epoch % 10 == 0: - model.inspect(dataset) - -# It's common to chain several different reconstruction loops. 
Here, we -# started with an aggressive refinement to find the probe, and now we -# polish the reconstruction with a lower learning rate and larger minibatch -for loss in recon.optimize(50, lr=0.005, batch_size=50): - print(model.report()) - if model.epoch % 10 == 0: - model.inspect(dataset) - -# This orthogonalizes the recovered probe modes -model.tidy_probes() - -model.inspect(dataset) -model.compare(dataset) -plt.show() diff --git a/examples/single_to_multi_gpu.py b/examples/single_to_multi_gpu.py deleted file mode 100644 index 3578d02b..00000000 --- a/examples/single_to_multi_gpu.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -A wrapper script intended to run single-GPU scripts as -a multi-GPU job when called by torchrun. - -This script is intended to be called by torchrun. It is set -up so that the group process handling (init and destroy) and -definition of several environmental variables are handled here. -The reconstruction script of interest is called by simply -importing the name of the file (minus the .py extension). - -Currently, this is the torchrun command I'm calling to use -4 GPUs: - -torchrun --nnodes=1 --nproc_per_node=4 single_to_multi_gpu.py -""" -import os -import datetime -import torch.distributed as dist - - -if __name__ == '__main__': - # Kill the process if it hangs/pauses for a certain amount - # of time. - timeout = datetime.timedelta(seconds=30) - - # Enable/disable NVidia Collective Communications Library (NCCL) - # peer-to-peer communication. If you find that all your GPUs - # are at 100% use but don't seem to be doing anything, try enabling - # this variable. - os.environ['NCCL_P2P_DISABLE'] = str(int(True)) - - # If this script is called by torchrun, the GPU rank is - # visible as an environment variable. - rank = int(os.environ.get('RANK')) - - # We need to prevent each subprocess from seeing GPUs other - # than the one it has been assigned by torchrun. 
- os.environ['CUDA_VISIBLE_DEVICES'] = str(rank) - - # Start up the process group (needed so the different - # subprocesses can talk with each other) - dist.init_process_group(backend='nccl', - timeout=timeout) - - try: - # Run the single-GPU reconstruction script by importing it - import fancy_ptycho - - finally: - # Kill the process group - dist.destroy_process_group() \ No newline at end of file From f2d618b2d5273a26edcbc0a0510e718c79a0cacd Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 20 Jun 2025 23:05:14 +0000 Subject: [PATCH 076/115] Distributive methods support GPU ID selection --- src/cdtools/tools/distributed/distributed.py | 46 ++++++++++++++++---- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 3a8109b2..7bc95c76 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -32,6 +32,7 @@ from multiprocessing.connection import Connection from typing import Callable, List from pathlib import Path +from ast import literal_eval DISTRIBUTED_PATH = os.path.dirname(os.path.abspath(__file__)) @@ -92,6 +93,14 @@ def run_single_to_multi_gpu(): cdt-torchrun --nproc_per_node=4 s script_path=reconstruct.py ``` + If you want to use specific GPU IDs for reconstructions, you need to set up + the environment variable `CDTOOLS_GPU_IDS` rather than `CUDA_VISIBLE_DEVICES`. + If you wanted to use GPU IDs `1, 3, 4` for example, write: + + ``` + CDTOOLS_GPU_IDS=1,3,4 cdt-torchrun --nnodes=1 --nproc_per_node=3 reconstruct.py + ``` + Arguments: script_path: str Path of the single-GPU script (either full or partial path). 
@@ -141,12 +150,14 @@ def run_single_to_multi_gpu(): # Get the arguments args = parser.parse_args() + # Set an environment variable for OMP_NUM_THREADS (sets number of threads) + # Don't let the user die in anticipation - print(f'\n[CDTools]: Starting up multi-GPU reconstructions with {args.ngpus} GPUs.\n') + print(f'\n[INFO]: Starting up multi-GPU reconstructions with {args.ngpus} GPUs.\n') # Perform the torchrun call of the wrapped function subprocess.run(['torchrun', # We set up the torchrun arguments first - '--nnodes=1', + f'--nnodes={args.nnodes}', f'--nproc_per_node={args.ngpus}', os.path.join(DISTRIBUTED_PATH,'single_to_multi_gpu.py'), # Make the call to the single-to-multi-gpu wrapper script f'--backend={args.backend}', @@ -155,7 +166,7 @@ def run_single_to_multi_gpu(): f'--script_path={args.script_path}']) # Let the user know the job is done - print(f'\n[CDTools]: Reconstructions complete.\n') + print(f'\n[INFO]: Reconstructions complete.\n') def wrap_single_gpu_script(script_path: str, @@ -182,6 +193,14 @@ def wrap_single_gpu_script(script_path: str, torchrun --nnodes=1 --nproc_per_node=4 single_to_multi_gpu.py ``` + If you want to use specific GPU IDs for reconstructions, you need to set up + the environment variable `CDTOOLS_GPU_IDS` rather than `CUDA_VISIBLE_DEVICES`. + If you wanted to use GPU IDs `1, 3, 4` for example, write: + + ``` + CDTOOLS_GPU_IDS=1,3,4 torchrun --nnodes=1 --nproc_per_node=3 single_to_multi_gpu.py + ``` + Parameters: script_name: str The file path of the single-GPU script (either full or relative). @@ -197,7 +216,9 @@ def wrap_single_gpu_script(script_path: str, Default is 30 seconds. After timeout has been reached, all subprocesses will be aborted and the process calling this method will crash. nccl_p2p_disable: bool - Disable NCCL peer-2-peer communication + Disable NCCL peer-2-peer communication. If you find that all your GPUs + are at 100% useage but the program isn't doing anything, try enabling + this variable. 
""" # Check if the script path actually exists if not os.path.exists(script_path): @@ -216,17 +237,26 @@ def wrap_single_gpu_script(script_path: str, timeout = datetime.timedelta(seconds=timeout) # Enable/disable NVidia Collective Communications Library (NCCL) - # peer-to-peer communication. If you find that all your GPUs are at 100% use - # but don't seem to be doing anything, try enabling this variable. + # peer-to-peer communication. os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) # If this script is called by torchrun, the GPU rank is visible as an # environment variable. rank = int(os.environ.get('RANK')) + # Assign a GPU ID to the subprocess. + # If the CDTOOLS_GPU_IDS environment variable is defined, then assign based + # on the GPU IDS provided in that list. Otherwise, use the rank for the GPU ID. + gpu_ids = os.environ.get('CDTOOLS_GPU_IDS') + + if gpu_ids is None: + gpu_id = rank + else: + gpu_id = literal_eval(gpu_ids)[rank] + # We need to prevent each subprocess from seeing GPUs other than the one it has - # been assigned by torchrun. - os.environ['CUDA_VISIBLE_DEVICES'] = str(rank) + # been assigned. 
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) # Start up the process group (needed so the different subprocesses can talk with # each other) From 664b0b204cbd12841362bf65ab040946abfee480 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 20 Jun 2025 23:05:57 +0000 Subject: [PATCH 077/115] Single-node multi-worker enforced in cdt-torchrun --- src/cdtools/tools/distributed/distributed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 7bc95c76..7a959ffa 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -157,6 +157,7 @@ def run_single_to_multi_gpu(): # Perform the torchrun call of the wrapped function subprocess.run(['torchrun', # We set up the torchrun arguments first + '--standalone', # Indicates that we're running a single machine, multiple GPU job. f'--nnodes={args.nnodes}', f'--nproc_per_node={args.ngpus}', os.path.join(DISTRIBUTED_PATH,'single_to_multi_gpu.py'), # Make the call to the single-to-multi-gpu wrapper script From 400e5efe2f538156c3b3e49598ed4bffa83372d2 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 27 Jun 2025 17:10:13 +0000 Subject: [PATCH 078/115] refactored cdt-torchrun to use runpy --- src/cdtools/tools/distributed/distributed.py | 46 +++++--------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 7a959ffa..77e1a3c8 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -24,14 +24,11 @@ import torch.multiprocessing as mp import datetime import os -import sys -import importlib import subprocess -import inspect import argparse +import runpy from multiprocessing.connection import Connection from typing import Callable, List -from pathlib import Path from ast import literal_eval DISTRIBUTED_PATH = 
os.path.dirname(os.path.abspath(__file__)) @@ -221,19 +218,18 @@ def wrap_single_gpu_script(script_path: str, are at 100% useage but the program isn't doing anything, try enabling this variable. """ - # Check if the script path actually exists + ###################### Check if the script is safe to run ###################### + # 1) Check if the file path actually exists if not os.path.exists(script_path): raise FileNotFoundError( f'Cannot open file: {os.path.join(os.getcwd(), script_path)}') - # Make sure that the script_name doesn't contain `.py` and share - # the same name as any of the imported modules - script_name = Path(script_path).stem - if script_name in sys.modules: - raise NameError( - f'The file name {script_name} cannot share the same name as modules' - ' imported in CDTools. Please change the script file name.') - + # 2) Check if the file is a CDTools reconstruction script. + with open(script_path, 'r') as f: + source_code = f.read() + if not ('import cdtools' in source_code or 'from cdtools' in source_code): + raise ValueError('File is not a CDTools reconstruction script (the script must import cdtools modules).') + # Kill the process if it hangs/pauses for a certain amount of time. timeout = datetime.timedelta(seconds=timeout) @@ -261,29 +257,11 @@ def wrap_single_gpu_script(script_path: str, # Start up the process group (needed so the different subprocesses can talk with # each other) - dist.init_process_group(backend=backend, - timeout=timeout) + dist.init_process_group(backend=backend, timeout=timeout) try: - # Run the single-GPU reconstruction script by importing it using either full - # or partial paths to the script. 
- - # We need to create a specification for a module's import-system-related state - spec = importlib.util.spec_from_file_location(script_name, script_path) - - # Next, we need to import the module from spec - module = importlib.util.module_from_spec(spec) - sys.modules[script_name] = module - - # As a safeguard against opening something other than a reconstruction - # script, check if the script imports CDTools. - source_code = inspect.getsource(module) - if not ('import cdtools' in source_code or 'from cdtools' in source_code): - raise ValueError('Only CDTools reconstruction scripts can be used with this method.') - - # Execute the script - spec.loader.exec_module(module) - #importlib.import_module(script_path) + # Run the single-GPU reconstruction script + runpy.run_path(script_path, run_name='__main__') finally: # Kill the process group From 526afdb46f6bc21dc808790a6b9419cb06340118 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 27 Jun 2025 17:37:32 +0000 Subject: [PATCH 079/115] tidied up wrap_single_gpu_script --- src/cdtools/tools/distributed/distributed.py | 36 +++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 77e1a3c8..7dd08f60 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -173,7 +173,7 @@ def wrap_single_gpu_script(script_path: str, nccl_p2p_disable: bool = True): """ Wraps single-GPU reconstruction scripts to be ran as a multi-GPU job via - torchrun calls. + torchrun calls. 
This function is intended to be called in a script (say, single_to_multi_gpu.py) with the following form: @@ -199,6 +199,9 @@ def wrap_single_gpu_script(script_path: str, CDTOOLS_GPU_IDS=1,3,4 torchrun --nnodes=1 --nproc_per_node=3 single_to_multi_gpu.py ``` + NOTE: For each subprocess `cdt-torchrun` creates, the environment variable + `CUDA_VISIBLE_DEVICES` will be (re)defined as the GPU rank. + Parameters: script_name: str The file path of the single-GPU script (either full or relative). @@ -219,29 +222,24 @@ def wrap_single_gpu_script(script_path: str, this variable. """ ###################### Check if the script is safe to run ###################### + ################################################################################### + # 1) Check if the file path actually exists if not os.path.exists(script_path): - raise FileNotFoundError( - f'Cannot open file: {os.path.join(os.getcwd(), script_path)}') + raise FileNotFoundError(f'Cannot open file: {os.path.join(os.getcwd(), script_path)}') # 2) Check if the file is a CDTools reconstruction script. with open(script_path, 'r') as f: source_code = f.read() if not ('import cdtools' in source_code or 'from cdtools' in source_code): raise ValueError('File is not a CDTools reconstruction script (the script must import cdtools modules).') - - # Kill the process if it hangs/pauses for a certain amount of time. - timeout = datetime.timedelta(seconds=timeout) - # Enable/disable NVidia Collective Communications Library (NCCL) - # peer-to-peer communication. - os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) + ########## Force each subprocess to see only the GPU ID we assign it ########### + ################################################################################### - # If this script is called by torchrun, the GPU rank is visible as an - # environment variable. + # The GPU rank is visible as an environment variable through torchrun calls. 
rank = int(os.environ.get('RANK')) - # Assign a GPU ID to the subprocess. # If the CDTOOLS_GPU_IDS environment variable is defined, then assign based # on the GPU IDS provided in that list. Otherwise, use the rank for the GPU ID. gpu_ids = os.environ.get('CDTOOLS_GPU_IDS') @@ -251,13 +249,17 @@ def wrap_single_gpu_script(script_path: str, else: gpu_id = literal_eval(gpu_ids)[rank] - # We need to prevent each subprocess from seeing GPUs other than the one it has - # been assigned. os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) - # Start up the process group (needed so the different subprocesses can talk with - # each other) - dist.init_process_group(backend=backend, timeout=timeout) + ################################ Run the script ################################# + ################################################################################### + + # Enable/disable NCCL peer-to-peer communication. The boolean needs to be converted into + # a string for the environment variable. + os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) + + # Start up the process group (lets the different subprocesses can talk with each other) + dist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=timeout)) try: # Run the single-GPU reconstruction script From f7e4c2cd24cacb483b2efc2ae0ecca376ee40647 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 30 Jun 2025 19:42:50 +0000 Subject: [PATCH 080/115] Fix to synchronize RNG seed for multi-gpu --- src/cdtools/tools/distributed/distributed.py | 19 ++++++++++++++++--- .../tools/distributed/single_to_multi_gpu.py | 8 ++++++-- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 7dd08f60..a970fa5a 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -20,6 +20,7 @@ DistributedSampler with the DataLoader. 
""" +import torch as t import torch.distributed as dist import torch.multiprocessing as mp import datetime @@ -151,6 +152,9 @@ def run_single_to_multi_gpu(): # Don't let the user die in anticipation print(f'\n[INFO]: Starting up multi-GPU reconstructions with {args.ngpus} GPUs.\n') + + # Set a random seed that all participaring workers will use + seed = t.initial_seed() # Perform the torchrun call of the wrapped function subprocess.run(['torchrun', # We set up the torchrun arguments first @@ -161,7 +165,8 @@ def run_single_to_multi_gpu(): f'--backend={args.backend}', f'--timeout={args.timeout}', f'--nccl_p2p_disable={args.nccl_p2p_disable}', - f'--script_path={args.script_path}']) + f'--script_path={args.script_path}', + f'--seed={seed}']) # Let the user know the job is done print(f'\n[INFO]: Reconstructions complete.\n') @@ -170,7 +175,8 @@ def run_single_to_multi_gpu(): def wrap_single_gpu_script(script_path: str, backend: str = 'nccl', timeout: int = 30, - nccl_p2p_disable: bool = True): + nccl_p2p_disable: bool = True, + seed: int = 0): """ Wraps single-GPU reconstruction scripts to be ran as a multi-GPU job via torchrun calls. @@ -220,6 +226,10 @@ def wrap_single_gpu_script(script_path: str, Disable NCCL peer-2-peer communication. If you find that all your GPUs are at 100% useage but the program isn't doing anything, try enabling this variable. + seed: int + Seed for generating random numbers. This value must be identical across all + participating devices. 
+ """ ###################### Check if the script is safe to run ###################### ################################################################################### @@ -260,6 +270,10 @@ def wrap_single_gpu_script(script_path: str, # Start up the process group (lets the different subprocesses can talk with each other) dist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=timeout)) + + # Force this subprocess to use a given RNG seed (should be identical across all subprocesses) + t.manual_seed(seed) + t.cuda.manual_seed_all(seed) try: # Run the single-GPU reconstruction script @@ -269,7 +283,6 @@ def wrap_single_gpu_script(script_path: str, # Kill the process group dist.destroy_process_group() - def _spawn_wrapper(rank: int, func: Callable[[int, int], None], device_ids: List[int], diff --git a/src/cdtools/tools/distributed/single_to_multi_gpu.py b/src/cdtools/tools/distributed/single_to_multi_gpu.py index d1eef4e5..e365349e 100644 --- a/src/cdtools/tools/distributed/single_to_multi_gpu.py +++ b/src/cdtools/tools/distributed/single_to_multi_gpu.py @@ -38,6 +38,10 @@ def get_args(): default=1, choices=[0,1], help='Disable (1) or enable (0) NCCL peer-to-peer communication') + parser.add_argument('--seed', + type=int, + default=0, + help='Sets the RNG seed for all devices') return parser.parse_args() @@ -45,9 +49,9 @@ def get_args(): if __name__ == '__main__': # Get args args = get_args() - # Pass arguments to dist.script_wrapper dist.wrap_single_gpu_script(script_path=args.script_path, backend=args.backend, timeout=args.timeout, - nccl_p2p_disable=bool(args.nccl_p2p_disable)) \ No newline at end of file + nccl_p2p_disable=bool(args.nccl_p2p_disable), + seed=args.seed) \ No newline at end of file From dd72143149874edc75623eaa390220fcabd617b9 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 4 Jul 2025 22:24:32 +0000 Subject: [PATCH 081/115] Modified speed test to work with cdt-torchrun --- examples/distributed_speed_test.py | 261 
+------------------ examples/fancy_ptycho_speed_test.py | 67 +++++ src/cdtools/tools/distributed/distributed.py | 102 ++++++++ 3 files changed, 182 insertions(+), 248 deletions(-) create mode 100644 examples/fancy_ptycho_speed_test.py diff --git a/examples/distributed_speed_test.py b/examples/distributed_speed_test.py index 6a4928b4..e465e788 100644 --- a/examples/distributed_speed_test.py +++ b/examples/distributed_speed_test.py @@ -9,262 +9,27 @@ ''' -import cdtools -from cdtools.models import CDIModel -from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset -from cdtools.tools.distributed import distributed -import torch.multiprocessing as mp -import torch as t -from multiprocessing.connection import Connection -from typing import Tuple, List -from matplotlib import pyplot as plt -import time -import numpy as np -from copy import deepcopy +from cdtools.tools import distributed as dist +import os -#The test to run: -# 'fancy_ptycho' - Runs reconstruction parameters from examples/fancy_ptycho.py -# 'gold_balls' - Runs reconstruction parameters from examples/gold_ball_ptycho.py -TEST = 'fancy_ptycho' - -# Multi-GPU supported reconstruction -def reconstruct(model: CDIModel, - dataset: Ptycho2DDataset, - rank: int, - device_ids: List[int], - conn: Connection = None, - schedule: bool = False) -> Tuple[np.ndarray, np.ndarray]: - """Perform the reconstruction using several GPUs - If only one GPU is used, we don't bother loading the the process group - or doing any of the fancy stuff associated with multi-GPU operation. - - Parameters: - model: CDIModel - Model for CDI/ptychography reconstruction - dataset: Ptycho2DDataset - The dataset to reconstruct against - rank: int - The rank of the GPU to be used. Value should be within - [0, #_of_GPUs_used-1] - device_ids: list[int] - List of GPU IDs to use - conn: Connection - A Connection object representing one end of a communication pipe. 
This - parameter is needed if you're trying to get some values back from the - wrapped function. - schedule: bool - Toggles the use of the scheduler - - Returns: - time_history: np.array - Array of when each loss was measured - loss_history: np.array - The total history of the model - """ - # Create a list to keep track of when each module report was printed - t_list = [] - # Start counting time - t_start = time.time() - - # Check if we're using only a single GPU - if not model.multi_gpu_used: - # Use the 1st GPU in device_ids - device = f'cuda:{device_ids[0]}' - model.to(device=device) - dataset.get_as(device=device) - - # Set up the Reconstructor with the Adam reconstructor - recon = cdtools.reconstructors.Adam(model,dataset) - - # Perform reconstructions on either single or multi-GPU workflows. - if TEST == 'fancy_ptycho': - for loss in recon.optimize(50, lr=0.02, batch_size=40): - if rank == 0: - print(model.report()) - t_list.append(time.time() - t_start) - - for loss in recon.optimize(25, lr=0.005, batch_size=40): - if rank == 0: - print(model.report()) - t_list.append(time.time() - t_start) - - for loss in recon.optimize(25, lr=0.001, batch_size=40): - if rank == 0: - print(model.report()) - t_list.append(time.time() - t_start) - - elif TEST == 'gold_balls': - for loss in recon.optimize(20, lr=0.005, batch_size=50): - if rank == 0: - print(model.report()) - t_list.append(time.time() - t_start) - - for loss in recon.optimize(50, lr=0.002, batch_size=100): - if rank == 0: - print(model.report()) - t_list.append(time.time() - t_start) - - for loss in recon.optimize(100, lr=0.001, batch_size=100): - if rank == 0: - print(model.report()) - t_list.append(time.time() - t_start) - - # We need to send the time_history and loss_history through - # the child connection to the parent (sitting in the name-main block) - if rank == 0: - loss_history = np.array(model.loss_history) - time_history = np.array(t_list) - - if conn is not None: - conn.send((time_history, 
loss_history)) - - # Return the measured time and loss history if we're on a single GPU - if not model.multi_gpu_used: - return time_history, loss_history - - -def run_test(world_sizes: int, - device_ids: int, - runs: int): - """Runs a series of reconstructions (defined in the local function - `reconstruct`) using several GPUs and several trials per GPU count. - - Parameters: - world_sizes: list[int] - Number of GPUs to use. User can specify several GPU counts in a list. - device_ids: list[int] or int - List of the GPU ID numbers to use for the study - runs: int - How many repeat reconstructions to perform - """ - # Load the dataset and model - if TEST == 'fancy_ptycho': - filename = 'examples/example_data/lab_ptycho_data.cxi' - elif TEST == 'gold_balls': - filename = 'examples/example_data/AuBalls_700ms_30nmStep_3_6SS_filter.cxi' - - dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) - - if TEST == 'fancy_ptycho': - model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, - oversampling=2, - probe_support_radius=120, - propagation_distance=5e-3, - units='mm', - obj_view_crop=-50) - - elif TEST == 'gold_balls': - pad = 10 - dataset.pad(pad) - model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, - probe_support_radius=50, - propagation_distance=2e-6, - units='um', - probe_fourier_crop=pad) - model.translation_offsets.data += 0.7 * t.randn_like(model.translation_offsets) - model.weights.requires_grad = False - - # Set up a parent/child connection to get some info from the GPU-accelerated function - parent_conn, child_conn = mp.Pipe() - - # Set stuff up for plots - fig, (ax1,ax2,ax3) = plt.subplots(1,3) - - # Store the value of the single GPU time - time_1gpu = 0 - std_1gpu = 0 - - for world_size in world_sizes: - # Get the GPU IDs to use - dev_id = device_ids[0:world_size] - print(f'\nNumber of GPU(s): {world_size} | Using GPU IDs {*dev_id,}') - - # Make a list to store the values - time_list = [] - loss_hist_list = [] - 
- for i in range(runs): - print(f'Resetting the model...') - print(f'Starting run {i+1}/{runs} on {world_size} GPU(s)') - model_copy = deepcopy(model) - if world_size == 1: - final_time, loss_history = reconstruct(model=model_copy, - dataset=dataset, - rank=0, - device_ids=dev_id) - time_list.append(final_time) - loss_hist_list.append(loss_history) - else: - # Spawn the processes - distributed.spawn(reconstruct, - model=model_copy, - dataset=dataset, - device_ids = dev_id, - master_addr = 'localhost', - master_port = '8888', - timeout=300, - pipe=child_conn, - nccl_p2p_disable=True) - while parent_conn.poll(): - final_time, loss_history = parent_conn.recv() - time_list.append(final_time) - loss_hist_list.append(loss_history) - - # Calculate the statistics - time_mean = np.array(time_list).mean(axis=0)/60 - time_std = np.array(time_list).std(axis=0)/60 - loss_mean = np.array(loss_hist_list).mean(axis=0) - loss_std = np.array(loss_hist_list).std(axis=0) - - # If a single GPU is used, store the time - if world_size == 1: - time_1gpu = time_mean[-1] - std_1gpu = time_std[-1] - - # Calculate the speed-up relative to using a single GPU - speed_up_mean = time_1gpu / time_mean[-1] - speed_up_std = speed_up_mean * \ - np.sqrt((std_1gpu/time_1gpu)**2 + (time_std[-1]/time_mean[-1])**2) - - # Add another plot - ax1.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, - label=f'{world_size} GPUs') - ax2.errorbar(np.arange(0,loss_mean.shape[0]), loss_mean, yerr=loss_std, - label=f'{world_size} GPUs') - ax3.errorbar(world_size, speed_up_mean, yerr=speed_up_std, fmt='o') - - # Plot - fig.suptitle(f'Multi-GPU performance test | {runs} runs performed') - ax1.set_yscale('log') - ax1.set_xscale('linear') - ax2.set_yscale('log') - ax2.set_xscale('linear') - ax3.set_yscale('linear') - ax3.set_xscale('linear') - ax1.legend() - ax2.legend() - ax1.set_xlabel('Time (min)') - ax1.set_ylabel('Loss') - ax2.set_xlabel('Epochs') - ax3.set_xlabel('Number of GPUs') - 
ax3.set_ylabel('Speed-up relative to single GPU') - plt.show() +# If you're running on AMD CPUs, you need to include this or else you will get a +# threading layer error. +os.environ['MKL_THREADING_LAYER'] = 'GNU' # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': # Define the number of GPUs to use. world_sizes = [1, 2, 4] - # Define which GPU IDs to use - device_ids = [1, 2, 5, 7] - # How many reconstruction runs to perform for statistics - runs = 1 + runs = 3 + + # Define where the single-GPU script is located + script_path = 'fancy_ptycho_distributed.py' + + # Define where the loss-vs-time data is being stored in + output_dir = 'example_loss_data' # Run the test - run_test(world_sizes, device_ids, runs) + dist.run_speed_test(world_sizes, runs, script_path, output_dir) \ No newline at end of file diff --git a/examples/fancy_ptycho_speed_test.py b/examples/fancy_ptycho_speed_test.py new file mode 100644 index 00000000..dfb0a3ff --- /dev/null +++ b/examples/fancy_ptycho_speed_test.py @@ -0,0 +1,67 @@ +import cdtools +import time +import pickle +import os + + +# Script is intended to be called by distributed_speed_test.py. 
+# We need to know which trial number this script is running +TRIAL_NUMBER = int(os.environ.get('CDTOOLS_TRIAL_NUMBER')) + +# Create a list to keep track of when each module report was printed +t_list = [] +# Start counting time +t_start = time.time() + +filename = 'example_data/lab_ptycho_data.cxi' +dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + +# FancyPtycho is the workhorse model +model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, # Use 3 incoherently mixing probe modes + oversampling=2, # Simulate the probe on a 2xlarger real-space array + probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix + propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm + units='mm', # Set the units for the live plots + obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix +) + +device = 'cuda' +model.to(device=device) +dataset.get_as(device=device) + +for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=40): + if model.rank == 0: + print(model.report()) + t_list.append(time.time() - t_start) + +for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=40): + if model.rank == 0: + print(model.report()) + t_list.append(time.time() - t_start) + +for loss in model.Adam_optimize(25, dataset, lr=0.001, batch_size=40): + if model.rank == 0: + print(model.report()) + t_list.append(time.time() - t_start) + +# This orthogonalizes the recovered probe modes +model.tidy_probes() + +# Save the model and loss history +if model.rank == 0: + # Set up the file name: + file_name = f'speed_test_nGPUs_{model.world_size}_TRIAL_{TRIAL_NUMBER}' + # Grab the loss and time history + loss_history = model.loss_history + time_history = t_list + # Store quantities in a dictionary + dict = {'loss history':loss_history, + 'time history':time_history, + 'nGPUs':model.world_size, + 'trial':TRIAL_NUMBER} + # Save the quantities + with open (f'example_loss_data/'+file_name+'.pkl', 'wb') as f: + 
pickle.dump(dict, f) + diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index a970fa5a..e2674bc4 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -31,6 +31,9 @@ from multiprocessing.connection import Connection from typing import Callable, List from ast import literal_eval +from matplotlib import pyplot as plt +import pickle +import numpy as np DISTRIBUTED_PATH = os.path.dirname(os.path.abspath(__file__)) @@ -38,6 +41,7 @@ 'torchrunner', 'run_single_to_multi_gpu', 'wrap_single_gpu_script', + 'run_speed_test', '_spawn_wrapper', 'spawn'] @@ -283,6 +287,104 @@ def wrap_single_gpu_script(script_path: str, # Kill the process group dist.destroy_process_group() + +def run_speed_test(world_sizes: int, + runs: int, + script_path: str, + output_dir: str): + """ + Executes a reconstruction script `n` x `m` times using `n` GPUs and `m` trials + per GPU count using cdt-torchrun. + + This function assumes that + + Parameters: + world_sizes: list[int] + Number of GPUs to use. User can specify several GPU counts in a list. + runs: int + How many repeat reconstructions to perform + script_path: str + Path of the single-gpu reconstruction script. + output_dir: str + Directory of the loss-vs-time/epoch data generated for the speed test. 
+ """ + # Set stuff up for plots + fig, (ax1,ax2,ax3) = plt.subplots(1,3) + + # Store the value of the single GPU time + time_1gpu = 0 + std_1gpu = 0 + + for world_size in world_sizes: + # Get the GPU IDs to use + #dev_id = device_ids[0:world_size] + #print(f'\nNumber of GPU(s): {world_size} | Using GPU IDs {*dev_id,}') + + # Make a list to store the values + time_list = [] + loss_hist_list = [] + + for i in range(runs): + print(f'Resetting the model...') + print(f'Starting run {i+1}/{runs} on {world_size} GPU(s)') + + # The scripts running speed tests need to read the trial number + # they are on using an environment variable + os.environ['CDTOOLS_TRIAL_NUMBER'] = str(i) + + # Run cdt-torchrun + subprocess.run(['cdt-torchrun', + f'--ngpus={world_size}', + f'{script_path}']) + + print(f'[INFO]: Reconstruction complete. Loading loss results...') + with open(os.path.join(output_dir, f'speed_test_nGPUs_{world_size}_TRIAL_{i}.pkl'), 'rb') as f: + results = pickle.load(f) + time_list.append(results['time history']) + loss_hist_list.append(results['loss history']) + + + # Calculate the statistics + time_mean = np.array(time_list).mean(axis=0)/60 + time_std = np.array(time_list).std(axis=0)/60 + loss_mean = np.array(loss_hist_list).mean(axis=0) + loss_std = np.array(loss_hist_list).std(axis=0) + + # If a single GPU is used, store the time + if world_size == 1: + time_1gpu = time_mean[-1] + std_1gpu = time_std[-1] + + # Calculate the speed-up relative to using a single GPU + speed_up_mean = time_1gpu / time_mean[-1] + speed_up_std = speed_up_mean * \ + np.sqrt((std_1gpu/time_1gpu)**2 + (time_std[-1]/time_mean[-1])**2) + + # Add another plot + ax1.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, + label=f'{world_size} GPUs') + ax2.errorbar(np.arange(0,loss_mean.shape[0]), loss_mean, yerr=loss_std, + label=f'{world_size} GPUs') + ax3.errorbar(world_size, speed_up_mean, yerr=speed_up_std, fmt='o') + + # Plot + fig.suptitle(f'Multi-GPU performance test | {runs} 
runs performed') + ax1.set_yscale('log') + ax1.set_xscale('linear') + ax2.set_yscale('log') + ax2.set_xscale('linear') + ax3.set_yscale('linear') + ax3.set_xscale('linear') + ax1.legend() + ax2.legend() + ax1.set_xlabel('Time (min)') + ax1.set_ylabel('Loss') + ax2.set_xlabel('Epochs') + ax3.set_xlabel('Number of GPUs') + ax3.set_ylabel('Speed-up relative to single GPU') + plt.show() + + def _spawn_wrapper(rank: int, func: Callable[[int, int], None], device_ids: List[int], From e5cfd2550688059a85e703371fed45a80963260a Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 4 Jul 2025 22:35:29 +0000 Subject: [PATCH 082/115] Corrected script name in distributed_speed_test.py --- examples/distributed_speed_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/distributed_speed_test.py b/examples/distributed_speed_test.py index e465e788..15a5d9c0 100644 --- a/examples/distributed_speed_test.py +++ b/examples/distributed_speed_test.py @@ -25,7 +25,7 @@ runs = 3 # Define where the single-GPU script is located - script_path = 'fancy_ptycho_distributed.py' + script_path = 'fancy_ptycho_speed_test.py' # Define where the loss-vs-time data is being stored in output_dir = 'example_loss_data' From e5689c300bd1637af34cfde551a2728b34e24d2a Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 4 Jul 2025 22:36:53 +0000 Subject: [PATCH 083/115] Multi-gpu-related model attributes are no longer stored as reconstructor attributes --- src/cdtools/reconstructors/adam.py | 4 +--- src/cdtools/reconstructors/base.py | 20 +++++++------------- src/cdtools/reconstructors/lbfgs.py | 4 +--- src/cdtools/reconstructors/sgd.py | 4 +--- 4 files changed, 10 insertions(+), 22 deletions(-) diff --git a/src/cdtools/reconstructors/adam.py b/src/cdtools/reconstructors/adam.py index cbf4378d..c4d2e463 100644 --- a/src/cdtools/reconstructors/adam.py +++ b/src/cdtools/reconstructors/adam.py @@ -37,9 +37,6 @@ class Adam(Reconstructor): Important attributes: - **model** -- Always points 
to the core model used. - - **multi_gpu_used** -- Whether or not multi-GPU computation will be performed - using a distributed data approach. This attribute will be pulled from the - CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). - **optimizer** -- This class by default uses `torch.optim.Adam` to perform optimizations. - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the `optimize` method. @@ -56,6 +53,7 @@ def __init__(self, # Define the optimizer for use in this subclass self.optimizer = t.optim.Adam(self.model.parameters()) + def adjust_optimizer(self, lr: int = 0.005, betas: Tuple[float] = (0.9, 0.999), diff --git a/src/cdtools/reconstructors/base.py b/src/cdtools/reconstructors/base.py index 4b6ded77..6cc10ef4 100644 --- a/src/cdtools/reconstructors/base.py +++ b/src/cdtools/reconstructors/base.py @@ -44,9 +44,6 @@ class Reconstructor: Important attributes: - **model** -- Always points to the core model used. - - **multi_gpu_used** -- Whether or not multi-GPU computation will be performed - using a distributed data approach. This attribute will be pulled from the - CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). - **optimizer** -- A `torch.optim.Optimizer` that must be defined when initializing the Reconstructor subclass. - **scheduler** -- A `torch.optim.lr_scheduler` that may be defined during the `optimize` method. @@ -59,9 +56,6 @@ def __init__(self, subset: Union[int, List[int]] = None): # Store parameters as attributes of Reconstructor self.subset = subset - self.multi_gpu_used = model.multi_gpu_used - self.world_size = model.world_size - self.rank = model.rank # Initialize attributes that must be defined by the subclasses self.optimizer = None # Defined in the __init__ of the subclass as a torch.optim.Optimizer @@ -94,16 +88,16 @@ def setup_dataloader(self, Optional, enable/disable shuffling of the dataset. 
This option is intended for diagnostic purposes and should be left as True. """ - if self.multi_gpu_used: + if self.model.multi_gpu_used: # First, create a sampler to load subsets of dataset to the GPUs self.sampler = DistributedSampler(self.dataset, - num_replicas=self.world_size, - rank=self.rank, + num_replicas=self.model.world_size, + rank=self.model.rank, shuffle=shuffle, drop_last=False) # Now create the dataloader self.data_loader = torchdata.DataLoader(self.dataset, - batch_size=batch_size//self.world_size, + batch_size=batch_size//self.model.world_size, num_workers=0, # Creating extra threads in children processes may cause problems. Leave this at 0. drop_last=False, pin_memory=False, @@ -154,7 +148,7 @@ def _run_epoch(self, """ # If we're using DistributedSampler (i.e., multi-GPU useage), we need to # tell it which epoch we're on. Otherwise data shuffling will not work properly - if self.multi_gpu_used: + if self.model.multi_gpu_used: self.data_loader.sampler.set_epoch(self.model.epoch) # Initialize some tracking variables @@ -203,7 +197,7 @@ def closure(): # For multi-GPU, average and sync the gradients + losses across all # participating GPUs with an all-reduce call. Also sum the losses. - if self.multi_gpu_used: + if self.model.multi_gpu_used: cdtdist.sync_and_avg_gradients(self.model) dist.all_reduce(loss, op=dist.ReduceOp.SUM) @@ -218,7 +212,7 @@ def closure(): # For multi-GPU optimization, average and sync the gradients + # losses across all participating GPUs with an all-reduce call. - if self.multi_gpu_used: + if self.model.multi_gpu_used: cdtdist.sync_and_avg_gradients(self.model) return total_loss diff --git a/src/cdtools/reconstructors/lbfgs.py b/src/cdtools/reconstructors/lbfgs.py index ad1a3a77..7a42e372 100644 --- a/src/cdtools/reconstructors/lbfgs.py +++ b/src/cdtools/reconstructors/lbfgs.py @@ -32,9 +32,6 @@ class LBFGS(Reconstructor): Important attributes: - **model** -- Always points to the core model used. 
- - **multi_gpu_used** -- Whether or not multi-GPU computation will be performed - using a distributed data approach. This attribute will be pulled from the - CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). - **optimizer** -- This class by default uses `torch.optim.LBFGS` to perform optimizations. - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the `optimize` method. @@ -51,6 +48,7 @@ def __init__(self, # Define the optimizer for use in this subclass self.optimizer = t.optim.LBFGS(self.model.parameters()) + def adjust_optimizer(self, lr: int = 0.005, history_size: int = 2, diff --git a/src/cdtools/reconstructors/sgd.py b/src/cdtools/reconstructors/sgd.py index fdcfe89f..bdd3c195 100644 --- a/src/cdtools/reconstructors/sgd.py +++ b/src/cdtools/reconstructors/sgd.py @@ -30,9 +30,6 @@ class SGD(Reconstructor): Important attributes: - **model** -- Always points to the core model used. - - **multi_gpu_used** -- Whether or not multi-GPU computation will be performed - using a distributed data approach. This attribute will be pulled from the - CDIModel (this flag is automatically set when using cdtools.tools.distributed.spawn). - **optimizer** -- This class by default uses `torch.optim.Adam` to perform optimizations. - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the `optimize` method. 
@@ -49,6 +46,7 @@ def __init__(self, # Define the optimizer for use in this subclass self.optimizer = t.optim.SGD(self.model.parameters()) + def adjust_optimizer(self, lr: int = 0.005, momentum: float = 0, From 037151d83506651882bf76048fe61de596215495 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 5 Jul 2025 02:14:09 +0000 Subject: [PATCH 084/115] Depracated spawn-based distributed methods --- src/cdtools/tools/distributed/distributed.py | 136 ------------------- 1 file changed, 136 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index e2674bc4..3a9e1ef8 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -22,14 +22,11 @@ import torch as t import torch.distributed as dist -import torch.multiprocessing as mp import datetime import os import subprocess import argparse import runpy -from multiprocessing.connection import Connection -from typing import Callable, List from ast import literal_eval from matplotlib import pyplot as plt import pickle @@ -383,136 +380,3 @@ def run_speed_test(world_sizes: int, ax3.set_xlabel('Number of GPUs') ax3.set_ylabel('Speed-up relative to single GPU') plt.show() - - -def _spawn_wrapper(rank: int, - func: Callable[[int, int], None], - device_ids: List[int], - backend: str = 'nccl', - timeout: int = 30, - pipe: Connection = None): - """ - Wraps functions containing reconstruction loops (i.e., `for loss in - model.Adam_optimize`) to enable multi-GPU operations to be set up. The - wrapped function needs to passed to `torch.multiprocessing.spawn` or - `cdtools.tools.distributed.distributed.spawn` - - Parameters: - rank: int - Rank of the GPU, with value ranging from [0, world_size-1]. This - is defined by the spawning methods and not directly by the user. - func: Callable[[CDIModel, Ptycho2DDataset, int, int]] - Function wrapping user-defined reconstruction loops. 
The function must - have the following format: func(model, dataset, rank, world_size). - device_ids: list[int] - List of GPU IDs to use - backend: str - Multi-gpu communication backend to use. Default is the 'nccl' backend, - which is the only supported backend for CDTools. - See https://pytorch.org/docs/stable/distributed.html for additional info - about PyTorch-supported backends. - timeout: int - Timeout for operations executed against the process group in seconds. - Default is 30 seconds. After timeout has been reached, all subprocesses - will be aborted and the process calling this method will crash. - pipe: Connection - A Connection object representing one end of a communication pipe. This - parameter is needed if you're trying to get some values back from the - wrapped function. - BUG: Passing a CDIModel through connection generated with mp.Pipe or - query will cause the connection to hang. - """ - # Convert timeout from int to datetime - timeout = datetime.timedelta(seconds=timeout) - - # Define the world_size - world_size = len(device_ids) - - # Allow the process to only see the GPU is has been assigned - os.environ['CUDA_VISIBLE_DEVICES'] = str(rank) - - # Within the called reconstruction function/script, we need to somehow - # set up the multi-GPU model flags (model.rank, model.world_size, - # and model.multi_gpu_used). - # - # One way to do this (without having to modify CDIModel here or explicitly - # setting up the CDIModel attributes in the reconstruction script) is to - # create environment variables for each subprocess. Then, when a model - # is created within each subprocess, it can loop up its own local environment - # variable and set the rank/world_size/multi_gpu_used flags accordingly. 
- os.environ['WORLD_SIZE'] = str(world_size) - os.environ['RANK'] = str(rank) - - # Initialize the process group - dist.init_process_group(backend=backend, rank=rank, - world_size=world_size, timeout=timeout) - - try: - # Run the reconstruction script - # We also need to check if we want to pass a pipe to the function - if pipe is None: - func() - else: - func(pipe) - finally: - # Destroy process group - dist.destroy_process_group() - - -def spawn(func: Callable[[int, int], None], - device_ids: List[int], - master_addr: str, - master_port: str, - backend: str = 'nccl', - timeout: int = 30, - nccl_p2p_disable: bool = True, - pipe: Connection = None): - """ - Spawns subprocesses on `world_size` GPUs that runs reconstruction - loops wrapped around a function `func`. - - This is a wrapper around `torch.multiprocessing.spawn` which includes - the setup of OS environmental variables needed for initializing the - distributed backend. - - Parameters: - func: Callable[[CDIModel, Ptycho2DDataset, int, int]] - Function wrapping user-defined reconstruction loops. The function must - have the following format: func(model, dataset, rank, world_size). - device_ids: list[int] - List of GPU IDs to use - master_addr: str - IP address of the machine that will host the process with rank 0 - master_port: str - A free port on the machine that will host the process with rank 0 - backend: str - Multi-gpu communication backend to use. Default is the 'nccl' backend, - which is the only supported backend for CDTools. - See https://pytorch.org/docs/stable/distributed.html for additional info - about PyTorch-supported backends. - timeout: int - Timeout for operations executed against the process group in seconds. - Default is 30 seconds. After timeout has been reached, all subprocesses - will be aborted and the process calling this method will crash. 
- nccl_p2p_disable: bool - Disable NCCL peer-2-peer communication - pipe: Connection - A Connection object representing one end of a communication pipe. This - parameter is needed if you're trying to get some values back from the - wrapped function. - BUG: Passing a CDIModel through connection generated with mp.Pipe or - query will cause the connection to hang. - """ - # Set up environment variables - os.environ['MASTER_ADDR'] = master_addr - os.environ['MASTER_PORT'] = master_port - os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) - - # Ensure a "graceful" termination of subprocesses if something goes wrong. - print('\nStarting up multi-GPU reconstructions...\n') - mp.spawn(_spawn_wrapper, - args=(func, device_ids, backend, timeout, pipe), - nprocs=len(device_ids), - join=True) - print('Reconstructions complete...') - \ No newline at end of file From 7a16bbca01a67491a086580ccba5189e14ee459b Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Sat, 5 Jul 2025 02:26:25 +0000 Subject: [PATCH 085/115] Depracated torchrunner from distributed --- src/cdtools/tools/distributed/distributed.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 3a9e1ef8..dffb5447 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -58,25 +58,6 @@ def sync_and_avg_gradients(model): param.grad.data /= model.world_size -def torchrunner(script_name: str, - n_gpus: int = 4): - """ - Executes a torchrun command in a python script or jupyter notebook. 
- - Parameters: - script_name: str - The file name of the target script - n_gpus: int - Number of GPUs to distribute the job over - """ - - # Perform the torchrun call of the wrapped function - subprocess.run(['torchrun', - '--nnodes=1', - f'--nproc_per_node={n_gpus}', - f'{script_name}']) - - def run_single_to_multi_gpu(): """ Runs a single-GPU reconstruction script as a multi-GPU job via torchrun. From 8906eb31b25507bc6f047e553d2a0a11d44bed4c Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 7 Jul 2025 23:33:41 +0000 Subject: [PATCH 086/115] Force pytest to only run stuff in the tests directory --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b7fd46a9..6ac3baa3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,6 @@ [tool.ruff] # Decrease the maximum line length to 79 characters. -line-length = 79 \ No newline at end of file +line-length = 79 + +[tool.pytest.ini_options] +testpaths = 'tests' \ No newline at end of file From 0699a8bad38ef3e80c20dc1a7c1c47163c944ef4 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 7 Jul 2025 23:43:19 +0000 Subject: [PATCH 087/115] Reconstructors are imported in CDIModel only when self.reconstructor is initialized --- src/cdtools/models/base.py | 41 +++++++++++++------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index a704b92f..b82d1124 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -404,23 +404,21 @@ def Adam_optimize( """ # We want to have model.Adam_optimize call AND store cdtools.reconstructors.Adam - # to be able to perform reconstructions without creating a new - # optimizer each time we update the hyperparameters. + # to perform reconstructions without creating a new reconstructor each time we + # update the hyperparameters. # # The only way to do this is to make the Adam reconstructor an attribute # of the model. 
But since the Adam reconstructor also depends on CDIModel, # this seems to give rise to a circular import error unless # we import cdtools.reconstructors within this method: - from cdtools.reconstructors import Adam - - # Next, we want to create an Optimizer.Adam if one does not already exist. - if not hasattr(self, 'optimizer'): - self.optimizer = Adam(model=self, + if not hasattr(self, 'reconstructor'): + from cdtools.reconstructors import Adam + self.reconstructor = Adam(model=self, dataset=dataset, subset=subset) # Run some reconstructions - return self.optimizer.optimize(iterations=iterations, + return self.reconstructor.optimize(iterations=iterations, batch_size=batch_size, lr=lr, betas=betas, @@ -481,16 +479,14 @@ def LBFGS_optimize(self, # # Since the LBFGS reconstructor also depends on CDIModel, a circular import error # arises unless we import cdtools.reconstructors within this method: - from cdtools.reconstructors import LBFGS - - # Next, we want to create an Optimizer.Adam if one does not already exist. - if not hasattr(self, 'optimizer'): - self.optimizer = LBFGS(model=self, + if not hasattr(self, 'reconstructor'): + from cdtools.reconstructors import LBFGS + self.reconstructor = LBFGS(model=self, dataset=dataset, subset=subset) # Run some reconstructions - return self.optimizer.optimize(iterations=iterations, + return self.reconstructor.optimize(iterations=iterations, lr=lr, history_size=history_size, regularization_factor=regularization_factor, @@ -557,23 +553,14 @@ def SGD_optimize(self, # # Since the SGD reconstructor also depends on CDIModel, a circular import error # arises unless we import cdtools.reconstructors within this method: - from cdtools.reconstructors import SGD - - # Next, we want to create an Optimizer.Adam if one does not already exist. 
- if not hasattr(self, 'optimizer'): - self.optimizer = SGD(model=self, + if not hasattr(self, 'reconstructor'): + from cdtools.reconstructors import SGD + self.reconstructor = SGD(model=self, dataset=dataset, subset=subset) - - # Define the optimizer - optimizer = t.optim.SGD(self.parameters(), - lr = lr, momentum=momentum, - dampening=dampening, - weight_decay=weight_decay, - nesterov=nesterov) # Run some reconstructions - return self.optimizer.optimize(iterations=iterations, + return self.reconstructor.optimize(iterations=iterations, batch_size=batch_size, lr=lr, momentum=momentum, From 245e28adc8efbc83238906aefb811b39707f25c0 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 7 Jul 2025 23:44:30 +0000 Subject: [PATCH 088/115] Created test for the Adam reconstructor --- tests/test_reconstructors.py | 140 +++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 tests/test_reconstructors.py diff --git a/tests/test_reconstructors.py b/tests/test_reconstructors.py new file mode 100644 index 00000000..983b4a31 --- /dev/null +++ b/tests/test_reconstructors.py @@ -0,0 +1,140 @@ +import pytest +import cdtools +import torch as t +import numpy as np +from copy import deepcopy + +@pytest.mark.slow +def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): + """ + This test checks out several things with the Au particle dataset + 1) Calls to Reconstructor.adjust_optimizer is updating the hyperparameters + 2) We are only using the single-GPU dataloading method + 3) Ensure `recon.model` points to the original `model` + 4) Reconstructions performed by `Adam.optimize` and `model.Adam_optimize` + calls produce identical results. + 5) The quality of the reconstruction remains below a specified threshold. 
+ """ + + print('\nTesting performance on the standard gold balls dataset with reconstructors.Adam') + + # Setup dataset and model + dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(gold_ball_cxi) + + pad = 10 + dataset.pad(pad) + + model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + probe_support_radius=50, + propagation_distance=2e-6, + units='um', + probe_fourier_crop=pad + ) + + model.translation_offsets.data += 0.7 * t.randn_like(model.translation_offsets) + model.weights.requires_grad = False + + # Make a copy of the model + model_recon = deepcopy(model) + + # Load models and datasets to devices + model.to(device=reconstruction_device) + model_recon.to(device=reconstruction_device) + dataset.get_as(device=reconstruction_device) + + # Make sure that we're not going to perform reconstructions on the same model + assert id(model_recon) != id(model) + + ######### Reconstructions with cdtools.reconstructors.Adam.optimize ######### + ############################################################################# + + print('Running reconstruction using cdtools.reconstructors.Adam.optimize on provided reconstruction_device,', + reconstruction_device) + + recon = cdtools.reconstructors.Adam(model=model_recon, dataset=dataset) + t.manual_seed(0) + for loss in recon.optimize(20, lr=0.005, batch_size=50): + print(model_recon.report()) + if show_plot and model_recon.epoch % 10 == 0: + model_recon.inspect(dataset) + + # Test 1a: Ensure that the Adam.optimizer.param_groups learning rate and + # batch size got updated + assert recon.optimizer.param_groups[0]['lr'] == 0.005 + assert recon.data_loader.batch_size == 50 + + # Test 2: Ensure that recon does not have sampler as an attribute (for multi-GPU) + assert not hasattr(recon, 'sampler') + + for loss in recon.optimize(50, lr=0.002, batch_size=100): + print(model_recon.report()) + if show_plot and model_recon.epoch % 10 == 0: + model_recon.inspect(dataset) + + # Test 1b: Ensure that the 
Adam.optimizer.param_groups learning rate and + # batch size got updated + assert recon.optimizer.param_groups[0]['lr'] == 0.002 + assert recon.data_loader.batch_size == 100 + + for loss in recon.optimize(100, lr=0.001, batch_size=100, + schedule=True): + print(model_recon.report()) + if show_plot and model_recon.epoch % 10 == 0: + model_recon.inspect(dataset) + + # Test 1c: Ensure that the Adam.optimizer.param_groups learning rate and + # batch size got updated + assert recon.optimizer.param_groups[0]['lr'] == 0.001 + assert recon.data_loader.batch_size == 100 + + # Test 3: Ensure recon.model points to the original model + assert id(model_recon) == id(recon.model) + + model_recon.tidy_probes() + + if show_plot: + model_recon.inspect(dataset) + model_recon.compare(dataset) + + loss_recon = model_recon.loss_history[-1] + + ############ Reconstructions with cdtools.CDIModel.Adam_optimize ############ + ############################################################################# + print('Running reconstruction using CDIModel.Adam_optimize on provided reconstruction_device,', + reconstruction_device) + t.manual_seed(0) + + for loss in model.Adam_optimize(20, dataset, lr=0.005, batch_size=50): + print(model.report()) + if show_plot and model.epoch % 10 == 0: + model.inspect(dataset) + + for loss in model.Adam_optimize(50, dataset, lr=0.002, batch_size=100): + print(model.report()) + if show_plot and model.epoch % 10 == 0: + model.inspect(dataset) + + for loss in model.Adam_optimize(100, dataset, lr=0.001, batch_size=100, + schedule=True): + print(model.report()) + if show_plot and model.epoch % 10 == 0: + model.inspect(dataset) + + model.tidy_probes() + + if show_plot: + model.inspect(dataset) + model.compare(dataset) + + loss_model = model.loss_history[-1] + + # Test 4: Ensure equivalency between the model reconstructions + assert np.allclose(loss_recon, loss_model) + + # Test 5: Ensure reconstructions have reached a certain loss tolerance + # This just comes from 
running a reconstruction when it was working well + # and choosing a rough value. If it triggers this assertion error, + # something changed to make the final quality worse! + assert loss_model < 0.0001 From 9f1b89ee8b45478858dba6e471b1b6c82fee0f17 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 7 Jul 2025 23:45:24 +0000 Subject: [PATCH 089/115] Removed au particle test from test_fancy_ptycho (already in test_reconstructors) --- tests/models/test_fancy_ptycho.py | 61 +------------------------------ 1 file changed, 1 insertion(+), 60 deletions(-) diff --git a/tests/models/test_fancy_ptycho.py b/tests/models/test_fancy_ptycho.py index 3f78a737..503e12e7 100644 --- a/tests/models/test_fancy_ptycho.py +++ b/tests/models/test_fancy_ptycho.py @@ -87,64 +87,5 @@ def test_lab_ptycho(lab_ptycho_cxi, reconstruction_device, show_plot): model.compare(dataset) # If this fails, the reconstruction has gotten worse - assert model.loss_history[-1] < 0.001 - - -@pytest.mark.slow -def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): - - print('\nTesting performance on the standard gold balls dataset') - - dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(gold_ball_cxi) - - pad = 10 - dataset.pad(pad) - - model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, - probe_support_radius=50, - propagation_distance=2e-6, - units='um', - probe_fourier_crop=pad - ) - - model.translation_offsets.data += \ - 0.7 * t.randn_like(model.translation_offsets) - - # Not much probe intensity instability in this dataset, no need for this - model.weights.requires_grad = False - - print('Running reconstruction on provided --reconstruction_device,', - reconstruction_device) - model.to(device=reconstruction_device) - dataset.get_as(device=reconstruction_device) - - for loss in model.Adam_optimize(20, dataset, lr=0.005, batch_size=50): - print(model.report()) - if show_plot and model.epoch % 10 == 0: - model.inspect(dataset) - - for loss in model.Adam_optimize(50, 
dataset, lr=0.002, batch_size=100): - print(model.report()) - if show_plot and model.epoch % 10 == 0: - model.inspect(dataset) - - for loss in model.Adam_optimize(100, dataset, lr=0.001, batch_size=100, - schedule=True): - print(model.report()) - if show_plot and model.epoch % 10 == 0: - model.inspect(dataset) - - model.tidy_probes() - - if show_plot: - model.inspect(dataset) - model.compare(dataset) - - # This just comes from running a reconstruction when it was working well - # and choosing a rough value. If it triggers this assertion error, - # something changed to make the final quality worse! - assert model.loss_history[-1] < 0.0001 - + assert model.loss_history[-1] < 0.0013 From 9b0731b53de8f6ecbdb70941d373683fc6cdcfa8 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Mon, 7 Jul 2025 23:53:47 +0000 Subject: [PATCH 090/115] Removed depracated methods from the __all__ in distributed.py --- src/cdtools/tools/distributed/distributed.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index dffb5447..812d0715 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -35,12 +35,9 @@ DISTRIBUTED_PATH = os.path.dirname(os.path.abspath(__file__)) __all__ = ['sync_and_avg_gradients', - 'torchrunner', 'run_single_to_multi_gpu', 'wrap_single_gpu_script', - 'run_speed_test', - '_spawn_wrapper', - 'spawn'] + 'run_speed_test'] def sync_and_avg_gradients(model): From 564f499d7b3376e454187679d789b2a578edd30e Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 11 Jul 2025 16:11:10 +0000 Subject: [PATCH 091/115] Altered the seed synchronization step and reorganized bits of the scripts --- src/cdtools/tools/distributed/distributed.py | 82 +++++++++---------- .../tools/distributed/single_to_multi_gpu.py | 18 ++-- 2 files changed, 52 insertions(+), 48 deletions(-) diff --git 
a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 812d0715..b01d368c 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -33,10 +33,12 @@ import numpy as np DISTRIBUTED_PATH = os.path.dirname(os.path.abspath(__file__)) +MIN_INT64 = np.iinfo(np.int64).min +MAX_INT64 = np.iinfo(np.int64).max __all__ = ['sync_and_avg_gradients', 'run_single_to_multi_gpu', - 'wrap_single_gpu_script', + 'run_single_gpu_script', 'run_speed_test'] @@ -120,42 +122,35 @@ def run_single_to_multi_gpu(): default=1, choices=[0,1], help='Disable (1) or enable (0) NCCL peer-to-peer communication') + parser.add_argument('--seed', + type=int, + default=None, + help='Sets the RNG seed for all devices') parser.add_argument('script_path', type=str, help='Single GPU script file name (with or without .py extension)') # Get the arguments args = parser.parse_args() - - # Set an environment variable for OMP_NUM_THREADS (sets number of threads) - - # Don't let the user die in anticipation - print(f'\n[INFO]: Starting up multi-GPU reconstructions with {args.ngpus} GPUs.\n') - - # Set a random seed that all participaring workers will use - seed = t.initial_seed() # Perform the torchrun call of the wrapped function subprocess.run(['torchrun', # We set up the torchrun arguments first '--standalone', # Indicates that we're running a single machine, multiple GPU job. 
f'--nnodes={args.nnodes}', f'--nproc_per_node={args.ngpus}', - os.path.join(DISTRIBUTED_PATH,'single_to_multi_gpu.py'), # Make the call to the single-to-multi-gpu wrapper script + '-m', + 'cdtools.tools.distributed.single_to_multi_gpu', # Make the call to the single-to-multi-gpu wrapper script f'--backend={args.backend}', f'--timeout={args.timeout}', f'--nccl_p2p_disable={args.nccl_p2p_disable}', - f'--script_path={args.script_path}', - f'--seed={seed}']) + f'{args.script_path}']) - # Let the user know the job is done - print(f'\n[INFO]: Reconstructions complete.\n') - - -def wrap_single_gpu_script(script_path: str, + +def run_single_gpu_script(script_path: str, backend: str = 'nccl', timeout: int = 30, nccl_p2p_disable: bool = True, - seed: int = 0): + seed: int = None): """ Wraps single-GPU reconstruction scripts to be ran as a multi-GPU job via torchrun calls. @@ -206,28 +201,24 @@ def wrap_single_gpu_script(script_path: str, are at 100% useage but the program isn't doing anything, try enabling this variable. seed: int - Seed for generating random numbers. This value must be identical across all - participating devices. + Seed for generating random numbers. """ - ###################### Check if the script is safe to run ###################### - ################################################################################### - - # 1) Check if the file path actually exists + + # Check if the file path actually exists before starting the process group if not os.path.exists(script_path): raise FileNotFoundError(f'Cannot open file: {os.path.join(os.getcwd(), script_path)}') - - # 2) Check if the file is a CDTools reconstruction script. - with open(script_path, 'r') as f: - source_code = f.read() - if not ('import cdtools' in source_code or 'from cdtools' in source_code): - raise ValueError('File is not a CDTools reconstruction script (the script must import cdtools modules).') - + + # Enable/disable NCCL peer-to-peer communication. 
The boolean needs to be converted into + # a string for the environment variable. + os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) + ########## Force each subprocess to see only the GPU ID we assign it ########### ################################################################################### - # The GPU rank is visible as an environment variable through torchrun calls. + # The GPU rank and world_size is visible as an environment variable through torchrun calls. rank = int(os.environ.get('RANK')) + world_size = int(os.environ.get('WORLD_SIZE')) # If the CDTOOLS_GPU_IDS environment variable is defined, then assign based # on the GPU IDS provided in that list. Otherwise, use the rank for the GPU ID. @@ -242,25 +233,34 @@ def wrap_single_gpu_script(script_path: str, ################################ Run the script ################################# ################################################################################### - - # Enable/disable NCCL peer-to-peer communication. The boolean needs to be converted into - # a string for the environment variable. 
- os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) + + if rank == 0: + print(f'\n[INFO]: Starting up multi-GPU reconstructions with {world_size} GPUs.') # Start up the process group (lets the different subprocesses can talk with each other) dist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=timeout)) - # Force this subprocess to use a given RNG seed (should be identical across all subprocesses) - t.manual_seed(seed) - t.cuda.manual_seed_all(seed) - try: + # Force all subprocesses to either use the pre-specified or Rank 0's RNG seed + if seed is None: + seed_local = t.tensor(np.random.randint(MIN_INT64, MAX_INT64), device='cuda', dtype=t.int64) + dist.broadcast(seed_local, 0) + seed = seed_local.item() + + t.manual_seed(seed) + # Run the single-GPU reconstruction script - runpy.run_path(script_path, run_name='__main__') + script_variables = runpy.run_path(script_path, run_name='__main__') + + # Let the user know the job is done + if rank == 0: + print(f'[INFO]: Reconstructions complete. Terminating process group.') finally: # Kill the process group dist.destroy_process_group() + if rank == 0: + print(f'[INFO]: Process group terminated. 
Multi-GPU job complete.') def run_speed_test(world_sizes: int, diff --git a/src/cdtools/tools/distributed/single_to_multi_gpu.py b/src/cdtools/tools/distributed/single_to_multi_gpu.py index e365349e..25f565b7 100644 --- a/src/cdtools/tools/distributed/single_to_multi_gpu.py +++ b/src/cdtools/tools/distributed/single_to_multi_gpu.py @@ -21,9 +21,6 @@ def get_args(): # Define the arguments we need to pass to dist.script_wrapper parser = argparse.ArgumentParser() - parser.add_argument('--script_path', - type=str, - help='Single GPU script file name (with or without .py extension)') parser.add_argument('--backend', type=str, default='nccl', @@ -40,18 +37,25 @@ def get_args(): help='Disable (1) or enable (0) NCCL peer-to-peer communication') parser.add_argument('--seed', type=int, - default=0, + default=None, help='Sets the RNG seed for all devices') + parser.add_argument('script_path', + type=str, + help='Single GPU script file name (with or without .py extension)') return parser.parse_args() -if __name__ == '__main__': +def main(): # Get args args = get_args() # Pass arguments to dist.script_wrapper - dist.wrap_single_gpu_script(script_path=args.script_path, + dist.run_single_gpu_script(script_path=args.script_path, backend=args.backend, timeout=args.timeout, nccl_p2p_disable=bool(args.nccl_p2p_disable), - seed=args.seed) \ No newline at end of file + seed=args.seed) + + +if __name__ == '__main__': + main() \ No newline at end of file From 2dbb3075d38720173730d3d651639b9d0a8a8f7c Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 11 Jul 2025 16:37:30 +0000 Subject: [PATCH 092/115] Record the time when each CDIModel.loss_history value is stored --- examples/fancy_ptycho_speed_test.py | 10 +--------- src/cdtools/models/base.py | 7 ++++++- src/cdtools/reconstructors/base.py | 1 + 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/fancy_ptycho_speed_test.py b/examples/fancy_ptycho_speed_test.py index dfb0a3ff..b09e6671 100644 --- 
a/examples/fancy_ptycho_speed_test.py +++ b/examples/fancy_ptycho_speed_test.py @@ -8,11 +8,6 @@ # We need to know which trial number this script is running TRIAL_NUMBER = int(os.environ.get('CDTOOLS_TRIAL_NUMBER')) -# Create a list to keep track of when each module report was printed -t_list = [] -# Start counting time -t_start = time.time() - filename = 'example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) @@ -34,17 +29,14 @@ for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=40): if model.rank == 0: print(model.report()) - t_list.append(time.time() - t_start) for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=40): if model.rank == 0: print(model.report()) - t_list.append(time.time() - t_start) for loss in model.Adam_optimize(25, dataset, lr=0.001, batch_size=40): if model.rank == 0: print(model.report()) - t_list.append(time.time() - t_start) # This orthogonalizes the recovered probe modes model.tidy_probes() @@ -55,7 +47,7 @@ file_name = f'speed_test_nGPUs_{model.world_size}_TRIAL_{TRIAL_NUMBER}' # Grab the loss and time history loss_history = model.loss_history - time_history = t_list + time_history = model.loss_times # Store quantities in a dictionary dict = {'loss history':loss_history, 'time history':time_history, diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index b82d1124..f564d5c8 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -77,7 +77,12 @@ def __init__(self): self.rank = int(rank) if rank is not None else 0 # Total number of GPUs being used. self.world_size = int(world_size) if world_size is not None else 1 - self.multi_gpu_used = int(self.world_size) > 1 + self.multi_gpu_used = int(self.world_size) > 1 + + # Keep track of the time each loss history point was taken relative to + # the initialization of this model. 
+ self.INITIAL_TIME = time.time() + self.loss_times = [] def from_dataset(self, dataset): diff --git a/src/cdtools/reconstructors/base.py b/src/cdtools/reconstructors/base.py index 6cc10ef4..cca04883 100644 --- a/src/cdtools/reconstructors/base.py +++ b/src/cdtools/reconstructors/base.py @@ -227,6 +227,7 @@ def closure(): self.scheduler.step(loss) self.model.loss_history.append(loss) + self.model.loss_times.append(time.time() - self.model.INITIAL_TIME) self.model.epoch = len(self.model.loss_history) self.model.latest_iteration_time = time.time() - t0 self.model.training_history += self.model.report() + '\n' From 54691fb7821f5f402889662bf636154f9b03f587 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 11 Jul 2025 20:25:04 +0000 Subject: [PATCH 093/115] Created speed test decorator, changed speed test environ variables, and cleaned up documentation in distributed --- examples/distributed_speed_test.py | 12 +- examples/fancy_ptycho_speed_test.py | 99 +++++----- src/cdtools/tools/distributed/distributed.py | 179 ++++++++++++++----- 3 files changed, 188 insertions(+), 102 deletions(-) diff --git a/examples/distributed_speed_test.py b/examples/distributed_speed_test.py index 15a5d9c0..14e2a0bb 100644 --- a/examples/distributed_speed_test.py +++ b/examples/distributed_speed_test.py @@ -19,17 +19,21 @@ # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': # Define the number of GPUs to use. 
- world_sizes = [1, 2, 4] + world_sizes = [1+i for i in range(7)] # How many reconstruction runs to perform for statistics runs = 3 # Define where the single-GPU script is located - script_path = 'fancy_ptycho_speed_test.py' + #script_path = 'fancy_ptycho_speed_test.py' + script_path = 'gold_ball_ptycho.py' # Define where the loss-vs-time data is being stored in - output_dir = 'example_loss_data' + output_dir = 'example_loss_data4' + + # Define what prefix you want on the file + file_prefix = 'speed_test' # Run the test - dist.run_speed_test(world_sizes, runs, script_path, output_dir) + dist.run_speed_test(world_sizes, runs, script_path, output_dir, file_prefix) \ No newline at end of file diff --git a/examples/fancy_ptycho_speed_test.py b/examples/fancy_ptycho_speed_test.py index b09e6671..e15c4f7e 100644 --- a/examples/fancy_ptycho_speed_test.py +++ b/examples/fancy_ptycho_speed_test.py @@ -1,59 +1,44 @@ import cdtools -import time -import pickle -import os - - -# Script is intended to be called by distributed_speed_test.py. 
-# We need to know which trial number this script is running -TRIAL_NUMBER = int(os.environ.get('CDTOOLS_TRIAL_NUMBER')) - -filename = 'example_data/lab_ptycho_data.cxi' -dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) - -# FancyPtycho is the workhorse model -model = cdtools.models.FancyPtycho.from_dataset( - dataset, - n_modes=3, # Use 3 incoherently mixing probe modes - oversampling=2, # Simulate the probe on a 2xlarger real-space array - probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix - propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm - units='mm', # Set the units for the live plots - obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix -) - -device = 'cuda' -model.to(device=device) -dataset.get_as(device=device) - -for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=40): - if model.rank == 0: - print(model.report()) - -for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=40): - if model.rank == 0: - print(model.report()) - -for loss in model.Adam_optimize(25, dataset, lr=0.001, batch_size=40): - if model.rank == 0: - print(model.report()) - -# This orthogonalizes the recovered probe modes -model.tidy_probes() - -# Save the model and loss history -if model.rank == 0: - # Set up the file name: - file_name = f'speed_test_nGPUs_{model.world_size}_TRIAL_{TRIAL_NUMBER}' - # Grab the loss and time history - loss_history = model.loss_history - time_history = model.loss_times - # Store quantities in a dictionary - dict = {'loss history':loss_history, - 'time history':time_history, - 'nGPUs':model.world_size, - 'trial':TRIAL_NUMBER} - # Save the quantities - with open (f'example_loss_data/'+file_name+'.pkl', 'wb') as f: - pickle.dump(dict, f) + +@cdtools.tools.distributed.report_speed_test +def main(): + filename = 'example_data/lab_ptycho_data.cxi' + dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + + # FancyPtycho is the workhorse model + 
model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, # Use 3 incoherently mixing probe modes + oversampling=2, # Simulate the probe on a 2xlarger real-space array + probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix + propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm + units='mm', # Set the units for the live plots + obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix + ) + + device = 'cuda' + model.to(device=device) + dataset.get_as(device=device) + + + for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=40): + if model.rank == 0: + print(model.report()) + + for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=40): + if model.rank == 0: + print(model.report()) + + for loss in model.Adam_optimize(25, dataset, lr=0.001, batch_size=40): + if model.rank == 0: + print(model.report()) + + # This orthogonalizes the recovered probe modes + model.tidy_probes() + + return model + + +if __name__ == '__main__': + main() diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index b01d368c..42ba80aa 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -31,6 +31,8 @@ from matplotlib import pyplot as plt import pickle import numpy as np +from typing import Callable +from pathlib import Path DISTRIBUTED_PATH = os.path.dirname(os.path.abspath(__file__)) MIN_INT64 = np.iinfo(np.int64).min @@ -39,6 +41,7 @@ __all__ = ['sync_and_avg_gradients', 'run_single_to_multi_gpu', 'run_single_gpu_script', + 'report_speed_test', 'run_speed_test'] @@ -59,27 +62,48 @@ def sync_and_avg_gradients(model): def run_single_to_multi_gpu(): """ - Runs a single-GPU reconstruction script as a multi-GPU job via torchrun. + Runs a single-GPU reconstruction script as a single-node multi-GPU job via torchrun. - This function can be executed as `cdt-torchrun` in the command line. 
- - This function is a wrapper over both the single-GPU wrapping sc + This function can be executed as a python console script as `cdt-torchrun` and + serves as a wrapper over a `torchrun` call to `cdtools.tools.distributed.single_to_multi_gpu`. - For example, if we have the reconstruction script `reconstruct.py` and want to use - 4 GPUs, we can write the following: - + In the simplest case, a single-GPU script can be ran as a multi-GPU job using + the following `cdt-torchrun` call in the command line + ``` + cdt-torchrun --ngpus= YOUR_RECONSTRUCTION_SCRIPT.py ``` - cdt-torchrun --nproc_per_node=4 s script_path=reconstruct.py + + which is equivalent to the following `torchrun` call + ``` + torchrun + --standalone + --nnodes=1 + --nproc_per_node= + -m cdtools.tools.distributed.single_to_multi_gpu + --backend='nccl' + --timeout=30 + --nccl_p2p_disable=1 + YOUR_RECONSTRUCTION_SCRIPT.py ``` + + With a single node (--nnodes=1), `cdt-torchrun` will launch a given number of subprocesses + equivalent to the number of GPUs specified. This number must be less than or equal to the + actual number of GPUs available on your node. If you want to use specific GPU IDs for reconstructions, you need to set up the environment variable `CDTOOLS_GPU_IDS` rather than `CUDA_VISIBLE_DEVICES`. If you wanted to use GPU IDs `1, 3, 4` for example, write: ``` - CDTOOLS_GPU_IDS=1,3,4 cdt-torchrun --nnodes=1 --nproc_per_node=3 reconstruct.py + CDTOOLS_GPU_IDS=1,3,4 cdt-torchrun --ngpus=3 YOUR_RECONSTRUCTION_SCRIPT.py ``` + If additional `torchrun` arguments need to be passed, you may need to make a direct + `torchrun` call rather than use `cdt-torchrun`. You may also submit an issue/PR. + + NOTE: `cdt-torchrun` has only been tested using the 'nccl' backend, NCCL peer-to-peer communication + disabled, and using 1 node. + Arguments: script_path: str Path of the single-GPU script (either full or partial path). 
@@ -122,10 +146,6 @@ def run_single_to_multi_gpu(): default=1, choices=[0,1], help='Disable (1) or enable (0) NCCL peer-to-peer communication') - parser.add_argument('--seed', - type=int, - default=None, - help='Sets the RNG seed for all devices') parser.add_argument('script_path', type=str, help='Single GPU script file name (with or without .py extension)') @@ -139,7 +159,7 @@ def run_single_to_multi_gpu(): f'--nnodes={args.nnodes}', f'--nproc_per_node={args.ngpus}', '-m', - 'cdtools.tools.distributed.single_to_multi_gpu', # Make the call to the single-to-multi-gpu wrapper script + 'cdtools.tools.distributed.single_to_multi_gpu', f'--backend={args.backend}', f'--timeout={args.timeout}', f'--nccl_p2p_disable={args.nccl_p2p_disable}', @@ -147,36 +167,49 @@ def run_single_to_multi_gpu(): def run_single_gpu_script(script_path: str, - backend: str = 'nccl', - timeout: int = 30, - nccl_p2p_disable: bool = True, - seed: int = None): + backend: str = 'nccl', + timeout: int = 30, + nccl_p2p_disable: bool = True, + seed: int = None): """ Wraps single-GPU reconstruction scripts to be ran as a multi-GPU job via torchrun calls. 
- This function is intended to be called in a script (say, single_to_multi_gpu.py) - with the following form: + `cdtools.tools.distributed.run_single_gpu_script` is intended to be called in a script + (e.g., cdtools.tools.distributed.single_to_multi_gpu) with the following form: ``` + # multi_gpu_job.py import cdtools.tools.distributed as dist if __name__ == '__main__': - dist.torchrun_single_to_multi_gpu(**kwargs) + dist.run_single_to_multi_gpu(script_path='YOUR_RECONSTRUCTION_SCRIPT.py', + backend='nccl', + timeout=30, + nccl_p2p_disable=True) ``` - torchrun should then be used to run this script as a distributive job using, + `torchrun` should then be used to run this script as a distributive job using, for instance: ``` - torchrun --nnodes=1 --nproc_per_node=4 single_to_multi_gpu.py + torchrun --nnodes=1 --nproc_per_node= multi_gpu_job.py ``` + `torchrun` will spawn a number of subprocesses equal to the number of GPUs specified + (--nproc_per_node). On each subprocess, `cdtools.tools.distributed.run_single_gpu_script` + will set up process groups (lets each GPU communicate with each other) and environment + variables necessary for multi-GPU jobs. The single-GPU script will then be ran by + each subprocess, where gradient synchronization will be faciliated by + `cdtools.tools.distributed.sync_and_avg_gradients` calls from `cdtools.Reconstructors` + while data shuffling/loading is handled by `cdtools.Reconstructor.setup_dataloader`. + + If you want to use specific GPU IDs for reconstructions, you need to set up the environment variable `CDTOOLS_GPU_IDS` rather than `CUDA_VISIBLE_DEVICES`. 
If you wanted to use GPU IDs `1, 3, 4` for example, write: ``` - CDTOOLS_GPU_IDS=1,3,4 torchrun --nnodes=1 --nproc_per_node=3 single_to_multi_gpu.py + CDTOOLS_GPU_IDS=1,3,4 torchrun --nnodes=1 --nproc_per_node= multi_gpu_job.py ``` NOTE: For each subprocess `cdt-torchrun` creates, the environment variable @@ -214,7 +247,12 @@ def run_single_gpu_script(script_path: str, os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) ########## Force each subprocess to see only the GPU ID we assign it ########### - ################################################################################### + # Why am I doing this? If this constraint is not imposed, then calling all_reduce will + # cause all subprocess Ranks to occupy memory on both their own respective GPUs (normal) + # as well as Rank 0's GPU (not intended behavior). The root cause is not entirely clear + # but there are two ways to avoid this empirically: + # 1) Force each subprocesses' CUDA_VISIBLE_DEVICE to be their assigned GPU ids. + # 2) Within the reconstruction script, change `device='cuda'` to `device=f'cuda{model.rank}'` # The GPU rank and world_size is visible as an environment variable through torchrun calls. 
rank = int(os.environ.get('RANK')) @@ -232,12 +270,11 @@ def run_single_gpu_script(script_path: str, os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) ################################ Run the script ################################# - ################################################################################### if rank == 0: - print(f'\n[INFO]: Starting up multi-GPU reconstructions with {world_size} GPUs.') + print(f'[INFO]: Starting up multi-GPU reconstructions with {world_size} GPUs.') - # Start up the process group (lets the different subprocesses can talk with each other) + # Start up the process group (lets the different subprocesses talk with each other) dist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=timeout)) try: @@ -250,7 +287,7 @@ def run_single_gpu_script(script_path: str, t.manual_seed(seed) # Run the single-GPU reconstruction script - script_variables = runpy.run_path(script_path, run_name='__main__') + runpy.run_path(script_path, run_name='__main__') # Let the user know the job is done if rank == 0: @@ -263,15 +300,69 @@ def run_single_gpu_script(script_path: str, print(f'[INFO]: Process group terminated. Multi-GPU job complete.') +def report_speed_test(func: Callable): + """ + Decorator function which saves the loss-versus-time/epoch history of a + function-wrapped reconstruction script to a pickle dump file. This function + is intended to be used for multi-GPU test studies performed with + `cdtools.tools.distributed.run_speed_test`, which sets several environment + variables specifing the name and directory of the result files to be saved. + + If the directory specified by `CDTOOLS_SPEED_TEST_RESULT_DIR` does not exist, + one will be created in the current directory. + + Parameters: + func: Callable + The entire reconstruction script wrapped in a function. Within the + script, the function must be called with an if-name-main statement. + Additionally, the function must return the reconstructed model. 
+ """ + def wrapper(): + # Figure out how to name the save file and where to save it to + # These environment variables are provided by run_speed_test + trial_number = int(os.environ.get('CDTOOLS_TRIAL_NUMBER')) + save_dir = os.environ.get('CDTOOLS_SPEED_TEST_RESULT_DIR') + file_prefix = os.environ.get('CDTOOLS_SPEED_TEST_PREFIX') + + # Check if the save path is valid + # Make sure the directory exists; or else create it + Path(save_dir).mkdir(parents=False, exist_ok=True) + + # Run the script + model = func() + + # Save the model and loss history, but only using the rank 0 process + if model.rank == 0: + # Set up the file name: + file_name = f'{file_prefix}_nGPUs_{model.world_size}_TRIAL_{trial_number}.pkl' + # Grab the loss and time history + loss_history = model.loss_history + time_history = model.loss_times + # Store quantities in a dictionary + dict = {'loss history':loss_history, + 'time history':time_history, + 'nGPUs':model.world_size, + 'trial':trial_number} + + # Save the quantities + with open (os.path.join(save_dir, file_name), 'wb') as save_file: + pickle.dump(dict, save_file) + + print(f'[INFO]: Saved results to: {file_name}') + return wrapper + + def run_speed_test(world_sizes: int, runs: int, script_path: str, - output_dir: str): + output_dir: str, + file_prefix: str = 'speed_test'): """ Executes a reconstruction script `n` x `m` times using `n` GPUs and `m` trials per GPU count using cdt-torchrun. - This function assumes that + If the directory specified by `output_dir` does not exist, + one will be created in the current directory. Parameters: world_sizes: list[int] @@ -282,6 +373,8 @@ def run_speed_test(world_sizes: int, Path of the single-gpu reconstruction script. output_dir: str Directory of the loss-vs-time/epoch data generated for the speed test. 
+ file_prefix: str + Prefix of the speed test result file names """ # Set stuff up for plots fig, (ax1,ax2,ax3) = plt.subplots(1,3) @@ -291,29 +384,32 @@ def run_speed_test(world_sizes: int, std_1gpu = 0 for world_size in world_sizes: - # Get the GPU IDs to use - #dev_id = device_ids[0:world_size] - #print(f'\nNumber of GPU(s): {world_size} | Using GPU IDs {*dev_id,}') - # Make a list to store the values time_list = [] loss_hist_list = [] for i in range(runs): - print(f'Resetting the model...') - print(f'Starting run {i+1}/{runs} on {world_size} GPU(s)') + print(f'[INFO]: Resetting the model...') + print(f'[INFO]: Starting run {i+1}/{runs} on {world_size} GPU(s)') # The scripts running speed tests need to read the trial number # they are on using an environment variable os.environ['CDTOOLS_TRIAL_NUMBER'] = str(i) + os.environ['CDTOOLS_SPEED_TEST_RESULT_DIR'] = output_dir + os.environ['CDTOOLS_SPEED_TEST_PREFIX'] = file_prefix # Run cdt-torchrun - subprocess.run(['cdt-torchrun', - f'--ngpus={world_size}', - f'{script_path}']) + try: + subprocess.run(['cdt-torchrun', + f'--ngpus={world_size}', + f'{script_path}'], + check=True) + except subprocess.CalledProcessError as e: + print(e) print(f'[INFO]: Reconstruction complete. 
Loading loss results...') - with open(os.path.join(output_dir, f'speed_test_nGPUs_{world_size}_TRIAL_{i}.pkl'), 'rb') as f: + + with open(os.path.join(output_dir, f'{file_prefix}_nGPUs_{world_size}_TRIAL_{i}.pkl'), 'rb') as f: results = pickle.load(f) time_list.append(results['time history']) loss_hist_list.append(results['loss history']) @@ -358,3 +454,4 @@ def run_speed_test(world_sizes: int, ax3.set_xlabel('Number of GPUs') ax3.set_ylabel('Speed-up relative to single GPU') plt.show() + print(f'[INFO]: Multi-GPU speed test completed.') \ No newline at end of file From 7c59d661259e8b471b8c5d22c062b17369319256 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 11 Jul 2025 20:26:20 +0000 Subject: [PATCH 094/115] Rearranged distributed.py --- src/cdtools/tools/distributed/distributed.py | 212 +++++++++---------- 1 file changed, 106 insertions(+), 106 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 42ba80aa..a8910a00 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -60,112 +60,6 @@ def sync_and_avg_gradients(model): param.grad.data /= model.world_size -def run_single_to_multi_gpu(): - """ - Runs a single-GPU reconstruction script as a single-node multi-GPU job via torchrun. - - This function can be executed as a python console script as `cdt-torchrun` and - serves as a wrapper over a `torchrun` call to `cdtools.tools.distributed.single_to_multi_gpu`. 
- - In the simplest case, a single-GPU script can be ran as a multi-GPU job using - the following `cdt-torchrun` call in the command line - ``` - cdt-torchrun --ngpus= YOUR_RECONSTRUCTION_SCRIPT.py - ``` - - which is equivalent to the following `torchrun` call - ``` - torchrun - --standalone - --nnodes=1 - --nproc_per_node= - -m cdtools.tools.distributed.single_to_multi_gpu - --backend='nccl' - --timeout=30 - --nccl_p2p_disable=1 - YOUR_RECONSTRUCTION_SCRIPT.py - ``` - - With a single node (--nnodes=1), `cdt-torchrun` will launch a given number of subprocesses - equivalent to the number of GPUs specified. This number must be less than or equal to the - actual number of GPUs available on your node. - - If you want to use specific GPU IDs for reconstructions, you need to set up - the environment variable `CDTOOLS_GPU_IDS` rather than `CUDA_VISIBLE_DEVICES`. - If you wanted to use GPU IDs `1, 3, 4` for example, write: - - ``` - CDTOOLS_GPU_IDS=1,3,4 cdt-torchrun --ngpus=3 YOUR_RECONSTRUCTION_SCRIPT.py - ``` - - If additional `torchrun` arguments need to be passed, you may need to make a direct - `torchrun` call rather than use `cdt-torchrun`. You may also submit an issue/PR. - - NOTE: `cdt-torchrun` has only been tested using the 'nccl' backend, NCCL peer-to-peer communication - disabled, and using 1 node. - - Arguments: - script_path: str - Path of the single-GPU script (either full or partial path). - --ngpus: int - Number of GPUs to use. - --nnodes: int - Optional, number of nodes. Default 1; more than 1 nodes has not been tested. - --backend: str - Optional, communication backend for distributed computing (either `nccl` or `gloo`). - Default is `nccl` - --timeout: int - Optional, time in seconds before the distributed process is killed. - Default is 30 seconds. - --nccl_p2p_disable: int - Optional, disable (1) or enable (0) NCCL peer-to-peer communication. Default - is 1. 
- - """ - # Define the arguments we need to pass to dist.script_wrapper - parser = argparse.ArgumentParser() - - parser.add_argument('--ngpus', - type=int, - help='Number of GPUs to use (called --nproc_per_node in torchrun)') - parser.add_argument('--nnodes', - type=str, - default=1, - help='Number of nodes participating in distributive computing.') - parser.add_argument('--backend', - type=str, - default='nccl', - choices=['nccl', 'gloo'], - help='Communication backend (nccl or gloo)') - parser.add_argument('--timeout', - type=int, - default=30, - help='Time before process is killed in seconds') - parser.add_argument('--nccl_p2p_disable', - type=int, - default=1, - choices=[0,1], - help='Disable (1) or enable (0) NCCL peer-to-peer communication') - parser.add_argument('script_path', - type=str, - help='Single GPU script file name (with or without .py extension)') - - # Get the arguments - args = parser.parse_args() - - # Perform the torchrun call of the wrapped function - subprocess.run(['torchrun', # We set up the torchrun arguments first - '--standalone', # Indicates that we're running a single machine, multiple GPU job. - f'--nnodes={args.nnodes}', - f'--nproc_per_node={args.ngpus}', - '-m', - 'cdtools.tools.distributed.single_to_multi_gpu', - f'--backend={args.backend}', - f'--timeout={args.timeout}', - f'--nccl_p2p_disable={args.nccl_p2p_disable}', - f'{args.script_path}']) - - def run_single_gpu_script(script_path: str, backend: str = 'nccl', timeout: int = 30, @@ -300,6 +194,112 @@ def run_single_gpu_script(script_path: str, print(f'[INFO]: Process group terminated. Multi-GPU job complete.') +def run_single_to_multi_gpu(): + """ + Runs a single-GPU reconstruction script as a single-node multi-GPU job via torchrun. + + This function can be executed as a python console script as `cdt-torchrun` and + serves as a wrapper over a `torchrun` call to `cdtools.tools.distributed.single_to_multi_gpu`. 
+ + In the simplest case, a single-GPU script can be ran as a multi-GPU job using + the following `cdt-torchrun` call in the command line + ``` + cdt-torchrun --ngpus= YOUR_RECONSTRUCTION_SCRIPT.py + ``` + + which is equivalent to the following `torchrun` call + ``` + torchrun + --standalone + --nnodes=1 + --nproc_per_node= + -m cdtools.tools.distributed.single_to_multi_gpu + --backend='nccl' + --timeout=30 + --nccl_p2p_disable=1 + YOUR_RECONSTRUCTION_SCRIPT.py + ``` + + With a single node (--nnodes=1), `cdt-torchrun` will launch a given number of subprocesses + equivalent to the number of GPUs specified. This number must be less than or equal to the + actual number of GPUs available on your node. + + If you want to use specific GPU IDs for reconstructions, you need to set up + the environment variable `CDTOOLS_GPU_IDS` rather than `CUDA_VISIBLE_DEVICES`. + If you wanted to use GPU IDs `1, 3, 4` for example, write: + + ``` + CDTOOLS_GPU_IDS=1,3,4 cdt-torchrun --ngpus=3 YOUR_RECONSTRUCTION_SCRIPT.py + ``` + + If additional `torchrun` arguments need to be passed, you may need to make a direct + `torchrun` call rather than use `cdt-torchrun`. You may also submit an issue/PR. + + NOTE: `cdt-torchrun` has only been tested using the 'nccl' backend, NCCL peer-to-peer communication + disabled, and using 1 node. + + Arguments: + script_path: str + Path of the single-GPU script (either full or partial path). + --ngpus: int + Number of GPUs to use. + --nnodes: int + Optional, number of nodes. Default 1; more than 1 nodes has not been tested. + --backend: str + Optional, communication backend for distributed computing (either `nccl` or `gloo`). + Default is `nccl` + --timeout: int + Optional, time in seconds before the distributed process is killed. + Default is 30 seconds. + --nccl_p2p_disable: int + Optional, disable (1) or enable (0) NCCL peer-to-peer communication. Default + is 1. 
+ + """ + # Define the arguments we need to pass to dist.script_wrapper + parser = argparse.ArgumentParser() + + parser.add_argument('--ngpus', + type=int, + help='Number of GPUs to use (called --nproc_per_node in torchrun)') + parser.add_argument('--nnodes', + type=str, + default=1, + help='Number of nodes participating in distributive computing.') + parser.add_argument('--backend', + type=str, + default='nccl', + choices=['nccl', 'gloo'], + help='Communication backend (nccl or gloo)') + parser.add_argument('--timeout', + type=int, + default=30, + help='Time before process is killed in seconds') + parser.add_argument('--nccl_p2p_disable', + type=int, + default=1, + choices=[0,1], + help='Disable (1) or enable (0) NCCL peer-to-peer communication') + parser.add_argument('script_path', + type=str, + help='Single GPU script file name (with or without .py extension)') + + # Get the arguments + args = parser.parse_args() + + # Perform the torchrun call of the wrapped function + subprocess.run(['torchrun', # We set up the torchrun arguments first + '--standalone', # Indicates that we're running a single machine, multiple GPU job. 
+ f'--nnodes={args.nnodes}', + f'--nproc_per_node={args.ngpus}', + '-m', + 'cdtools.tools.distributed.single_to_multi_gpu', + f'--backend={args.backend}', + f'--timeout={args.timeout}', + f'--nccl_p2p_disable={args.nccl_p2p_disable}', + f'{args.script_path}']) + + def report_speed_test(func: Callable): """ Decorator function which saves the loss-versus-time/epoch history of a From ce9541acefaec57fceeebf74da4a1698ec927d23 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 11 Jul 2025 20:29:01 +0000 Subject: [PATCH 095/115] Updated the documentation in distributed.py --- src/cdtools/tools/distributed/distributed.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index a8910a00..397d10de 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -7,17 +7,9 @@ are calculated (`loss.backwards()`) on each GPU, the gradients need to be synchronized and averaged across all participating GPUs. -The functions in this module assist with both gradient synchronization and -setting up conditions necessary to perform distributive computing. Some -functions in this module require parts of the user-written -reconstruction script to be first wrapped in a function (as shown in -examples/fancy_ptycho_multi_gpu_ddp.py). The functions in this module -are designed to wrap around/call these user-defined functions, enabling -reconstructions to be performed across several GPUs. - -NOTE: These methods however do not define how the Dataset is -distributed across each device; this process can be handled by using -DistributedSampler with the DataLoader. +The functions in this module assist with gradient synchronization, +setting up conditions necessary to perform distributive computing, and +executing multi-GPU jobs. 
""" import torch as t From 5a523cf00fab76bcaa97123dae33b89f953a6bd0 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 11 Jul 2025 20:31:15 +0000 Subject: [PATCH 096/115] Got rid of unused imports in CDIModel --- src/cdtools/models/base.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index f564d5c8..19948df2 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -29,17 +29,11 @@ """ import torch as t -from torch.utils import data as torchdata -from torch.utils.data.distributed import DistributedSampler -import torch.distributed as dist from matplotlib import pyplot as plt from matplotlib.widgets import Slider from matplotlib import ticker import numpy as np -import threading -import queue import time -from scipy import io from contextlib import contextmanager from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch from cdtools.datasets import CDataset From 6bd71871aa3d56d0ea53b40472bb53321f2c04a9 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 11 Jul 2025 21:08:43 +0000 Subject: [PATCH 097/115] Added CDIModel import to distributed --- src/cdtools/tools/distributed/distributed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 397d10de..1d9d36f5 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -25,6 +25,7 @@ import numpy as np from typing import Callable from pathlib import Path +from cdtools.models import CDIModel DISTRIBUTED_PATH = os.path.dirname(os.path.abspath(__file__)) MIN_INT64 = np.iinfo(np.int64).min @@ -37,7 +38,7 @@ 'run_speed_test'] -def sync_and_avg_gradients(model): +def sync_and_avg_gradients(model: CDIModel): """ Synchronizes the average of the model parameter gradients across all participating GPUs. 
From 13cfee5c54ed75b7cce21ce4bfcf8afcd66bc27e Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Wed, 16 Jul 2025 18:41:43 +0000 Subject: [PATCH 098/115] Optional plotting and result saving/deleting added to speed test. Also removed numpy dependency from distributed --- examples/distributed_speed_test.py | 24 +-- examples/fancy_ptycho_speed_test.py | 2 +- src/cdtools/tools/distributed/distributed.py | 211 +++++++++++++------ 3 files changed, 165 insertions(+), 72 deletions(-) diff --git a/examples/distributed_speed_test.py b/examples/distributed_speed_test.py index 14e2a0bb..581bbc4c 100644 --- a/examples/distributed_speed_test.py +++ b/examples/distributed_speed_test.py @@ -9,31 +9,31 @@ ''' -from cdtools.tools import distributed as dist -import os - -# If you're running on AMD CPUs, you need to include this or else you will get a -# threading layer error. -os.environ['MKL_THREADING_LAYER'] = 'GNU' +import cdtools.tools.distributed as dist # This will execute the multi_gpu_reconstruct upon running this file if __name__ == '__main__': # Define the number of GPUs to use. 
""" - import torch as t import torch.distributed as dist import datetime @@ -22,14 +21,14 @@ from ast import literal_eval from matplotlib import pyplot as plt import pickle -import numpy as np -from typing import Callable +import random +from typing import Callable, Tuple, List from pathlib import Path from cdtools.models import CDIModel DISTRIBUTED_PATH = os.path.dirname(os.path.abspath(__file__)) -MIN_INT64 = np.iinfo(np.int64).min -MAX_INT64 = np.iinfo(np.int64).max +MIN_INT64 = t.iinfo(t.int64).min +MAX_INT64 = t.iinfo(t.int64).max __all__ = ['sync_and_avg_gradients', 'run_single_to_multi_gpu', @@ -41,7 +40,7 @@ def sync_and_avg_gradients(model: CDIModel): """ Synchronizes the average of the model parameter gradients across all - participating GPUs. + participating GPUs using all_reduce. Parameters: model: CDIModel @@ -62,7 +61,7 @@ def run_single_gpu_script(script_path: str, Wraps single-GPU reconstruction scripts to be ran as a multi-GPU job via torchrun calls. - `cdtools.tools.distributed.run_single_gpu_script` is intended to be called in a script + `run_single_gpu_script` is intended to be called in a script (e.g., cdtools.tools.distributed.single_to_multi_gpu) with the following form: ``` @@ -83,11 +82,11 @@ def run_single_gpu_script(script_path: str, ``` `torchrun` will spawn a number of subprocesses equal to the number of GPUs specified - (--nproc_per_node). On each subprocess, `cdtools.tools.distributed.run_single_gpu_script` + (--nproc_per_node). On each subprocess, `run_single_gpu_script` will set up process groups (lets each GPU communicate with each other) and environment variables necessary for multi-GPU jobs. 
The single-GPU script will then be ran by each subprocess, where gradient synchronization will be faciliated by - `cdtools.tools.distributed.sync_and_avg_gradients` calls from `cdtools.Reconstructors` + `sync_and_avg_gradients` calls from `cdtools.Reconstructors` while data shuffling/loading is handled by `cdtools.Reconstructor.setup_dataloader`. @@ -118,7 +117,7 @@ def run_single_gpu_script(script_path: str, will be aborted and the process calling this method will crash. nccl_p2p_disable: bool Disable NCCL peer-2-peer communication. If you find that all your GPUs - are at 100% useage but the program isn't doing anything, try enabling + are at 100% usege but the program isn't doing anything, try enabling this variable. seed: int Seed for generating random numbers. @@ -140,6 +139,8 @@ def run_single_gpu_script(script_path: str, # but there are two ways to avoid this empirically: # 1) Force each subprocesses' CUDA_VISIBLE_DEVICE to be their assigned GPU ids. # 2) Within the reconstruction script, change `device='cuda'` to `device=f'cuda{model.rank}'` + # + # Option 1 is chosen here to use single-GPU reconstruction scripts AS-IS for multi-GPU jobs. # The GPU rank and world_size is visible as an environment variable through torchrun calls. 
rank = int(os.environ.get('RANK')) @@ -167,7 +168,7 @@ def run_single_gpu_script(script_path: str, try: # Force all subprocesses to either use the pre-specified or Rank 0's RNG seed if seed is None: - seed_local = t.tensor(np.random.randint(MIN_INT64, MAX_INT64), device='cuda', dtype=t.int64) + seed_local = t.tensor(random.randint(MIN_INT64, MAX_INT64), device='cuda', dtype=t.int64) dist.broadcast(seed_local, 0) seed = seed_local.item() @@ -314,13 +315,9 @@ def wrapper(): # Figure out how to name the save file and where to save it to # These environment variables are provided by run_speed_test trial_number = int(os.environ.get('CDTOOLS_TRIAL_NUMBER')) - save_dir = os.environ.get('CDTOOLS_SPEED_TEST_RESULT_DIR') + output_dir = os.environ.get('CDTOOLS_SPEED_TEST_RESULTS_DIR') file_prefix = os.environ.get('CDTOOLS_SPEED_TEST_PREFIX') - # Check if the save path is valid - # Make sure the directory exists; or else create it - Path(save_dir).mkdir(parents=False, exist_ok=True) - # Run the script model = func() @@ -338,7 +335,7 @@ def wrapper(): 'trial':trial_number} # Save the quantities - with open (os.path.join(save_dir, file_name), 'wb') as save_file: + with open (os.path.join(output_dir, file_name), 'wb') as save_file: pickle.dump(dict, save_file) print(f'[INFO]: Saved results to: {file_name}') @@ -349,10 +346,21 @@ def run_speed_test(world_sizes: int, runs: int, script_path: str, output_dir: str, - file_prefix: str = 'speed_test'): + file_prefix: str = 'speed_test', + show_plot: bool = True, + delete_output_files: bool = False, + nnodes: int = 1, + backend: str = 'nccl', + timeout: int = 30, + nccl_p2p_disable: bool = True, + seed: int = None + ) -> Tuple[List[float], + List[float], + List[float], + List[float]]: """ - Executes a reconstruction script `n` x `m` times using `n` GPUs and `m` trials - per GPU count using cdt-torchrun. 
+ Executes a reconstruction script using `n` GPUs and `m` trials per GPU count using + `torchrun` and `cdtools.tools.distributed.single_to_multi_gpu`. If the directory specified by `output_dir` does not exist, one will be created in the current directory. @@ -360,6 +368,7 @@ def run_speed_test(world_sizes: int, Parameters: world_sizes: list[int] Number of GPUs to use. User can specify several GPU counts in a list. + But the first entry must be 1 (single-GPU). runs: int How many repeat reconstructions to perform script_path: str @@ -368,14 +377,63 @@ def run_speed_test(world_sizes: int, Directory of the loss-vs-time/epoch data generated for the speed test. file_prefix: str Prefix of the speed test result file names + show_plot: bool + Show loss-versus-epoch/time and speed-up-versus-GPU count curves + delete_output_files: bool + Removes the results files produced by `report_speed_test` from + the output_dir after each trail run. + nnodes: int + Number of nodes to use. This module has only been tested with 1 node. + backend: str + Communication backend for distributive computing. NVidia Collective + Communications Library ('nccl') is the default and only tested option. + See https://docs.pytorch.org/docs/stable/distributed.html for other + backends supported by pytorch (but have not been tested in this package). + timeout: int + Timeout for operations to be executed in seconds. All processes will be + aborted after the timeout has been exceeded. + nccl_p2p_disable: bool + Sets the `NCCL_P2P_DISABLE` environment variable to enable/disable + nccl peer-to-peer communication. If you find that all your GPUs + are at 100% usage but the program isn't doing anything, try enabling + this variable. + seed: int + Seed for generating random numbers. Default is None (seed is randomly + generated). + + Returns: + final_loss_mean_list: List[float] + Mean final loss value over `runs` iterations for each `world_size` + value specified. 
+ final_loss_std_list: List[float] + Standard deviation of the final loss value over `runs` iterations + for each `world_size`. + speed_up_mean_list: List[float] + Mean runtime speed-up over `runs` iterations for each `world_size` + value specified. Speed-up is defined as the `runtime_nGPUs / runtime_1_GPU`. + speed_up_std_list: List[float] + Standard deviation of the runtime speed-up over `runs` iterations + for each `world_size`. """ + + # Make sure the directory exists; or else create it + Path(output_dir).mkdir(parents=False, exist_ok=True) + # Set stuff up for plots - fig, (ax1,ax2,ax3) = plt.subplots(1,3) + if show_plot: + fig, (ax1,ax2,ax3) = plt.subplots(1,3) # Store the value of the single GPU time time_1gpu = 0 std_1gpu = 0 + # Store values of the different speed-up factors and final losses + # as a function of GPU count + speed_up_mean_list = [] + speed_up_std_list = [] + final_loss_mean_list = [] + final_loss_std_list = [] + for world_size in world_sizes: # Make a list to store the values time_list = [] @@ -386,33 +444,57 @@ def run_speed_test(world_sizes: int, print(f'[INFO]: Starting run {i+1}/{runs} on {world_size} GPU(s)') # The scripts running speed tests need to read the trial number - # they are on using an environment variable - os.environ['CDTOOLS_TRIAL_NUMBER'] = str(i) - os.environ['CDTOOLS_SPEED_TEST_RESULT_DIR'] = output_dir - os.environ['CDTOOLS_SPEED_TEST_PREFIX'] = file_prefix + # they are on using. We send this information using environment + # variables sent to the child processes spawned by subprocess.run + child_env = os.environ.copy() + child_env['CDTOOLS_TRIAL_NUMBER'] = str(i) + child_env['CDTOOLS_SPEED_TEST_RESULTS_DIR'] = output_dir + child_env['CDTOOLS_SPEED_TEST_PREFIX'] = file_prefix + + # Set up the terminal commands + cmd = ['torchrun', # We set up the torchrun arguments first + '--standalone', # Indicates that we're running a single machine, multiple GPU job. 
+ f'--nnodes={nnodes}', + f'--nproc_per_node={world_size}', + '-m', + 'cdtools.tools.distributed.single_to_multi_gpu', + f'--backend={backend}', + f'--timeout={timeout}', + f'--nccl_p2p_disable={int(nccl_p2p_disable)}'] + + if seed is not None: + cmd.append(f'--seed={seed}') + + cmd.append(f'{script_path}') - # Run cdt-torchrun + # Run the single/multi-GPU job try: - subprocess.run(['cdt-torchrun', - f'--ngpus={world_size}', - f'{script_path}'], - check=True) + subprocess.run(cmd, check=True, env=child_env) + except subprocess.CalledProcessError as e: - print(e) + raise e - print(f'[INFO]: Reconstruction complete. Loading loss results...') + # Load the loss results + print('[INFO]: Reconstruction complete. Loading loss results...') + + save_path = os.path.join(output_dir, f'{file_prefix}_nGPUs_{world_size}_TRIAL_{i}.pkl') - with open(os.path.join(output_dir, f'{file_prefix}_nGPUs_{world_size}_TRIAL_{i}.pkl'), 'rb') as f: + with open(save_path, 'rb') as f: results = pickle.load(f) time_list.append(results['time history']) loss_hist_list.append(results['loss history']) + print('[INFO]: Loss results loaded.') + + if delete_output_files: + print(f'[INFO]: Removing {save_path}') + os.remove(save_path) # Calculate the statistics - time_mean = np.array(time_list).mean(axis=0)/60 - time_std = np.array(time_list).std(axis=0)/60 - loss_mean = np.array(loss_hist_list).mean(axis=0) - loss_std = np.array(loss_hist_list).std(axis=0) + time_mean = t.tensor(time_list).mean(dim=0)/60 + time_std = t.tensor(time_list).std(dim=0)/60 + loss_mean = t.tensor(loss_hist_list).mean(dim=0) + loss_std = t.tensor(loss_hist_list).std(dim=0) # If a single GPU is used, store the time if world_size == 1: @@ -422,29 +504,40 @@ def run_speed_test(world_sizes: int, # Calculate the speed-up relative to using a single GPU speed_up_mean = time_1gpu / time_mean[-1] speed_up_std = speed_up_mean * \ - np.sqrt((std_1gpu/time_1gpu)**2 + (time_std[-1]/time_mean[-1])**2) + t.sqrt((std_1gpu/time_1gpu)**2 + 
+                       (time_std[-1]/time_mean[-1])**2)
+
+        # Store the final losses and speed-ups
+        final_loss_mean_list.append(loss_mean[-1].item())
+        final_loss_std_list.append(loss_std[-1].item())
+        speed_up_mean_list.append(speed_up_mean.item())
+        speed_up_std_list.append(speed_up_std.item())
 
         # Add another plot
-        ax1.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std,
-                     label=f'{world_size} GPUs')
-        ax2.errorbar(np.arange(0,loss_mean.shape[0]), loss_mean, yerr=loss_std,
-                     label=f'{world_size} GPUs')
-        ax3.errorbar(world_size, speed_up_mean, yerr=speed_up_std, fmt='o')
+        if show_plot:
+            ax1.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std,
+                         label=f'{world_size} GPUs')
+            ax2.errorbar(t.arange(0,loss_mean.shape[0]), loss_mean, yerr=loss_std,
+                         label=f'{world_size} GPUs')
+            ax3.errorbar(world_size, speed_up_mean, yerr=speed_up_std, fmt='o')
 
     # Plot
-    fig.suptitle(f'Multi-GPU performance test | {runs} runs performed')
-    ax1.set_yscale('log')
-    ax1.set_xscale('linear')
-    ax2.set_yscale('log')
-    ax2.set_xscale('linear')
-    ax3.set_yscale('linear')
-    ax3.set_xscale('linear')
-    ax1.legend()
-    ax2.legend()
-    ax1.set_xlabel('Time (min)')
-    ax1.set_ylabel('Loss')
-    ax2.set_xlabel('Epochs')
-    ax3.set_xlabel('Number of GPUs')
-    ax3.set_ylabel('Speed-up relative to single GPU')
-    plt.show()
-    print(f'[INFO]: Multi-GPU speed test completed.')
\ No newline at end of file
+    if show_plot:
+        fig.suptitle(f'Multi-GPU performance test | {runs} runs performed')
+        ax1.set_yscale('log')
+        ax1.set_xscale('linear')
+        ax2.set_yscale('log')
+        ax2.set_xscale('linear')
+        ax3.set_yscale('linear')
+        ax3.set_xscale('linear')
+        ax1.legend()
+        ax2.legend()
+        ax1.set_xlabel('Time (min)')
+        ax1.set_ylabel('Loss')
+        ax2.set_xlabel('Epochs')
+        ax3.set_xlabel('Number of GPUs')
+        ax3.set_ylabel('Speed-up relative to single GPU')
+        plt.show()
+
+    print(f'[INFO]: Multi-GPU speed test completed.')
+
+    return final_loss_mean_list, final_loss_std_list, speed_up_mean_list, speed_up_std_list
\ No newline at end
of file From 496e8a3be1d4c9f64c165bb7f919ab7f28dd5dec Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Wed, 16 Jul 2025 20:07:11 +0000 Subject: [PATCH 099/115] Created pytest to assess multi-gpu reconstruction quality --- tests/conftest.py | 39 +++++++-- tests/multi_gpu/multi_gpu_script_quality.py | 44 ++++++++++ tests/multi_gpu/test_multi_gpu.py | 95 +++++++++++++++++++++ 3 files changed, 170 insertions(+), 8 deletions(-) create mode 100644 tests/multi_gpu/multi_gpu_script_quality.py create mode 100644 tests/multi_gpu/test_multi_gpu.py diff --git a/tests/conftest.py b/tests/conftest.py index 781dd9ee..67eb069d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,6 @@ import pytest import datetime - # # # The following few fixtures define some standard data files @@ -32,20 +31,34 @@ def pytest_addoption(parser): default=False, help="run slow tests, primarily full reconstruction tests." ) + parser.addoption( + "--runmultigpu", + action="store_true", + default=False, + help="Runs tests using 2 NVIDIA CUDA GPUs." + ) def pytest_configure(config): config.addinivalue_line("markers", "slow: mark test as slow to run") + config.addinivalue_line("markers", "multigpu: run the multigpu test using 2 NVIDIA GPUs") def pytest_collection_modifyitems(config, items): - if config.getoption("--runslow"): - # --runslow given in cli: do not skip slow tests - return - skip_slow = pytest.mark.skip(reason="need --runslow option to run") - for item in items: - if "slow" in item.keywords: - item.add_marker(skip_slow) + # Skip the slow and/or multigpu tests if --runslow and/or --multigpu + # is given in cli. 
+ + if config.getoption("--runslow") or config.getoption("--runmultigpu"): + + skip_slow = pytest.mark.skip(reason="need --runslow option to run") + skip_multigpu = pytest.mark.skip(reason='need --runmultigpu option to run') + + for item in items: + if "slow" in item.keywords and not config.getoption("--runslow"): + item.add_marker(skip_slow) + + if "multigpu" in item.keywords and not config.getoption("--runmultigpu"): + item.add_marker(skip_multigpu) @pytest.fixture def reconstruction_device(request): @@ -403,3 +416,13 @@ def example_nested_dicts(pytestconfig): } return [test_dict_1, test_dict_2, test_dict_3] + + +@pytest.fixture(scope='module') +def multigpu_script_1(pytestconfig): + return str(pytestconfig.rootpath) + \ + '/tests/multi_gpu/multi_gpu_script_quality.py' + +@pytest.fixture +def mkl_threading_layer(request): + return request.config.getoption("--mkl_threading_layer") \ No newline at end of file diff --git a/tests/multi_gpu/multi_gpu_script_quality.py b/tests/multi_gpu/multi_gpu_script_quality.py new file mode 100644 index 00000000..40e7e16f --- /dev/null +++ b/tests/multi_gpu/multi_gpu_script_quality.py @@ -0,0 +1,44 @@ +import cdtools +import os + +@cdtools.tools.distributed.report_speed_test +def main(): + filename = os.environ.get('CDTOOLS_TESTING_DATA_PATH') + dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + + # FancyPtycho is the workhorse model + model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, # Use 3 incoherently mixing probe modes + oversampling=2, # Simulate the probe on a 2xlarger real-space array + probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix + propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm + units='mm', # Set the units for the live plots + obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix + ) + + device = 'cuda' + model.to(device=device) + dataset.get_as(device=device) + + # Remove or comment out plotting statements + 
for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=40): + if model.rank == 0 and model.epoch % 10: + print(model.report()) + + for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=40): + if model.rank == 0 and model.epoch % 10: + print(model.report()) + + for loss in model.Adam_optimize(25, dataset, lr=0.001, batch_size=40): + if model.rank == 0 and model.epoch % 10: + print(model.report()) + + # This orthogonalizes the recovered probe modes + model.tidy_probes() + + return model + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tests/multi_gpu/test_multi_gpu.py b/tests/multi_gpu/test_multi_gpu.py new file mode 100644 index 00000000..fc33ba5c --- /dev/null +++ b/tests/multi_gpu/test_multi_gpu.py @@ -0,0 +1,95 @@ +import cdtools +import cdtools.tools.distributed as dist +import pytest +import os + +""" +This file contains several tests that are relevant to running +multi-GPU operations in CDTools. + + + +""" + +@pytest.mark.multigpu +def test_reconstruction_quality(lab_ptycho_cxi, + multigpu_script_1, + tmp_path, + show_plot): + """ + Run a multi-GPU test based on fancy_ptycho_speed_test.py and make + sure the final reconstructed loss using 2 GPUs is similar to 1 GPU. + + This test requires us to have 2 NVIDIA GPUs and makes use of the + multi-GPU speed test. + + If this test fails, it indicates that the reconstruction quality + is getting noticably worse with increased GPU counts. This may + be a symptom of a synchronization/broadcasting issue between the + different GPUs. 
+ """ + # Pass the cxi directory to the reconstruction script + os.environ['CDTOOLS_TESTING_DATA_PATH'] = lab_ptycho_cxi + + # Set up and run a distributed speed test + world_sizes = [1, 2] + runs = 5 + file_prefix = 'speed_test' + + # Define a temporary directory + temp_dir = str(tmp_path / "reconstruction") + + results = dist.run_speed_test(world_sizes=world_sizes, + runs=runs, + script_path=multigpu_script_1, + output_dir=temp_dir, + file_prefix=file_prefix, + show_plot=show_plot, + delete_output_files=True) + + # Ensure that both single and 2 GPU results produce losses lower than + # a threshold value of 0.0013. This is the same threshold used in + # test_fancy_ptycho.py + loss_mean = results[0] + assert loss_mean[0] < 0.0013 + assert loss_mean[1] < 0.0013 + + # Check if the two losses are similar to each other by seeing if their + # mean +- standard deviation intervals overlap with each other + loss_std = results[1] + single_gpu_loss_min = loss_mean[0] - loss_std[0] + single_gpu_loss_max = loss_mean[0] + loss_std[0] + multi_gpu_loss_min = loss_mean[1] - loss_std[1] + multi_gpu_loss_max = loss_mean[1] + loss_std[1] + has_overlap_loss = min(single_gpu_loss_max, multi_gpu_loss_max)\ + > max(single_gpu_loss_min, multi_gpu_loss_min) + + print(f'Single GPU final loss: {loss_mean[0]} +- {loss_std[0]}') + print(f'Two GPU final loss: {loss_mean[1]} +- {loss_std[1]}') + print(f'Overlap between the mean +- std of the single/multi GPU losses: {has_overlap_loss}') + + assert has_overlap_loss + + # Also make sure that we actually get some kind of speed up with + # multiple GPUs... 
+ speed_mean = results[2] + speed_std = results[3] + + single_gpu_speed_min = speed_mean[0] - speed_std[0] + single_gpu_speed_max = speed_mean[0] + speed_std[0] + multi_gpu_speed_min = speed_mean[1] - speed_std[1] + multi_gpu_speed_max = speed_mean[1] + speed_std[1] + has_overlap_speed = min(single_gpu_speed_max, multi_gpu_speed_max)\ + > max(single_gpu_speed_min, multi_gpu_speed_min) + + print(f'Single GPU runtime: {speed_mean[0]} +- {speed_std[0]}') + print(f'Two GPU runtime: {speed_mean[1]} +- {speed_std[1]}') + print(f'Overlap between the mean +- std of the single/multi GPU runtimes: {has_overlap_speed}') + + assert speed_mean[0] < speed_mean[1] + assert not has_overlap_speed + + # Clear the environment variable we created here + os.environ.pop('CDTOOLS_TESTING_DATA_PATH') + + From 3cc184eff47c81cb63aa0b7f342d9a9d2731e410 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Wed, 16 Jul 2025 21:50:42 +0000 Subject: [PATCH 100/115] Fixed bug that lets the slow and multigpu tests run without setting the parser options --- tests/conftest.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 32b56a09..5d0dd89a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -49,17 +49,15 @@ def pytest_collection_modifyitems(config, items): # Skip the slow and/or multigpu tests if --runslow and/or --multigpu # is given in cli. 
- if config.getoption("--runslow") or config.getoption("--runmultigpu"): + skip_slow = pytest.mark.skip(reason="need --runslow option to run") + skip_multigpu = pytest.mark.skip(reason='need --runmultigpu option to run') - skip_slow = pytest.mark.skip(reason="need --runslow option to run") - skip_multigpu = pytest.mark.skip(reason='need --runmultigpu option to run') - - for item in items: - if "slow" in item.keywords and not config.getoption("--runslow"): - item.add_marker(skip_slow) - - if "multigpu" in item.keywords and not config.getoption("--runmultigpu"): - item.add_marker(skip_multigpu) + for item in items: + if "slow" in item.keywords and not config.getoption("--runslow"): + item.add_marker(skip_slow) + + if "multigpu" in item.keywords and not config.getoption("--runmultigpu"): + item.add_marker(skip_multigpu) @pytest.fixture @@ -423,7 +421,3 @@ def example_nested_dicts(pytestconfig): def multigpu_script_1(pytestconfig): return str(pytestconfig.rootpath) + \ '/tests/multi_gpu/multi_gpu_script_quality.py' - -@pytest.fixture -def mkl_threading_layer(request): - return request.config.getoption("--mkl_threading_layer") \ No newline at end of file From 919c236d7d1a950e548fc2ee8fac8bb0c3eee4bc Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 17 Jul 2025 01:59:39 +0000 Subject: [PATCH 101/115] Linted and updated documentation on distributed.py --- src/cdtools/tools/distributed/distributed.py | 520 ++++++++++++------- 1 file changed, 323 insertions(+), 197 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 8f9846fb..6790431f 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -5,11 +5,11 @@ each GPU is given identical copies of a model and performs optimization using different parts of the dataset. 
After the parameter gradients are calculated (`loss.backwards()`) on each GPU, the gradients need to be -synchronized and averaged across all participating GPUs. +synchronized and averaged across all participating GPUs. The functions in this module assist with gradient synchronization, setting up conditions necessary to perform distributive computing, and -executing multi-GPU jobs. +executing multi-GPU jobs. """ import torch as t import torch.distributed as dist @@ -30,7 +30,7 @@ MIN_INT64 = t.iinfo(t.int64).min MAX_INT64 = t.iinfo(t.int64).max -__all__ = ['sync_and_avg_gradients', +__all__ = ['sync_and_avg_gradients', 'run_single_to_multi_gpu', 'run_single_gpu_script', 'report_speed_test', @@ -44,11 +44,11 @@ def sync_and_avg_gradients(model: CDIModel): Parameters: model: CDIModel - Model for CDI/ptychography reconstruction + Model for CDI/ptychography reconstruction """ for param in model.parameters(): if param.requires_grad: - dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) + dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) param.grad.data /= model.world_size @@ -59,10 +59,11 @@ def run_single_gpu_script(script_path: str, seed: int = None): """ Wraps single-GPU reconstruction scripts to be ran as a multi-GPU job via - torchrun calls. + torchrun calls. 
- `run_single_gpu_script` is intended to be called in a script - (e.g., cdtools.tools.distributed.single_to_multi_gpu) with the following form: + `run_single_gpu_script` is intended to be called within a script + (e.g., cdtools.tools.distributed.single_to_multi_gpu) with the following + form: ``` # multi_gpu_job.py @@ -73,33 +74,56 @@ def run_single_gpu_script(script_path: str, timeout=30, nccl_p2p_disable=True) ``` - - `torchrun` should then be used to run this script as a distributive job using, - for instance: - + + `torchrun` should then be used to run this script as a single-node, + multi-gpu job through the command line interface using, for instance: + ``` - torchrun --nnodes=1 --nproc_per_node= multi_gpu_job.py + torchrun + --standalone + --nnodes=1 + --nproc_per_node=$nGPUs + multi_gpu_job.py ``` - `torchrun` will spawn a number of subprocesses equal to the number of GPUs specified - (--nproc_per_node). On each subprocess, `run_single_gpu_script` - will set up process groups (lets each GPU communicate with each other) and environment - variables necessary for multi-GPU jobs. The single-GPU script will then be ran by - each subprocess, where gradient synchronization will be faciliated by - `sync_and_avg_gradients` calls from `cdtools.Reconstructors` - while data shuffling/loading is handled by `cdtools.Reconstructor.setup_dataloader`. - - If you want to use specific GPU IDs for reconstructions, you need to set up - the environment variable `CDTOOLS_GPU_IDS` rather than `CUDA_VISIBLE_DEVICES`. - If you wanted to use GPU IDs `1, 3, 4` for example, write: - + the environment variable `CDTOOLS_GPU_IDS` rather than + `CUDA_VISIBLE_DEVICES`. 
If you wanted to use GPU IDs `1, 3, 4` for example,
+    write:
+
     ```
-    CDTOOLS_GPU_IDS=1,3,4 torchrun --nnodes=1 --nproc_per_node=<num GPUs> multi_gpu_job.py
+    CDTOOLS_GPU_IDS=1,3,4 torchrun
+    --standalone
+    --nnodes=1
+    --nproc_per_node=$nGPUs
+    multi_gpu_job.py
     ```
-
-    NOTE: For each subprocess `cdt-torchrun` creates, the environment variable
-    `CUDA_VISIBLE_DEVICES` will be (re)defined as the GPU rank.
+
+    `torchrun` will spawn a number of subprocesses equal to the number of GPUs
+    specified (--nproc_per_node). Each subprocess will run the specified
+    script (e.g., `multi_gpu_job` in the above example) which makes a call to
+    `run_single_gpu_script`.
+
+    `run_single_gpu_script` will first set up a process group (lets the
+    different subprocesses and their respective GPUs communicate with each
+    other) and environment variables necessary for multi-GPU jobs. Afterwards,
+    each subprocess runs the single-GPU reconstruction script (e.g.,
+    `YOUR_RECONSTRUCTION_SCRIPT.py` in the above example). Methods within
+    the `cdtools.Reconstructor` class/subclasses handle gradient
+    synchronization after backpropagation (loss.backward()) as well as
+    distributive data shuffling/loading.
+
+
+    NOTE:
+        1) This method is intended to be called within a subprocess spawned
+           by `torchrun`.
+        2) For each subprocess `torchrun` creates, the environment variable
+           `CUDA_VISIBLE_DEVICES` will be (re)defined based on the GPU rank
+           or the GPU ID list if `CDTOOLS_GPU_IDS` is defined. The
+           environment variable `NCCL_P2P_DISABLE` will also be (re)defined
+           based on `nccl_p2p_disable`.
+        3) This method has only been tested using the `nccl` backend on a
+           single node, with `nccl_p2p_disable` set to `True`.

     Parameters:
         script_name: str
@@ -107,49 +131,63 @@ def run_single_gpu_script(script_path: str,
         If you're using a relative path, make sure the string doesn't start
         with a backslash.
     backend: str
-        Multi-gpu communication backend to use. 
Default is the 'nccl' backend, - which is the only supported backend for CDTools. - See https://pytorch.org/docs/stable/distributed.html for additional info - about PyTorch-supported backends. + Multi-gpu communication backend to use. Default is the 'nccl' + backend, which is the only supported backend for CDTools. + See https://pytorch.org/docs/stable/distributed.html for + additional info about PyTorch-supported backends. timeout: int - Timeout for operations executed against the process group in seconds. - Default is 30 seconds. After timeout has been reached, all subprocesses - will be aborted and the process calling this method will crash. + Timeout for operations executed against the process group in + seconds. Default is 30 seconds. After timeout has been reached, + all subprocesses will be aborted and the process calling this + method will crash. nccl_p2p_disable: bool - Disable NCCL peer-2-peer communication. If you find that all your GPUs - are at 100% usege but the program isn't doing anything, try enabling - this variable. + Disable NCCL peer-2-peer communication. If you find that all your + GPUs are at 100% usage but the program isn't doing anything, try + enabling this variable. seed: int Seed for generating random numbers. - + + Environment variables created/redefined: + `NCCL_P2P_DISABLE`: Enables or disables NCCL peer-to-peer communication + defined by `nccl_p2p_disable`. + `CUDA_VISIBLE_DEVICES`: The GPU IDs visible to each subprocess. For + each subprocess, this variable is set to the GPU ID the subprocess + has been assigned. """ - + # Check if the file path actually exists before starting the process group if not os.path.exists(script_path): - raise FileNotFoundError(f'Cannot open file: {os.path.join(os.getcwd(), script_path)}') - - # Enable/disable NCCL peer-to-peer communication. The boolean needs to be converted into - # a string for the environment variable. 
+ raise FileNotFoundError('Cannot open file: ' + + f'{os.path.join(os.getcwd(), script_path)}') + + # Enable/disable NCCL peer-to-peer communication. The boolean needs to be + # converted into a string for the environment variable. os.environ['NCCL_P2P_DISABLE'] = str(int(nccl_p2p_disable)) - - ########## Force each subprocess to see only the GPU ID we assign it ########### - # Why am I doing this? If this constraint is not imposed, then calling all_reduce will - # cause all subprocess Ranks to occupy memory on both their own respective GPUs (normal) - # as well as Rank 0's GPU (not intended behavior). The root cause is not entirely clear - # but there are two ways to avoid this empirically: - # 1) Force each subprocesses' CUDA_VISIBLE_DEVICE to be their assigned GPU ids. - # 2) Within the reconstruction script, change `device='cuda'` to `device=f'cuda{model.rank}'` - # - # Option 1 is chosen here to use single-GPU reconstruction scripts AS-IS for multi-GPU jobs. - - # The GPU rank and world_size is visible as an environment variable through torchrun calls. + + """Force each subprocess to see only the GPU ID we assign it + Why do this? If this constraint is not imposed, then calling all_reduce + will cause all subprocess Ranks to occupy memory on both their own + respective GPUs (normal) as well as Rank 0's GPU (not intended behavior). + The root cause is not entirely clear but there are two ways to avoid + this behavior empirically: + 1) Force each subprocesses' CUDA_VISIBLE_DEVICE to be their assigned + GPU ids. + 2) Within the reconstruction script, change `device='cuda'` to + `device=f'cuda{model.rank}'` + + Option 1 is chosen here to use single-GPU reconstruction scripts AS-IS + for multi-GPU jobs. + """ + # The GPU rank and world_size is visible as an environment variable + # through torchrun calls. 
rank = int(os.environ.get('RANK')) world_size = int(os.environ.get('WORLD_SIZE')) # If the CDTOOLS_GPU_IDS environment variable is defined, then assign based - # on the GPU IDS provided in that list. Otherwise, use the rank for the GPU ID. + # on the GPU IDS provided in that list. Otherwise, use the rank for the + # GPU ID. gpu_ids = os.environ.get('CDTOOLS_GPU_IDS') - + if gpu_ids is None: gpu_id = rank else: @@ -157,80 +195,92 @@ def run_single_gpu_script(script_path: str, os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) - ################################ Run the script ################################# - if rank == 0: - print(f'[INFO]: Starting up multi-GPU reconstructions with {world_size} GPUs.') - - # Start up the process group (lets the different subprocesses talk with each other) - dist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=timeout)) - - try: - # Force all subprocesses to either use the pre-specified or Rank 0's RNG seed + print('[INFO]: Starting up multi-GPU reconstructions with ' + + f'{world_size} GPUs.') + + # Start up the process group (lets the different subprocesses talk with + # each other) + dist.init_process_group(backend=backend, + timeout=datetime.timedelta(seconds=timeout)) + + # Run the script + try: + # Force all subprocesses to either use the pre-specified or Rank 0's + # RNG seed if seed is None: - seed_local = t.tensor(random.randint(MIN_INT64, MAX_INT64), device='cuda', dtype=t.int64) + seed_local = t.tensor(random.randint(MIN_INT64, MAX_INT64), + device='cuda', + dtype=t.int64) dist.broadcast(seed_local, 0) seed = seed_local.item() t.manual_seed(seed) - # Run the single-GPU reconstruction script runpy.run_path(script_path, run_name='__main__') - # Let the user know the job is done if rank == 0: - print(f'[INFO]: Reconstructions complete. Terminating process group.') + print('[INFO]: Reconstructions complete.' 
+ + ' Terminating process group.') finally: - # Kill the process group - dist.destroy_process_group() + dist.destroy_process_group() if rank == 0: - print(f'[INFO]: Process group terminated. Multi-GPU job complete.') + print('[INFO]: Process group terminated. Multi-GPU job complete.') def run_single_to_multi_gpu(): """ - Runs a single-GPU reconstruction script as a single-node multi-GPU job via torchrun. - - This function can be executed as a python console script as `cdt-torchrun` and - serves as a wrapper over a `torchrun` call to `cdtools.tools.distributed.single_to_multi_gpu`. - - In the simplest case, a single-GPU script can be ran as a multi-GPU job using - the following `cdt-torchrun` call in the command line + Runs a single-GPU reconstruction script as a single-node multi-GPU job via + torchrun. + + This convienience function can be executed as a python console script as + `cdt-torchrun` and serves as a wrapper over a `torchrun` call to + `cdtools.tools.distributed.single_to_multi_gpu`. + + In the simplest case, a reconstruction script can be ran as a multi-GPU job + (with `nGPU` number of GPUs) using the following `cdt-torchrun` call in + the command line interface: ``` - cdt-torchrun --ngpus= YOUR_RECONSTRUCTION_SCRIPT.py + cdt-torchrun + --ngpus= + YOUR_RECONSTRUCTION_SCRIPT.py ``` - + which is equivalent to the following `torchrun` call ``` - torchrun - --standalone - --nnodes=1 - --nproc_per_node= + torchrun + --standalone + --nnodes=1 + --nproc_per_node=$nGPUs -m cdtools.tools.distributed.single_to_multi_gpu - --backend='nccl' + --backend=nccl --timeout=30 --nccl_p2p_disable=1 YOUR_RECONSTRUCTION_SCRIPT.py ``` - - With a single node (--nnodes=1), `cdt-torchrun` will launch a given number of subprocesses - equivalent to the number of GPUs specified. This number must be less than or equal to the - actual number of GPUs available on your node. 
+ + Within a single node, `cdt-torchrun` will launch a given number of + subprocesses equivalent to the number of GPUs specified. This number must + be less than or equal to the actual number of GPUs available on your node. If you want to use specific GPU IDs for reconstructions, you need to set up - the environment variable `CDTOOLS_GPU_IDS` rather than `CUDA_VISIBLE_DEVICES`. - If you wanted to use GPU IDs `1, 3, 4` for example, write: - + the environment variable `CDTOOLS_GPU_IDS` rather than + `CUDA_VISIBLE_DEVICES`. If you wanted to use GPU IDs `1, 3, 4` for example, + write: + ``` - CDTOOLS_GPU_IDS=1,3,4 cdt-torchrun --ngpus=3 YOUR_RECONSTRUCTION_SCRIPT.py + CDTOOLS_GPU_IDS=1,3,4 cdt-torchrun + --ngpus=3 + YOUR_RECONSTRUCTION_SCRIPT.py ``` - If additional `torchrun` arguments need to be passed, you may need to make a direct - `torchrun` call rather than use `cdt-torchrun`. You may also submit an issue/PR. + If additional `torchrun` arguments need to be passed, consider making + a direct `torchrun` call rather than use `cdt-torchrun`. - NOTE: `cdt-torchrun` has only been tested using the 'nccl' backend, NCCL peer-to-peer communication - disabled, and using 1 node. + NOTE: + 1) This method has only been tested using the `nccl` backend on a + single node, with `nccl_p2p_disable` set to `True`. Arguments: script_path: str @@ -238,78 +288,105 @@ def run_single_to_multi_gpu(): --ngpus: int Number of GPUs to use. --nnodes: int - Optional, number of nodes. Default 1; more than 1 nodes has not been tested. + Optional, number of nodes. Default 1; more than 1 nodes has not + been tested. --backend: str - Optional, communication backend for distributed computing (either `nccl` or `gloo`). + Optional, communication backend for distributed computing (either + `nccl` or `gloo`). Default is `nccl` --timeout: int - Optional, time in seconds before the distributed process is killed. + Optional, time in seconds before the distributed process is killed. Default is 30 seconds. 
--nccl_p2p_disable: int - Optional, disable (1) or enable (0) NCCL peer-to-peer communication. Default - is 1. - + Optional, disable (1) or enable (0) NCCL peer-to-peer + communication. Default is 1. + """ # Define the arguments we need to pass to dist.script_wrapper parser = argparse.ArgumentParser() parser.add_argument('--ngpus', type=int, - help='Number of GPUs to use (called --nproc_per_node in torchrun)') - parser.add_argument('--nnodes', - type=str, + help='Number of GPUs to use.') + parser.add_argument('--nnodes', + type=str, default=1, - help='Number of nodes participating in distributive computing.') - parser.add_argument('--backend', - type=str, + help='Number of participating nodes.') + parser.add_argument('--backend', + type=str, default='nccl', choices=['nccl', 'gloo'], help='Communication backend (nccl or gloo)') - parser.add_argument('--timeout', - type=int, + parser.add_argument('--timeout', + type=int, default=30, help='Time before process is killed in seconds') - parser.add_argument('--nccl_p2p_disable', - type=int, + parser.add_argument('--nccl_p2p_disable', + type=int, default=1, - choices=[0,1], - help='Disable (1) or enable (0) NCCL peer-to-peer communication') - parser.add_argument('script_path', - type=str, - help='Single GPU script file name (with or without .py extension)') - + choices=[0, 1], + help='Disable (1) or enable (0) NCCL peer-to-peer' + + 'communication') + parser.add_argument('script_path', + type=str, + help='Single GPU script file name (with or without ' + + '.py extension)') + # Get the arguments args = parser.parse_args() - + # Perform the torchrun call of the wrapped function - subprocess.run(['torchrun', # We set up the torchrun arguments first - '--standalone', # Indicates that we're running a single machine, multiple GPU job. 
- f'--nnodes={args.nnodes}', - f'--nproc_per_node={args.ngpus}', + subprocess.run(['torchrun', + '--standalone', + f'--nnodes={args.nnodes}', + f'--nproc_per_node={args.ngpus}', '-m', - 'cdtools.tools.distributed.single_to_multi_gpu', + 'cdtools.tools.distributed.single_to_multi_gpu', f'--backend={args.backend}', f'--timeout={args.timeout}', f'--nccl_p2p_disable={args.nccl_p2p_disable}', f'{args.script_path}']) - + def report_speed_test(func: Callable): """ - Decorator function which saves the loss-versus-time/epoch history of a - function-wrapped reconstruction script to a pickle dump file. This function - is intended to be used for multi-GPU test studies performed with - `cdtools.tools.distributed.run_speed_test`, which sets several environment + Decorator function which saves the loss-versus-time/epoch history of a + reconstruction script as a pickle dump file in a specified directory. + + The entire reconstruction script (excluding import statements) must + be wrapped by a function which returns the model. The script must also + have an if-name-main block to call the wrapped script. + + This decorator is intended to only be used by reconstruction scripts + that are called by `run_speed_test` to conduct multi-GPU performance + studies (loss-versus-time/epoch and runtime speed-ups) using `N` GPUs + and `M` trials per GPU count. `run_speed_test` sets several environment variables specifing the name and directory of the result files to be saved. - - If the directory specified by `CDTOOLS_SPEED_TEST_RESULT_DIR` does not exist, - one will be created in the current directory. Parameters: func: Callable The entire reconstruction script wrapped in a function. Within the script, the function must be called with an if-name-main statement. Additionally, the function must return the reconstructed model. + + Expected environment variables: + `CDTOOLS_TRIAL_NUMBER`: The test trial number + `CDTOOLS_SPEED_TEST_RESULTS_DIR`: Directory to save the pickle dump + file. 
+ `CDTOOLS_SPEED_TEST_PREFIX`: Prefix of the pickle dump file name. + + Outputs in the pickle dump file: + study_dict: dict + Results of the `N` GPU `M`-th trial run. Contains the following + key-value pairs: + `study_dict['loss history']`: List[np.float32] + Loss values as a function of epoch + `study_dict['time history']`: List[float] + Time recorded at each epoch in seconds + `study_dict['nGPUs']`: int + Number of GPUs used + `study_dict['trial']`: int + Trial number """ def wrapper(): # Figure out how to name the save file and where to save it to @@ -324,25 +401,30 @@ def wrapper(): # Save the model and loss history, but only using the rank 0 process if model.rank == 0: # Set up the file name: - file_name = f'{file_prefix}_nGPUs_{model.world_size}_TRIAL_{trial_number}.pkl' + file_name = f'{file_prefix}_nGPUs_{model.world_size}_' +\ + f'TRIAL_{trial_number}.pkl' # Grab the loss and time history loss_history = model.loss_history time_history = model.loss_times + # Store quantities in a dictionary - dict = {'loss history':loss_history, - 'time history':time_history, - 'nGPUs':model.world_size, - 'trial':trial_number} - + study_dict = {'loss history': loss_history, + 'time history': time_history, + 'nGPUs': model.world_size, + 'trial': trial_number} + print(type(loss_history[0])) + print(type(time_history[0])) + print(type(model.world_size)) + print(type(trial_number)) # Save the quantities - with open (os.path.join(output_dir, file_name), 'wb') as save_file: - pickle.dump(dict, save_file) - - print(f'[INFO]: Saved results to: {file_name}') + with open(os.path.join(output_dir, file_name), 'wb') as save_file: + pickle.dump(study_dict, save_file) + + print(f'[INFO]: Saved results to: {file_name}') return wrapper -def run_speed_test(world_sizes: int, +def run_speed_test(world_sizes: List[int], runs: int, script_path: str, output_dir: str, @@ -354,27 +436,65 @@ def run_speed_test(world_sizes: int, timeout: int = 30, nccl_p2p_disable: bool = True, seed: int = None - ) 
-> Tuple[List[float], - List[float], - List[float], + ) -> Tuple[List[float], + List[float], + List[float], List[float]]: """ - Executes a reconstruction script using `n` GPUs and `m` trials per GPU count using - `torchrun` and `cdtools.tools.distributed.single_to_multi_gpu`. - - If the directory specified by `output_dir` does not exist, - one will be created in the current directory. + Executes a reconstruction script using `world_sizes` GPUs and `runs` + trials per GPU count using `torchrun` and + `cdtools.tools.distributed.single_to_multi_gpu`. + + `run_speed_test` requires the tested reconstruction script to be wrapped + in a function, which returns the reconstructed model, along with a + if-name-main block which calls the function. The function needs to be + decorated with `report_speed_test`. + + The speed test (specifically, `report_speed_test`) will generate pickle + dump files named `_nGPUs__TRIAL_.pkl` + at the directory `output_dir` (see documentation for `report_speed_test` + for the file content). If `output_dir` does not exist, one will be created + in the current directory. + + After each trial, the contents of the dump file are read and stored by + `run_speed_test` to calculate the mean and standard deviation of the + loss-versus-epoch/time and runtime speedup data over the `runs` trials + executed. If `delete_output_files` is enabled, then the pickle dump files + will be deleted after they have been read. 
+ + `run_speed_test` executes the following in a subprocess to run + single/multi-GPU jobs + ``` + torchrun + --standalone + --nnodes=$NNODES + --nproc_per_node=$WORLD_SIZE + -m + cdtools.tools.distributed.single_to_multi_gpu + --backend=$BACKEND + --timeout=$TIMEOUT + --nccl_p2p_disable=$NCCL_P2P_DISABLE + YOUR_RECONSTRUCTION_SCRIPT.py + ``` + and provides the following environment variables to the child environment + that are necessary for the pickle dump files to be generated by + `report_speed_test`: + `CDTOOLS_TRIAL_NUMBER`: The test trial number + `CDTOOLS_SPEED_TEST_RESULTS_DIR`: Directory to save the pickle dump + file. + `CDTOOLS_SPEED_TEST_PREFIX`: Prefix of the pickle dump file name. Parameters: - world_sizes: list[int] - Number of GPUs to use. User can specify several GPU counts in a list. - But the first entry must be 1 (single-GPU). + world_sizes: List[int] + Number of GPUs to use. User can specify several GPU counts in a + list. But the first entry must be 1 (single-GPU). runs: int How many repeat reconstructions to perform script_path: str Path of the single-gpu reconstruction script. output_dir: str - Directory of the loss-vs-time/epoch data generated for the speed test. + Directory of the loss-vs-time/epoch data generated for the speed + test. file_prefix: str Prefix of the speed test result file names show_plot: bool @@ -383,23 +503,25 @@ def run_speed_test(world_sizes: int, Removes the results files produced by `report_speed_test` from the output_dir after each trail run. nnodes: int - Number of nodes to use. This module has only been tested with 1 node. + Number of nodes to use. This module has only been tested with 1 + node. backend: str Communication backend for distributive computing. NVidia Collective - Communications Library ('nccl') is the default and only tested option. - See https://docs.pytorch.org/docs/stable/distributed.html for other - backends supported by pytorch (but have not been tested in this package). 
+ Communications Library ('nccl') is the default and only tested + option. See https://docs.pytorch.org/docs/stable/distributed.html + for other backends supported by pytorch (but have not been tested + in this package). timeout: int - Timeout for operations to be executed in seconds. All processes will be - aborted after the timeout has been exceeded. + Timeout for operations to be executed in seconds. All processes + will be aborted after the timeout has been exceeded. nccl_p2p_disable: bool Sets the `NCCL_P2P_DISABLE` environment variable to enable/disable nccl peer-to-peer communication. If you find that all your GPUs - are at 100% usage but the program isn't doing anything, try enabling - this variable. + are at 100% usage but the program isn't doing anything, try + enabling this variable. seed: int - Seed for generating random numbers. Default is None (seed is randomly - generated). + Seed for generating random numbers. Default is None (seed is + randomly generated). Returns: final_loss_mean_list: List[float] @@ -409,19 +531,20 @@ def run_speed_test(world_sizes: int, Standard deviation of the final loss value over `runs` iterations for each `world_size`. speed_up_mean_list: List[float] - Mean runtime speed-up over `runs` iterations for each `world_size` - value specified. Speed-up is defined as the `runtime_nGPUs / runtime_1_GPU`. + Mean runtime speed-up over `runs` iterations for each `world_size` + value specified. Speed-up is defined as the + `runtime_nGPUs / runtime_1_GPU`. speed_up_std_list: List[float] Standard deviation of the runtime speed-up over `runs` iterations for each `world_size`. 
""" - + # Make sure the directory exists; or else create it Path(output_dir).mkdir(parents=False, exist_ok=True) # Set stuff up for plots if show_plot: - fig, (ax1,ax2,ax3) = plt.subplots(1,3) + fig, (ax1, ax2, ax3) = plt.subplots(1, 3) # Store the value of the single GPU time time_1gpu = 0 @@ -440,33 +563,33 @@ def run_speed_test(world_sizes: int, loss_hist_list = [] for i in range(runs): - print(f'[INFO]: Resetting the model...') + print('[INFO]: Resetting the model...') print(f'[INFO]: Starting run {i+1}/{runs} on {world_size} GPU(s)') # The scripts running speed tests need to read the trial number - # they are on using. We send this information using environment + # they are on using. We send this information using environment # variables sent to the child processes spawned by subprocess.run child_env = os.environ.copy() child_env['CDTOOLS_TRIAL_NUMBER'] = str(i) child_env['CDTOOLS_SPEED_TEST_RESULTS_DIR'] = output_dir child_env['CDTOOLS_SPEED_TEST_PREFIX'] = file_prefix - # Set up the terminal commands - cmd = ['torchrun', # We set up the torchrun arguments first - '--standalone', # Indicates that we're running a single machine, multiple GPU job. - f'--nnodes={nnodes}', - f'--nproc_per_node={world_size}', + # Set up the terminal commands for a single-node, multi-GPU job + cmd = ['torchrun', + '--standalone', + f'--nnodes={nnodes}', + f'--nproc_per_node={world_size}', '-m', - 'cdtools.tools.distributed.single_to_multi_gpu', + 'cdtools.tools.distributed.single_to_multi_gpu', f'--backend={backend}', f'--timeout={timeout}', f'--nccl_p2p_disable={int(nccl_p2p_disable)}'] - + if seed is not None: - cmd.append(f'--seed={seed}') + cmd.append(f'--seed={seed}') cmd.append(f'{script_path}') - + # Run the single/multi-GPU job try: subprocess.run(cmd, check=True, env=child_env) @@ -476,8 +599,10 @@ def run_speed_test(world_sizes: int, # Load the loss results print('[INFO]: Reconstruction complete. 
Loading loss results...') - - save_path = os.path.join(output_dir, f'{file_prefix}_nGPUs_{world_size}_TRIAL_{i}.pkl') + + save_path = os.path.join(output_dir, + f'{file_prefix}_nGPUs_{world_size}_' + + f'TRIAL_{i}.pkl') with open(save_path, 'rb') as f: results = pickle.load(f) @@ -489,7 +614,7 @@ def run_speed_test(world_sizes: int, if delete_output_files: print(f'[INFO]: Removing {save_path}') os.remove(save_path) - + # Calculate the statistics time_mean = t.tensor(time_list).mean(dim=0)/60 time_std = t.tensor(time_list).std(dim=0)/60 @@ -502,10 +627,10 @@ def run_speed_test(world_sizes: int, std_1gpu = time_std[-1] # Calculate the speed-up relative to using a single GPU - speed_up_mean = time_1gpu / time_mean[-1] + speed_up_mean = time_1gpu / time_mean[-1] speed_up_std = speed_up_mean * \ t.sqrt((std_1gpu/time_1gpu)**2 + (time_std[-1]/time_mean[-1])**2) - + # Store the final lossess and speed-ups final_loss_mean_list.append(loss_mean[-1].item()) final_loss_std_list.append(loss_std[-1].item()) @@ -515,11 +640,11 @@ def run_speed_test(world_sizes: int, # Add another plot if show_plot: ax1.errorbar(time_mean, loss_mean, yerr=loss_std, xerr=time_std, - label=f'{world_size} GPUs') - ax2.errorbar(t.arange(0,loss_mean.shape[0]), loss_mean, yerr=loss_std, - label=f'{world_size} GPUs') + label=f'{world_size} GPUs') + ax2.errorbar(t.arange(0, loss_mean.shape[0]), loss_mean, + yerr=loss_std, label=f'{world_size} GPUs') ax3.errorbar(world_size, speed_up_mean, yerr=speed_up_std, fmt='o') - + # Plot if show_plot: fig.suptitle(f'Multi-GPU performance test | {runs} runs performed') @@ -537,7 +662,8 @@ def run_speed_test(world_sizes: int, ax3.set_xlabel('Number of GPUs') ax3.set_ylabel('Speed-up relative to single GPU') plt.show() - - print(f'[INFO]: Multi-GPU speed test completed.') - return final_loss_mean_list, final_loss_std_list, speed_up_mean_list, speed_up_std_list \ No newline at end of file + print('[INFO]: Multi-GPU speed test completed.') + + return 
(final_loss_mean_list, final_loss_std_list, + speed_up_mean_list, speed_up_std_list) From 5ab3ab635f87e4166b2dbbf2eeef9d1a8007bc47 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 17 Jul 2025 02:11:20 +0000 Subject: [PATCH 102/115] Got rid of some print statements from distributed.py --- src/cdtools/tools/distributed/distributed.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 6790431f..9e8d11d8 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -412,10 +412,6 @@ def wrapper(): 'time history': time_history, 'nGPUs': model.world_size, 'trial': trial_number} - print(type(loss_history[0])) - print(type(time_history[0])) - print(type(model.world_size)) - print(type(trial_number)) # Save the quantities with open(os.path.join(output_dir, file_name), 'wb') as save_file: pickle.dump(study_dict, save_file) From 5de4e373be7da40c6eb1e1efa0f71b39e5806b6c Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 17 Jul 2025 03:59:17 +0000 Subject: [PATCH 103/115] Linted Reconstructors --- src/cdtools/reconstructors/adam.py | 85 +++++++------ src/cdtools/reconstructors/base.py | 189 +++++++++++++++------------- src/cdtools/reconstructors/lbfgs.py | 66 +++++----- src/cdtools/reconstructors/sgd.py | 66 +++++----- 4 files changed, 219 insertions(+), 187 deletions(-) diff --git a/src/cdtools/reconstructors/adam.py b/src/cdtools/reconstructors/adam.py index c4d2e463..5a489a06 100644 --- a/src/cdtools/reconstructors/adam.py +++ b/src/cdtools/reconstructors/adam.py @@ -7,11 +7,6 @@ the 'training' of a model given some dataset and optimizer. 
""" import torch as t -from torch.utils import data as torchdata -from torch.utils.data.distributed import DistributedSampler -from scipy import io -from contextlib import contextmanager -from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.models import CDIModel from typing import Tuple, List, Union @@ -19,29 +14,32 @@ __all__ = ['Adam'] + class Adam(Reconstructor): """ - The Adam Reconstructor subclass handles the optimization ('reconstruction') of - ptychographic models and datasets using the Adam optimizer. + The Adam Reconstructor subclass handles the optimization ('reconstruction') + of ptychographic models and datasets using the Adam optimizer. Parameters ---------- model: CDIModel - Model for CDI/ptychography reconstruction + Model for CDI/ptychography reconstruction. dataset: Ptycho2DDataset - The dataset to reconstruct against + The dataset to reconstruct against. subset : list(int) or int - Optional, a pattern index or list of pattern indices to use + Optional, a pattern index or list of pattern indices to use. schedule : bool - Optional, create a learning rate scheduler (torch.optim.lr_scheduler._LRScheduler) + Optional, create a learning rate scheduler + (torch.optim.lr_scheduler._LRScheduler). Important attributes: - **model** -- Always points to the core model used. - **optimizer** -- This class by default uses `torch.optim.Adam` to perform - optimizations. - - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the `optimize` method. - - **data_loader** -- A torch.utils.data.DataLoader that is defined by calling the - `setup_dataloader` method. + optimizations. + - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the + `optimize` method. + - **data_loader** -- A torch.utils.data.DataLoader that is defined by + calling the `setup_dataloader` method. 
""" def __init__(self, model: CDIModel, @@ -49,11 +47,10 @@ def __init__(self, subset: List[int] = None): super().__init__(model, dataset, subset) - + # Define the optimizer for use in this subclass self.optimizer = t.optim.Adam(self.model.parameters()) - - + def adjust_optimizer(self, lr: int = 0.005, betas: Tuple[float] = (0.9, 0.999), @@ -64,19 +61,19 @@ def adjust_optimizer(self, Parameters ---------- lr : float - Optional, The learning rate (alpha) to use. Default is 0.005. 0.05 is - typically the highest possible value with any chance of being stable + Optional, The learning rate (alpha) to use. Default is 0.005. 0.05 + is typically the highest possible value with any chance of being + stable. betas : tuple Optional, the beta_1 and beta_2 to use. Default is (0.9, 0.999). amsgrad : bool - Optional, whether to use the AMSGrad variant of this algorithm + Optional, whether to use the AMSGrad variant of this algorithm. """ for param_group in self.optimizer.param_groups: param_group['lr'] = lr param_group['betas'] = betas param_group['amsgrad'] = amsgrad - def optimize(self, iterations: int, batch_size: int = 15, @@ -92,7 +89,7 @@ def optimize(self, Runs a round of reconstruction using the Adam optimizer Formerly `CDIModel.Adam_optimize` - + This calls the Reconstructor.optimize superclass method (formerly `CDIModel.AD_optimize`) to run a round of reconstruction once the dataloader and optimizer hyperparameters have been @@ -101,27 +98,30 @@ def optimize(self, Parameters ---------- iterations : int - How many epochs of the algorithm to run + How many epochs of the algorithm to run. batch_size : int - Optional, the size of the minibatches to use + Optional, the size of the minibatches to use. lr : float - Optional, The learning rate (alpha) to use. Default is 0.005. 0.05 is - typically the highest possible value with any chance of being stable + Optional, The learning rate (alpha) to use. Default is 0.005. 
0.05 + is typically the highest possible value with any chance of being + stable. betas : tuple Optional, the beta_1 and beta_2 to use. Default is (0.9, 0.999). schedule : bool - Optional, create a learning rate scheduler (torch.optim.lr_scheduler._LRScheduler) + Optional, create a learning rate scheduler + (torch.optim.lr_scheduler._LRScheduler). amsgrad : bool - Optional, whether to use the AMSGrad variant of this algorithm + Optional, whether to use the AMSGrad variant of this algorithm. regularization_factor : float or list(float) - Optional, if the model has a regularizer defined, the set of parameters to pass - the regularizer method + Optional, if the model has a regularizer defined, the set of + parameters to pass the regularizer method. thread : bool - Default True, whether to run the computation in a separate thread to allow - interaction with plots during computation + Default True, whether to run the computation in a separate thread + to allow interaction with plots during computation. calculation_width : int - Default 10, how many translations to pass through at once for each round of - gradient accumulation. Does not affect the result, only the calculation speed + Default 10, how many translations to pass through at once for each + round of gradient accumulation. Does not affect the result, only + the calculation speed. shuffle : bool Optional, enable/disable shuffling of the dataset. This option is intended for diagnostic purposes and should be left as True. 
@@ -138,15 +138,18 @@ def optimize(self, # 2) Set up / re-initialize the data laoder self.setup_dataloader(batch_size=batch_size, shuffle=shuffle) - # 3) The optimizer is created in self.__init__, but the + # 3) The optimizer is created in self.__init__, but the # hyperparameters need to be set up with self.adjust_optimizer - self.adjust_optimizer(lr, betas, amsgrad) + self.adjust_optimizer(lr=lr, + betas=betas, + amsgrad=amsgrad) # 4) Set up the scheduler if schedule: - self.scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, - factor=0.2, - threshold=1e-9) + self.scheduler = \ + t.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, + factor=0.2, + threshold=1e-9) else: self.scheduler = None @@ -154,4 +157,4 @@ def optimize(self, return super(Adam, self).optimize(iterations, regularization_factor, thread, - calculation_width) \ No newline at end of file + calculation_width) diff --git a/src/cdtools/reconstructors/base.py b/src/cdtools/reconstructors/base.py index cca04883..ec5e198a 100644 --- a/src/cdtools/reconstructors/base.py +++ b/src/cdtools/reconstructors/base.py @@ -10,26 +10,25 @@ """ import torch as t -from torch.utils import data as torchdata +from torch.utils import data as td from torch.utils.data.distributed import DistributedSampler import torch.distributed as dist import threading import queue import time -from contextlib import contextmanager -from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch from cdtools.datasets import CDataset from cdtools.models import CDIModel import cdtools.tools.distributed as cdtdist -from typing import Tuple, List, Union +from typing import List, Union __all__ = ['Reconstructor'] + class Reconstructor: """ Reconstructor handles the optimization ('reconstruction') of ptychographic models given a CDIModel (or subclass) and corresponding CDataset. - + This is a base model that defines all functions Reconstructor subclasses must implement. 
@@ -44,11 +43,12 @@ class Reconstructor: Important attributes: - **model** -- Always points to the core model used. - - **optimizer** -- A `torch.optim.Optimizer` that must be defined when initializing the - Reconstructor subclass. - - **scheduler** -- A `torch.optim.lr_scheduler` that may be defined during the `optimize` method. - - **data_loader** -- A torch.utils.data.DataLoader that is defined by calling the - `setup_dataloader` method. + - **optimizer** -- A `torch.optim.Optimizer` that must be defined when + initializing the Reconstructor subclass. + - **scheduler** -- A `torch.optim.lr_scheduler` that may be defined during + the `optimize` method. + - **data_loader** -- A torch.utils.data.DataLoader that is defined by + calling the `setup_dataloader` method. """ def __init__(self, model: CDIModel, @@ -58,27 +58,26 @@ def __init__(self, self.subset = subset # Initialize attributes that must be defined by the subclasses - self.optimizer = None # Defined in the __init__ of the subclass as a torch.optim.Optimizer - self.scheduler = None # Defined as a torch.optim.lr_scheduler - self.data_loader = None # Defined as a torch.utils.data.DataLoader in the setup_dataloader method - + self.optimizer = None + self.scheduler = None + self.data_loader = None + # Store the original model self.model = model # Store the dataset if subset is not None: # if subset is just one pattern, turn into a list for convenience - if type(subset) == type(1): + if isinstance(subset, int): subset = [subset] - dataset = torchdata.Subset(dataset, subset) + dataset = td.Subset(dataset, subset) self.dataset = dataset - - def setup_dataloader(self, + def setup_dataloader(self, batch_size: int = None, shuffle: bool = True): """ - Sets up / re-initializes the dataloader. + Sets up / re-initializes the dataloader. Parameters ---------- @@ -89,37 +88,40 @@ def setup_dataloader(self, is intended for diagnostic purposes and should be left as True. 
""" if self.model.multi_gpu_used: - # First, create a sampler to load subsets of dataset to the GPUs - self.sampler = DistributedSampler(self.dataset, - num_replicas=self.model.world_size, - rank=self.model.rank, - shuffle=shuffle, - drop_last=False) - # Now create the dataloader - self.data_loader = torchdata.DataLoader(self.dataset, - batch_size=batch_size//self.model.world_size, - num_workers=0, # Creating extra threads in children processes may cause problems. Leave this at 0. - drop_last=False, - pin_memory=False, - sampler=self.sampler) + self.sampler = \ + DistributedSampler(self.dataset, + num_replicas=self.model.world_size, + rank=self.model.rank, + shuffle=shuffle, + drop_last=False) + + # Creating extra threads in children processes may cause problems. + # Leave num_workers at 0. + self.data_loader = \ + td.DataLoader(self.dataset, + batch_size=batch_size//self.model.world_size, + num_workers=0, + drop_last=False, + pin_memory=False, + sampler=self.sampler) else: if batch_size is not None: - self.data_loader = torchdata.DataLoader(self.dataset, - batch_size=batch_size, - shuffle=shuffle) + self.data_loader = td.DataLoader(self.dataset, + batch_size=batch_size, + shuffle=shuffle) else: - self.data_loader = torchdata.Dataloader(self.dataset) - + self.data_loader = td.Dataloader(self.dataset) def adjust_optimizer(self, **kwargs): """ Change hyperparameters for the utilized optimizer. - For each optimizer, the keyword arguments should be manually defined as parameters. + For each optimizer, the keyword arguments should be manually defined + as parameters. """ raise NotImplementedError() - def _run_epoch(self, + def _run_epoch(self, stop_event: threading.Event = None, regularization_factor: Union[float, List[float]] = None, calculation_width: int = 10): @@ -133,22 +135,23 @@ def _run_epoch(self, Default None, causes the reconstruction to stop when an exception occurs in Optimizer.optimize. 
regularization_factor : float or list(float) - Optional, if the model has a regularizer defined, the set of + Optional, if the model has a regularizer defined, the set of parameters to pass the regularizer method calculation_width : int - Default 10, how many translations to pass through at once for each - round of gradient accumulation. This does not affect the result, but - may affect the calculation speed. + Default 10, how many translations to pass through at once for each + round of gradient accumulation. This does not affect the result, + but may affect the calculation speed. - Yields + Returns ------ loss : float - The summed loss over the latest epoch, divided by the total diffraction - pattern intensity + The summed loss over the latest epoch, divided by the total + diffraction pattern intensity """ - # If we're using DistributedSampler (i.e., multi-GPU useage), we need to - # tell it which epoch we're on. Otherwise data shuffling will not work properly - if self.model.multi_gpu_used: + # If we're using DistributedSampler (i.e., multi-GPU useage), we need + # to tell it which epoch we're on. 
Otherwise data shuffling will not + # work properly + if self.model.multi_gpu_used: self.data_loader.sampler.set_epoch(self.model.epoch) # Initialize some tracking variables @@ -162,6 +165,7 @@ def _run_epoch(self, for inputs, patterns in self.data_loader: normalization += t.sum(patterns).cpu().numpy() N += 1 + def closure(): self.optimizer.zero_grad() @@ -170,56 +174,62 @@ def closure(): # on the GPU at once, while still doing batch processing # for efficiency input_chunks = [[inp[i:i + calculation_width] - for inp in inputs] + for inp in inputs] for i in range(0, len(inputs[0]), - calculation_width)] + calculation_width)] pattern_chunks = [patterns[i:i + calculation_width] - for i in range(0, len(inputs[0]), - calculation_width)] + for i in range(0, len(inputs[0]), + calculation_width)] total_loss = 0 + for inp, pats in zip(input_chunks, pattern_chunks): # This check allows for graceful exit when threading if stop_event is not None and stop_event.is_set(): exit() # Run the simulation - sim_patterns = self.model.forward(*inp) + sim_patterns = self.model.forward(*inp) # Calculate the loss if hasattr(self, 'mask'): - loss = self.model.loss(pats,sim_patterns, mask=self.model.mask) + loss = self.model.loss(pats, + sim_patterns, + mask=self.model.mask) else: - loss = self.model.loss(pats,sim_patterns) + loss = self.model.loss(pats, + sim_patterns) # And accumulate the gradients loss.backward() - # For multi-GPU, average and sync the gradients + losses across all - # participating GPUs with an all-reduce call. Also sum the losses. + # For multi-GPU, average and sync the gradients + losses + # across all participating GPUs. Also sum the losses. 
if self.model.multi_gpu_used: cdtdist.sync_and_avg_gradients(self.model) - dist.all_reduce(loss, op=dist.ReduceOp.SUM) + dist.all_reduce(loss, op=dist.ReduceOp.SUM) - # Normalize the accumulating total loss by the numer of GPUs used + # Normalize the accumulating total loss total_loss += loss.detach() // self.model.world_size # If we have a regularizer, we can calculate it separately, # and the gradients will add to the minibatch gradient - if regularization_factor is not None and hasattr(self.model, 'regularizer'): + if regularization_factor is not None \ + and hasattr(self.model, 'regularizer'): + loss = self.model.regularizer(regularization_factor) loss.backward() - # For multi-GPU optimization, average and sync the gradients + - # losses across all participating GPUs with an all-reduce call. + # For multi-GPU optimization, average and sync the + # gradients + losses across all participating GPUs. if self.model.multi_gpu_used: cdtdist.sync_and_avg_gradients(self.model) - + return total_loss # This takes the step for this minibatch loss += self.optimizer.step(closure).detach().cpu().numpy() - + loss /= normalization # We step the scheduler after the full epoch @@ -233,7 +243,6 @@ def closure(): self.model.training_history += self.model.report() + '\n' return loss - def optimize(self, iterations: int, regularization_factor: Union[float, List[float]] = None, @@ -241,7 +250,7 @@ def optimize(self, calculation_width: int = 10): """ Runs a round of reconstruction using the provided optimizer - + Formerly CDIModel.AD_optimize This is the basic automatic differentiation reconstruction tool @@ -250,32 +259,39 @@ def optimize(self, the specified number of iterations. By default, the computation will be run in a separate thread. This - is done to enable live plotting with matplotlib during a reconstruction. + is done to enable live plotting with matplotlib during a + reconstruction. + If the computation was done in the main thread, this would freeze the plots. 
This behavior can be turned off by setting the keyword - argument 'thread' to False. + argument 'thread' to False. Parameters ---------- iterations : int - How many epochs of the algorithm to run + How many epochs of the algorithm to run. regularization_factor : float or list(float) - Optional, if the model has a regularizer defined, the set of parameters to pass the regularizer method + Optional, if the model has a regularizer defined, the set of + parameters to pass the regularizer method. thread : bool - Default True, whether to run the computation in a separate thread to allow interaction with plots during computation + Default True, whether to run the computation in a separate thread + to allow interaction with plots during computation. calculation_width : int - Default 10, how many translations to pass through at once for each round of gradient accumulation. This does not affect the result, but may affect the calculation speed. + Default 10, how many translations to pass through at once for each + round of gradient accumulation. This does not affect the result, + but may affect the calculation speed. Yields ------ loss : float - The summed loss over the latest epoch, divided by the total diffraction pattern intensity + The summed loss over the latest epoch, divided by the total + diffraction pattern intensity. 
""" # We store the current optimizer as a model parameter so that # it can be saved and loaded for checkpointing self.current_optimizer = self.optimizer - + # If we don't want to run in a different thread, this is easy if not thread: for it in range(iterations): @@ -287,19 +303,21 @@ def optimize(self, yield float('nan') continue - yield self._run_epoch(regularization_factor=regularization_factor, + yield self._run_epoch(regularization_factor=regularization_factor, # noqa calculation_width=calculation_width) - + # But if we do want to thread, it's annoying: else: # Here we set up the communication with the computation thread result_queue = queue.Queue() stop_event = threading.Event() + def target(): try: - result_queue.put(self._run_epoch(stop_event=stop_event, - regularization_factor=regularization_factor, - calculation_width=calculation_width)) + result_queue.put( + self._run_epoch(stop_event=stop_event, + regularization_factor=regularization_factor, # noqa + calculation_width=calculation_width)) except Exception as e: # If something bad happens, put the exception into the # result queue @@ -308,14 +326,16 @@ def target(): # And this actually starts and monitors the thread for it in range(iterations): if self.model.skip_computation(): - self.model.epoch = self.model.epoch + 1 + self.model.epoch = self.model.epoch + 1 if len(self.model.loss_history) >= 1: yield self.model.loss_history[-1] else: yield float('nan') continue - calc = threading.Thread(target=target, name='calculator', daemon=True) + calc = threading.Thread(target=target, + name='calculator', + daemon=True) try: calc.start() while calc.is_alive(): @@ -326,7 +346,8 @@ def target(): except KeyboardInterrupt as e: stop_event.set() - print('\nAsking execution thread to stop cleanly - please be patient.') + print('\nAsking execution thread to stop cleanly - ' + + 'please be patient.') calc.join() raise e @@ -339,4 +360,4 @@ def target(): yield res # And finally, we unset the current optimizer: - 
self.current_optimizer = None \ No newline at end of file + self.current_optimizer = None diff --git a/src/cdtools/reconstructors/lbfgs.py b/src/cdtools/reconstructors/lbfgs.py index 7a42e372..0b51dfde 100644 --- a/src/cdtools/reconstructors/lbfgs.py +++ b/src/cdtools/reconstructors/lbfgs.py @@ -9,34 +9,38 @@ import torch as t from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.models import CDIModel -from typing import Tuple, List, Union +from typing import List, Union from cdtools.reconstructors import Reconstructor __all__ = ['LBFGS'] + class LBFGS(Reconstructor): """ - The LBFGS Reconstructor subclass handles the optimization ('reconstruction') of - ptychographic models and datasets using the LBFGS optimizer. + The LBFGS Reconstructor subclass handles the optimization + ('reconstruction') of ptychographic models and datasets using the LBFGS + optimizer. Parameters ---------- model: CDIModel - Model for CDI/ptychography reconstruction + Model for CDI/ptychography reconstruction. dataset: Ptycho2DDataset - The dataset to reconstruct against + The dataset to reconstruct against. subset : list(int) or int - Optional, a pattern index or list of pattern indices to use + Optional, a pattern index or list of pattern indices to use. schedule : bool - Optional, create a learning rate scheduler (torch.optim.lr_scheduler._LRScheduler) + Optional, create a learning rate scheduler + (torch.optim.lr_scheduler._LRScheduler). Important attributes: - **model** -- Always points to the core model used. - - **optimizer** -- This class by default uses `torch.optim.LBFGS` to perform - optimizations. - - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the `optimize` method. - - **data_loader** -- A torch.utils.data.DataLoader that is defined by calling the - `setup_dataloader` method. + - **optimizer** -- This class by default uses `torch.optim.LBFGS` to + perform optimizations. 
+ - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during + the `optimize` method. + - **data_loader** -- A torch.utils.data.DataLoader that is defined by + calling the `setup_dataloader` method. """ def __init__(self, model: CDIModel, @@ -44,11 +48,10 @@ def __init__(self, subset: List[int] = None): super().__init__(model, dataset, subset) - + # Define the optimizer for use in this subclass self.optimizer = t.optim.LBFGS(self.model.parameters()) - - + def adjust_optimizer(self, lr: int = 0.005, history_size: int = 2, @@ -59,8 +62,9 @@ def adjust_optimizer(self, Parameters ---------- lr : float - Optional, The learning rate (alpha) to use. Default is 0.005. 0.05 is - typically the highest possible value with any chance of being stable + Optional, The learning rate (alpha) to use. Default is 0.005. 0.05 + is typically the highest possible value with any chance of being + stable. history_size : int Optional, the length of the history to use. line_search_fn : str @@ -71,7 +75,6 @@ def adjust_optimizer(self, param_group['history_size'] = history_size param_group['line_search_fn'] = line_search_fn - def optimize(self, iterations: int, lr: float = 0.1, @@ -84,7 +87,7 @@ def optimize(self, Runs a round of reconstruction using the LBFGS optimizer Formerly `CDIModel.LBFGS_optimize` - + This algorithm is often less stable that Adam, however in certain situations or geometries it can be shockingly efficient. Like all the other optimization routines, it is defined as a generator @@ -96,20 +99,21 @@ def optimize(self, Parameters ---------- iterations : int - How many epochs of the algorithm to run + How many epochs of the algorithm to run. lr : float - Optional, The learning rate (alpha) to use. Default is 0.1. + Optional, The learning rate (alpha) to use. Default is 0.1. history_size : int Optional, the length of the history to use. 
regularization_factor : float or list(float) - Optional, if the model has a regularizer defined, the set of parameters to pass - the regularizer method + Optional, if the model has a regularizer defined, the set of + parameters to pass the regularizer method. thread : bool - Default True, whether to run the computation in a separate thread to allow - interaction with plots during computation + Default True, whether to run the computation in a separate thread + to allow interaction with plots during computation. calculation_width : int - Default 10, how many translations to pass through at once for each round of - gradient accumulation. Does not affect the result, only the calculation speed + Default 10, how many translations to pass through at once for each + round of gradient accumulation. Does not affect the result, only + the calculation speed. """ # 1) The subset statement is contained in Reconstructor.__init__ @@ -117,12 +121,14 @@ def optimize(self, # all the data at once. self.setup_dataloader(batch_size=len(self.dataset)) - # 3) The optimizer is created in self.__init__, but the + # 3) The optimizer is created in self.__init__, but the # hyperparameters need to be set up with self.adjust_optimizer - self.adjust_optimizer(lr=lr, history_size=history_size, line_search_fn=line_search_fn) + self.adjust_optimizer(lr=lr, + history_size=history_size, + line_search_fn=line_search_fn) # 4) This is analagous to making a call to CDIModel.AD_optimize return super(LBFGS, self).optimize(iterations, regularization_factor, thread, - calculation_width) \ No newline at end of file + calculation_width) diff --git a/src/cdtools/reconstructors/sgd.py b/src/cdtools/reconstructors/sgd.py index bdd3c195..f2dd7b0c 100644 --- a/src/cdtools/reconstructors/sgd.py +++ b/src/cdtools/reconstructors/sgd.py @@ -9,32 +9,34 @@ import torch as t from cdtools.datasets.ptycho_2d_dataset import Ptycho2DDataset from cdtools.models import CDIModel -from typing import Tuple, List, Union +from 
typing import List, Union from cdtools.reconstructors import Reconstructor __all__ = ['SGD'] + class SGD(Reconstructor): """ - The Adam Reconstructor subclass handles the optimization ('reconstruction') of - ptychographic models and datasets using the Adam optimizer. + The Adam Reconstructor subclass handles the optimization ('reconstruction') + of ptychographic models and datasets using the Adam optimizer. Parameters ---------- model: CDIModel - Model for CDI/ptychography reconstruction + Model for CDI/ptychography reconstruction. dataset: Ptycho2DDataset - The dataset to reconstruct against + The dataset to reconstruct against. subset : list(int) or int - Optional, a pattern index or list of pattern indices to use + Optional, a pattern index or list of pattern indices to use. Important attributes: - **model** -- Always points to the core model used. - **optimizer** -- This class by default uses `torch.optim.Adam` to perform - optimizations. - - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the `optimize` method. - - **data_loader** -- A torch.utils.data.DataLoader that is defined by calling the - `setup_dataloader` method. + optimizations. + - **scheduler** -- A `torch.optim.lr_scheduler` that is defined during the + `optimize` method. + - **data_loader** -- A torch.utils.data.DataLoader that is defined by + calling the `setup_dataloader` method. """ def __init__(self, model: CDIModel, @@ -42,11 +44,10 @@ def __init__(self, subset: List[int] = None): super().__init__(model, dataset, subset) - + # Define the optimizer for use in this subclass self.optimizer = t.optim.SGD(self.model.parameters()) - - + def adjust_optimizer(self, lr: int = 0.005, momentum: float = 0, @@ -59,17 +60,18 @@ def adjust_optimizer(self, Parameters ---------- lr : float - Optional, The learning rate (alpha) to use. Default is 0.005. 0.05 is - typically the highest possible value with any chance of being stable + Optional, The learning rate (alpha) to use. 
Default is 0.005. 0.05 + is typically the highest possible value with any chance of being + stable. momentum : float Optional, the length of the history to use. dampening : float - Optional, dampening for the momentum + Optional, dampening for the momentum. weight_decay : float - Optional, weight decay (L2 penalty) + Optional, weight decay (L2 penalty). nesterov : bool - Optional, enables Nesterov momentum. Only applicable when momentum - is non-zero. + Optional, enables Nesterov momentum. Only applicable when momentum + is non-zero. """ for param_group in self.optimizer.param_groups: param_group['lr'] = lr @@ -78,7 +80,6 @@ def adjust_optimizer(self, param_group['weight_decay'] = weight_decay param_group['nesterov'] = nesterov - def optimize(self, iterations: int, batch_size: int = None, @@ -95,7 +96,7 @@ def optimize(self, Runs a round of reconstruction using the Adam optimizer Formerly `CDIModel.Adam_optimize` - + This calls the Reconstructor.optimize superclass method (formerly `CDIModel.AD_optimize`) to run a round of reconstruction once the dataloader and optimizer hyperparameters have been @@ -106,7 +107,7 @@ def optimize(self, iterations : int How many epochs of the algorithm to run. batch_size : int - Optional, the size of the minibatches to use. + Optional, the size of the minibatches to use. lr : float Optional, The learning rate to use. The default is 2e-7. momentum : float @@ -116,17 +117,18 @@ def optimize(self, weight_decay : float Optional, weight decay (L2 penalty). nesterov : bool - Optional, enables Nesterov momentum. Only applicable when momentum - is non-zero. + Optional, enables Nesterov momentum. Only applicable when momentum + is non-zero. regularization_factor : float or list(float) - Optional, if the model has a regularizer defined, the set of parameters to pass - the regularizer method. + Optional, if the model has a regularizer defined, the set of + parameters to pass the regularizer method. 
thread : bool - Default True, whether to run the computation in a separate thread to allow - interaction with plots during computation. + Default True, whether to run the computation in a separate thread + to allow interaction with plots during computation. calculation_width : int - Default 10, how many translations to pass through at once for each round of - gradient accumulation. Does not affect the result, only the calculation speed. + Default 10, how many translations to pass through at once for each + round of gradient accumulation. Does not affect the result, only + the calculation speed. shuffle : bool Optional, enable/disable shuffling of the dataset. This option is intended for diagnostic purposes and should be left as True. @@ -140,7 +142,7 @@ def optimize(self, # Use default torch dataloader parameters self.setup_dataloader(batch_size=1, shuffle=False) - # 3) The optimizer is created in self.__init__, but the + # 3) The optimizer is created in self.__init__, but the # hyperparameters need to be set up with self.adjust_optimizer self.adjust_optimizer(lr=lr, momentum=momentum, @@ -152,4 +154,4 @@ def optimize(self, return super(SGD, self).optimize(iterations, regularization_factor, thread, - calculation_width) \ No newline at end of file + calculation_width) From be6abcda004b00fc4640aa3a0c23fac39ce4935d Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 17 Jul 2025 04:11:01 +0000 Subject: [PATCH 104/115] Linted single_to_multi_gpu.py --- .../tools/distributed/single_to_multi_gpu.py | 58 ++++++++++--------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/cdtools/tools/distributed/single_to_multi_gpu.py b/src/cdtools/tools/distributed/single_to_multi_gpu.py index 25f565b7..9b8dbd99 100644 --- a/src/cdtools/tools/distributed/single_to_multi_gpu.py +++ b/src/cdtools/tools/distributed/single_to_multi_gpu.py @@ -1,48 +1,52 @@ """ -A wrapper script to run single-GPU reconstruction scripts as -a multi-GPU job when called by torchrun. 
+A wrapper script to run single-GPU reconstruction scripts as a multi-GPU job +when called by torchrun. -This script is intended to be called by torchrun. It is set -up so that the group process handling (init and destroy), -definition of several environmental variables, and actual -execution of the single-GPU script are handled by a single -call to dist.script_wrapper. +This script is intended to be called by torchrun. It is set up so that the +group process handling (init and destroy), definition of several environmental +variables, and actual execution of the single-GPU script are handled by a +single call to dist.run_single_gpu_script. -For example, if we have the reconstruction script `reconstruct.py` and want to use -4 GPUs, we can write the following: +For example, if we have the reconstruction script `reconstruct.py` and want to +use 4 GPUs, we can write the following: ``` -torchrun --nnodes=1 --nproc_per_node=4 single-to-multi-gpu.py --script_path=reconstruct.py +torchrun + --nnodes=1 + --nproc_per_node=4 + single-to-multi-gpu.py + --script_path=reconstruct.py ``` """ import cdtools.tools.distributed as dist import argparse + def get_args(): # Define the arguments we need to pass to dist.script_wrapper parser = argparse.ArgumentParser() - parser.add_argument('--backend', - type=str, + parser.add_argument('--backend', + type=str, default='nccl', choices=['nccl', 'gloo'], help='Communication backend (nccl or gloo)') - parser.add_argument('--timeout', - type=int, + parser.add_argument('--timeout', + type=int, default=30, help='Time before process is killed in seconds') - parser.add_argument('--nccl_p2p_disable', - type=int, + parser.add_argument('--nccl_p2p_disable', + type=int, default=1, - choices=[0,1], - help='Disable (1) or enable (0) NCCL peer-to-peer communication') + choices=[0, 1], + help='Disable (1) or enable (0) NCCL peer-to-peer communication') # noqa parser.add_argument('--seed', type=int, default=None, help='Sets the RNG seed for all devices') 
- parser.add_argument('script_path', - type=str, - help='Single GPU script file name (with or without .py extension)') - + parser.add_argument('script_path', + type=str, + help='Single GPU script file name') + return parser.parse_args() @@ -51,11 +55,11 @@ def main(): args = get_args() # Pass arguments to dist.script_wrapper dist.run_single_gpu_script(script_path=args.script_path, - backend=args.backend, - timeout=args.timeout, - nccl_p2p_disable=bool(args.nccl_p2p_disable), - seed=args.seed) + backend=args.backend, + timeout=args.timeout, + nccl_p2p_disable=bool(args.nccl_p2p_disable), + seed=args.seed) if __name__ == '__main__': - main() \ No newline at end of file + main() From d4ec9ee68b9e33e7b698716603d2c8512b722b78 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 17 Jul 2025 04:35:20 +0000 Subject: [PATCH 105/115] Linted test_reconstructors.py --- tests/test_reconstructors.py | 65 ++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/tests/test_reconstructors.py b/tests/test_reconstructors.py index 983b4a31..ed148b2b 100644 --- a/tests/test_reconstructors.py +++ b/tests/test_reconstructors.py @@ -4,26 +4,27 @@ import numpy as np from copy import deepcopy + @pytest.mark.slow def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): """ This test checks out several things with the Au particle dataset - 1) Calls to Reconstructor.adjust_optimizer is updating the hyperparameters + 1) Calls to Reconstructor.adjust_optimizer is updating the + hyperparameters 2) We are only using the single-GPU dataloading method - 3) Ensure `recon.model` points to the original `model` - 4) Reconstructions performed by `Adam.optimize` and `model.Adam_optimize` - calls produce identical results. - 5) The quality of the reconstruction remains below a specified threshold. 
+ 3) Ensure `recon.model` points to the original `model` + 4) Reconstructions performed by `Adam.optimize` and + `model.Adam_optimize` calls produce identical results. + 5) The quality of the reconstruction remains below a specified + threshold. """ - print('\nTesting performance on the standard gold balls dataset with reconstructors.Adam') - - # Setup dataset and model - dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(gold_ball_cxi) + print('\nTesting performance on the standard gold balls dataset ' + + 'with reconstructors.Adam') + dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(gold_ball_cxi) pad = 10 dataset.pad(pad) - model = cdtools.models.FancyPtycho.from_dataset( dataset, n_modes=3, @@ -33,25 +34,24 @@ def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): probe_fourier_crop=pad ) - model.translation_offsets.data += 0.7 * t.randn_like(model.translation_offsets) + model.translation_offsets.data += 0.7 * \ + t.randn_like(model.translation_offsets) model.weights.requires_grad = False # Make a copy of the model model_recon = deepcopy(model) - # Load models and datasets to devices model.to(device=reconstruction_device) model_recon.to(device=reconstruction_device) dataset.get_as(device=reconstruction_device) - # Make sure that we're not going to perform reconstructions on the same model + # Make sure that we're not going to perform reconstructions on the same + # model assert id(model_recon) != id(model) - ######### Reconstructions with cdtools.reconstructors.Adam.optimize ######### - ############################################################################# - - print('Running reconstruction using cdtools.reconstructors.Adam.optimize on provided reconstruction_device,', - reconstruction_device) + # ******* Reconstructions with cdtools.reconstructors.Adam.optimize ******* + print('Running reconstruction using cdtools.reconstructors.Adam.optimize' + + ' on provided reconstruction_device,', reconstruction_device) recon = 
cdtools.reconstructors.Adam(model=model_recon, dataset=dataset) t.manual_seed(0) @@ -65,7 +65,8 @@ def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): assert recon.optimizer.param_groups[0]['lr'] == 0.005 assert recon.data_loader.batch_size == 50 - # Test 2: Ensure that recon does not have sampler as an attribute (for multi-GPU) + # Test 2: Ensure that recon does not have sampler as an attribute + # (for multi-GPU) assert not hasattr(recon, 'sampler') for loss in recon.optimize(50, lr=0.002, batch_size=100): @@ -79,19 +80,19 @@ def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): assert recon.data_loader.batch_size == 100 for loss in recon.optimize(100, lr=0.001, batch_size=100, - schedule=True): + schedule=True): print(model_recon.report()) if show_plot and model_recon.epoch % 10 == 0: model_recon.inspect(dataset) - + # Test 1c: Ensure that the Adam.optimizer.param_groups learning rate and # batch size got updated assert recon.optimizer.param_groups[0]['lr'] == 0.001 assert recon.data_loader.batch_size == 100 - # Test 3: Ensure recon.model points to the original model - assert id(model_recon) == id(recon.model) - + # Test 3: Ensure recon.model points to the original model + assert id(model_recon) == id(recon.model) + model_recon.tidy_probes() if show_plot: @@ -100,10 +101,9 @@ def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): loss_recon = model_recon.loss_history[-1] - ############ Reconstructions with cdtools.CDIModel.Adam_optimize ############ - ############################################################################# - print('Running reconstruction using CDIModel.Adam_optimize on provided reconstruction_device,', - reconstruction_device) + # ******* Reconstructions with cdtools.CDIModel.Adam_optimize ******* + print('Running reconstruction using CDIModel.Adam_optimize on provided' + + ' reconstruction_device,', reconstruction_device) t.manual_seed(0) for loss in model.Adam_optimize(20, dataset, 
lr=0.005, batch_size=50): @@ -121,7 +121,7 @@ def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): print(model.report()) if show_plot and model.epoch % 10 == 0: model.inspect(dataset) - + model.tidy_probes() if show_plot: @@ -134,7 +134,8 @@ def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): assert np.allclose(loss_recon, loss_model) # Test 5: Ensure reconstructions have reached a certain loss tolerance - # This just comes from running a reconstruction when it was working well - # and choosing a rough value. If it triggers this assertion error, - # something changed to make the final quality worse! + # This just comes from running a reconstruction when it was + # working well and choosing a rough value. If it triggers this + # assertion error, something changed to make the final quality + # worse! assert loss_model < 0.0001 From 27f2d54bbc25def213f46452f6999beef78abb73 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 17 Jul 2025 04:36:48 +0000 Subject: [PATCH 106/115] Linted multi gpu tests and test scripts --- tests/multi_gpu/multi_gpu_script_quality.py | 20 +++++------ tests/multi_gpu/test_multi_gpu.py | 38 ++++++++++----------- 2 files changed, 26 insertions(+), 32 deletions(-) diff --git a/tests/multi_gpu/multi_gpu_script_quality.py b/tests/multi_gpu/multi_gpu_script_quality.py index 40e7e16f..a66d0ee0 100644 --- a/tests/multi_gpu/multi_gpu_script_quality.py +++ b/tests/multi_gpu/multi_gpu_script_quality.py @@ -1,6 +1,7 @@ import cdtools import os + @cdtools.tools.distributed.report_speed_test def main(): filename = os.environ.get('CDTOOLS_TESTING_DATA_PATH') @@ -9,36 +10,31 @@ def main(): # FancyPtycho is the workhorse model model = cdtools.models.FancyPtycho.from_dataset( dataset, - n_modes=3, # Use 3 incoherently mixing probe modes - oversampling=2, # Simulate the probe on a 2xlarger real-space array - probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix - propagation_distance=5e-3, # Propagate 
the initial probe guess by 5 mm - units='mm', # Set the units for the live plots - obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix + n_modes=3, + oversampling=2, + probe_support_radius=120, + propagation_distance=5e-3, + units='mm', + obj_view_crop=-50, ) device = 'cuda' model.to(device=device) dataset.get_as(device=device) - # Remove or comment out plotting statements for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=40): if model.rank == 0 and model.epoch % 10: print(model.report()) - for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=40): if model.rank == 0 and model.epoch % 10: print(model.report()) - for loss in model.Adam_optimize(25, dataset, lr=0.001, batch_size=40): if model.rank == 0 and model.epoch % 10: print(model.report()) - # This orthogonalizes the recovered probe modes model.tidy_probes() - return model if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/tests/multi_gpu/test_multi_gpu.py b/tests/multi_gpu/test_multi_gpu.py index fc33ba5c..389cc5bb 100644 --- a/tests/multi_gpu/test_multi_gpu.py +++ b/tests/multi_gpu/test_multi_gpu.py @@ -1,32 +1,28 @@ -import cdtools import cdtools.tools.distributed as dist import pytest import os """ -This file contains several tests that are relevant to running -multi-GPU operations in CDTools. - - - +This file contains several tests that are relevant to running multi-GPU +operations in CDTools. """ + @pytest.mark.multigpu def test_reconstruction_quality(lab_ptycho_cxi, multigpu_script_1, tmp_path, show_plot): """ - Run a multi-GPU test based on fancy_ptycho_speed_test.py and make + Run a multi-GPU test based on fancy_ptycho_speed_test.py and make sure the final reconstructed loss using 2 GPUs is similar to 1 GPU. This test requires us to have 2 NVIDIA GPUs and makes use of the multi-GPU speed test. - If this test fails, it indicates that the reconstruction quality - is getting noticably worse with increased GPU counts. 
This may - be a symptom of a synchronization/broadcasting issue between the - different GPUs. + If this test fails, it indicates that the reconstruction quality is + getting noticably worse with increased GPU counts. This may be a symptom + of a synchronization/broadcasting issue between the different GPUs. """ # Pass the cxi directory to the reconstruction script os.environ['CDTOOLS_TESTING_DATA_PATH'] = lab_ptycho_cxi @@ -61,16 +57,18 @@ def test_reconstruction_quality(lab_ptycho_cxi, single_gpu_loss_max = loss_mean[0] + loss_std[0] multi_gpu_loss_min = loss_mean[1] - loss_std[1] multi_gpu_loss_max = loss_mean[1] + loss_std[1] - has_overlap_loss = min(single_gpu_loss_max, multi_gpu_loss_max)\ - > max(single_gpu_loss_min, multi_gpu_loss_min) + has_overlap_loss = \ + min(single_gpu_loss_max, multi_gpu_loss_max)\ + > max(single_gpu_loss_min, multi_gpu_loss_min) print(f'Single GPU final loss: {loss_mean[0]} +- {loss_std[0]}') print(f'Two GPU final loss: {loss_mean[1]} +- {loss_std[1]}') - print(f'Overlap between the mean +- std of the single/multi GPU losses: {has_overlap_loss}') + print('Overlap between mean +- std of the single/multi GPU losses: ' + + f'{has_overlap_loss}') assert has_overlap_loss - # Also make sure that we actually get some kind of speed up with + # Also make sure that we actually get some kind of speed up with # multiple GPUs... 
speed_mean = results[2] speed_std = results[3] @@ -79,17 +77,17 @@ def test_reconstruction_quality(lab_ptycho_cxi, single_gpu_speed_max = speed_mean[0] + speed_std[0] multi_gpu_speed_min = speed_mean[1] - speed_std[1] multi_gpu_speed_max = speed_mean[1] + speed_std[1] - has_overlap_speed = min(single_gpu_speed_max, multi_gpu_speed_max)\ - > max(single_gpu_speed_min, multi_gpu_speed_min) + has_overlap_speed = \ + min(single_gpu_speed_max, multi_gpu_speed_max)\ + > max(single_gpu_speed_min, multi_gpu_speed_min) print(f'Single GPU runtime: {speed_mean[0]} +- {speed_std[0]}') print(f'Two GPU runtime: {speed_mean[1]} +- {speed_std[1]}') - print(f'Overlap between the mean +- std of the single/multi GPU runtimes: {has_overlap_speed}') + print('Overlap between the mean +- std of the single/multi GPU runtimes: ' + + f'{has_overlap_speed}') assert speed_mean[0] < speed_mean[1] assert not has_overlap_speed # Clear the environment variable we created here os.environ.pop('CDTOOLS_TESTING_DATA_PATH') - - From 976e94ca45b4183a27ab0e8a9ccc02b9e2eff33e Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 17 Jul 2025 17:37:08 +0000 Subject: [PATCH 107/115] Linted and cleaned up the distributed speed test examples --- examples/distributed_speed_test.py | 85 ++++++++++++++++------------- examples/fancy_ptycho_speed_test.py | 29 ++++++---- 2 files changed, 65 insertions(+), 49 deletions(-) diff --git a/examples/distributed_speed_test.py b/examples/distributed_speed_test.py index 581bbc4c..f759b336 100644 --- a/examples/distributed_speed_test.py +++ b/examples/distributed_speed_test.py @@ -1,39 +1,46 @@ -'''This is a testing script to study how the reconstruction speed -and convergence rate scales with the number of GPUs utilized. - -The test is set up so that you can run n-trials for each number of GPUs -you want to study and plot statistics of loss-versus-time as a function -of GPU counts. - -This test is based on fancy_ptycho_multi_gpu_ddp.py and fancy_ptycho.py. 
- -''' - -import cdtools.tools.distributed as dist - -# This will execute the multi_gpu_reconstruct upon running this file -if __name__ == '__main__': - # Define the number of GPUs to use. - world_sizes = [1,2] - - # How many reconstruction runs to perform for statistics - runs = 3 - - # Define where the single-GPU script is located - script_path = 'fancy_ptycho_speed_test.py'] - - # Define where the loss-vs-time data is being stored in - output_dir = 'example_loss_data4' - - # Define what prefix you want on the file - file_prefix = 'speed_test' - - # Run the test - results = dist.run_speed_test(world_sizes=world_sizes, - runs=runs, - script_path=script_path, - output_dir=output_dir, - file_prefix=file_prefix, - show_plot=True, - delete_output_files=True) - \ No newline at end of file +from cdtools.tools.distributed import run_speed_test + +# Define the number of GPUs to use for the test. We always need to include +# a single GPU in the test. +# +# Here, we will run trials with 1 and 2 GPUs. +world_sizes = [1, 2] + +# We will run 3 trials per GPU to collect statistics on loss-versus-epoch/time +# data as well as runtime speedup. +runs = 3 + +# We will perform a speed test on a reconstruction script modified to run +# a speed test (see fancy_ptycho_speed_test.py) +script_path = 'fancy_ptycho_speed_test.py' + +# When we run the modified script with the speed test, a pickle dump file +# will be generated after each trial. The file contains data about loss-vs-time +# measured for the trial with one or several GPUs used. +output_dir = 'example_loss_data' + +# Define the file name prefix. The file will have the following name: +# `_nGPUs__TRIAL_.pkl` +file_prefix = 'speed_test' + +# We can plot several curves showing what the loss-versus/epoch curves look +# like for each GPU count. The plot will also show the relative runtime +# speed-up relative to the single-GPU runtime. 
+show_plot = True + +# We can also delete the pickle dump files after each trial run has been +# completed and stored by `run_speed_test` +delete_output_file = True + +# Run the test. This speed test will return several lists containing the +# means and standard deviations of the final recorded losses and runtime +# speed ups calculated over several trial runs. Each entry index maps to +# the GPU count specified by `world_sizes`. +final_loss_mean, final_loss_std, speed_up_mean, speed_up_std = \ + run_speed_test(world_sizes=world_sizes, + runs=runs, + script_path=script_path, + output_dir=output_dir, + file_prefix=file_prefix, + show_plot=show_plot, + delete_output_files=delete_output_file) diff --git a/examples/fancy_ptycho_speed_test.py b/examples/fancy_ptycho_speed_test.py index 7ba9f954..94c870dd 100644 --- a/examples/fancy_ptycho_speed_test.py +++ b/examples/fancy_ptycho_speed_test.py @@ -1,27 +1,36 @@ import cdtools + +# To modify fancy_ptycho.py for a multi-GPU speed test, we need to enclose the +# entire reconstruction script in a function. The function then needs to be +# decorated with cdtools.tools.distributed.report_speed_test. The decorator +# allows data to be saved and read by the multi-GPU speed test function +# which we will use to run this script. 
@cdtools.tools.distributed.report_speed_test def main(): filename = 'example_data/lab_ptycho_data.cxi' dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) - # FancyPtycho is the workhorse model model = cdtools.models.FancyPtycho.from_dataset( dataset, - n_modes=3, # Use 3 incoherently mixing probe modes - oversampling=2, # Simulate the probe on a 2xlarger real-space array - probe_support_radius=120, # Force the probe to 0 outside a radius of 120 pix - propagation_distance=5e-3, # Propagate the initial probe guess by 5 mm - units='mm', # Set the units for the live plots - obj_view_crop=-50, # Expands the field of view in the object plot by 50 pix + n_modes=3, + oversampling=2, + probe_support_radius=120, + propagation_distance=5e-3, + units='mm', + obj_view_crop=-50 ) device = 'cuda' model.to(device=device) dataset.get_as(device=device) - # Remove or comment out plotting statements + # Remove or comment out plotting existing plotting statements for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=40): + # Optional: ensure that only a single GPU prints a report by + # adding an if statement. Without this, the print statement will + # be called by all participating GPUs, resulting in multiple copies + # of the printed model report. if model.rank == 0: print(model.report()) @@ -33,12 +42,12 @@ def main(): if model.rank == 0: print(model.report()) - # This orthogonalizes the recovered probe modes model.tidy_probes() + # We need to return the model so the data can be saved by the decorator. 
return model +# We also need to include this if-name-main block at the end if __name__ == '__main__': main() - From 7e850fed30e6279d177265e19de6c9924bd297a7 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 17 Jul 2025 18:33:51 +0000 Subject: [PATCH 108/115] Added LBFGS and RPI pytest --- tests/conftest.py | 12 +++++ tests/test_reconstructors.py | 94 +++++++++++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5d0dd89a..b973e99e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -392,6 +392,18 @@ def lab_ptycho_cxi(pytestconfig): '/examples/example_data/lab_ptycho_data.cxi' +@pytest.fixture(scope='module') +def optical_data_ss_cxi(pytestconfig): + return str(pytestconfig.rootpath) + \ + '/examples/example_data/Optical_Data_ss.cxi' + + +@pytest.fixture(scope='module') +def optical_ptycho_incoherent_pickle(pytestconfig): + return str(pytestconfig.rootpath) + \ + '/examples/example_data/Optical_ptycho_incoherent.pickle' + + @pytest.fixture(scope='module') def example_nested_dicts(pytestconfig): example_tensor = t.as_tensor(np.array([1, 4.5, 7])) diff --git a/tests/test_reconstructors.py b/tests/test_reconstructors.py index ed148b2b..0213cb10 100644 --- a/tests/test_reconstructors.py +++ b/tests/test_reconstructors.py @@ -2,9 +2,10 @@ import cdtools import torch as t import numpy as np +import pickle +from matplotlib import pyplot as plt from copy import deepcopy - @pytest.mark.slow def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): """ @@ -139,3 +140,94 @@ def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): # assertion error, something changed to make the final quality # worse! 
assert loss_model < 0.0001 + + +@pytest.mark.slow +def test_LBFGS_RPI(optical_data_ss_cxi, + optical_ptycho_incoherent_pickle, + reconstruction_device, + show_plot): + """ + This test checks out several things with the transmission RPI dataset + 1) Calls to Reconstructor.adjust_optimizer is updating the + hyperparameters + 2) Ensure `recon.model` points to the original `model` + 3) Reconstructions performed by `LBFGS.optimize` and + `model.LBFGS_optimize` calls produce identical results. + 4) The quality of the reconstruction remains below a specified + threshold. + 5) Ensure that the RPI model works fine and dandy with the + Reconstructors. + """ + with open(optical_ptycho_incoherent_pickle, 'rb') as f: + ptycho_results = pickle.load(f) + + probe = ptycho_results['probe'] + background = ptycho_results['background'] + + dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(optical_data_ss_cxi) + model = cdtools.models.RPI.from_dataset(dataset, probe, [500, 500], + background=background, n_modes=2, + initialization='random') + + # Prepare two sets of models for the comparative reconstruction + model_recon = deepcopy(model) + + model.to(device=reconstruction_device) + model_recon.to(device=reconstruction_device) + dataset.get_as(device=reconstruction_device) + + # ******* Reconstructions with cdtools.reconstructors.LBFGS.optimize ****** + print('Running reconstruction using cdtools.reconstructors.LBFGS.' 
+ + 'optimize on provided reconstruction_device,', reconstruction_device) + + recon = cdtools.reconstructors.LBFGS(model=model_recon, dataset=dataset) + t.manual_seed(0) + + # Run a reconstruction + reg_factor_tup = ([0.05, 0.05], [0.001, 0.1]) + epoch_tup = (30, 50) + for i, iterations in enumerate(epoch_tup): + for loss in recon.optimize(iterations, + lr=0.4, + regularization_factor=reg_factor_tup[i]): + if show_plot and i == 0: + model_recon.inspect(dataset) + print(model_recon.report()) + + # Check hyperparameter update (or lack thereof) + assert recon.optimizer.param_groups[0]['lr'] == 0.4 + + if show_plot: + model_recon.inspect(dataset) + model_recon.compare(dataset) + plt.show() + + # Check model pointing + assert id(model_recon) == id(recon.model) + + # ******* Reconstructions with cdtools.reconstructors.LBFGS.optimize ****** + print('Running reconstruction using CDIModel.LBFGS_optimize.' + + 'optimize on provided reconstruction_device,', reconstruction_device) + t.manual_seed(0) + for i, iterations in enumerate(epoch_tup): + for loss in model.LBFGS_optimize(iterations, + dataset, + lr=0.4, + regularization_factor=reg_factor_tup[i]): # noqa + if show_plot and i == 0: + model.inspect(dataset) + print(model.report()) + + if show_plot: + model.inspect(dataset) + model.compare(dataset) + plt.show() + + # Check loss equivalency between the two reconstructions + assert np.allclose(model.loss_history[-1], model_recon.loss_history[-1]) + + # The final loss when testing this was 2.28607e-3. Based on this, we set + # a threshold of 2.3e-3 for the tested loss. If this value has been + # exceeded, the reconstructions have gotten worse. 
+ assert model.loss_history[-1] < 0.0023 From 5a6c66f8457d04ef954f70cc3b3263d7502c0b35 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 17 Jul 2025 18:50:34 +0000 Subject: [PATCH 109/115] Cleaned up and refactored parts of test_Adam_gold_balls --- tests/test_reconstructors.py | 92 +++++++++++++----------------------- 1 file changed, 34 insertions(+), 58 deletions(-) diff --git a/tests/test_reconstructors.py b/tests/test_reconstructors.py index 0213cb10..92e19ce0 100644 --- a/tests/test_reconstructors.py +++ b/tests/test_reconstructors.py @@ -6,8 +6,9 @@ from matplotlib import pyplot as plt from copy import deepcopy + @pytest.mark.slow -def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): +def test_Adam_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): """ This test checks out several things with the Au particle dataset 1) Calls to Reconstructor.adjust_optimizer is updating the @@ -18,6 +19,8 @@ def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): `model.Adam_optimize` calls produce identical results. 5) The quality of the reconstruction remains below a specified threshold. + 5) Ensure that the FancyPtycho model works fine and dandy with the + Reconstructors. 
""" print('\nTesting performance on the standard gold balls dataset ' + @@ -41,57 +44,39 @@ def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): # Make a copy of the model model_recon = deepcopy(model) - model.to(device=reconstruction_device) model_recon.to(device=reconstruction_device) dataset.get_as(device=reconstruction_device) - # Make sure that we're not going to perform reconstructions on the same - # model - assert id(model_recon) != id(model) - # ******* Reconstructions with cdtools.reconstructors.Adam.optimize ******* print('Running reconstruction using cdtools.reconstructors.Adam.optimize' + ' on provided reconstruction_device,', reconstruction_device) recon = cdtools.reconstructors.Adam(model=model_recon, dataset=dataset) t.manual_seed(0) - for loss in recon.optimize(20, lr=0.005, batch_size=50): - print(model_recon.report()) - if show_plot and model_recon.epoch % 10 == 0: - model_recon.inspect(dataset) - - # Test 1a: Ensure that the Adam.optimizer.param_groups learning rate and - # batch size got updated - assert recon.optimizer.param_groups[0]['lr'] == 0.005 - assert recon.data_loader.batch_size == 50 - - # Test 2: Ensure that recon does not have sampler as an attribute - # (for multi-GPU) - assert not hasattr(recon, 'sampler') - for loss in recon.optimize(50, lr=0.002, batch_size=100): - print(model_recon.report()) - if show_plot and model_recon.epoch % 10 == 0: - model_recon.inspect(dataset) + # Run a reconstruction + epoch_tup = (20, 50, 100) + lr_tup = (0.005, 0.002, 0.001) + batch_size_tup = (50, 100, 100) - # Test 1b: Ensure that the Adam.optimizer.param_groups learning rate and - # batch size got updated - assert recon.optimizer.param_groups[0]['lr'] == 0.002 - assert recon.data_loader.batch_size == 100 + for i, iterations in enumerate(epoch_tup): + for loss in recon.optimize(iterations, + lr=lr_tup[i], + batch_size=batch_size_tup[i]): + print(model_recon.report()) + if show_plot and model_recon.epoch % 10 == 0: + 
model_recon.inspect(dataset) - for loss in recon.optimize(100, lr=0.001, batch_size=100, - schedule=True): - print(model_recon.report()) - if show_plot and model_recon.epoch % 10 == 0: - model_recon.inspect(dataset) + # Check hyperparameter update + assert recon.optimizer.param_groups[0]['lr'] == lr_tup[i] + assert recon.data_loader.batch_size == batch_size_tup[i] - # Test 1c: Ensure that the Adam.optimizer.param_groups learning rate and - # batch size got updated - assert recon.optimizer.param_groups[0]['lr'] == 0.001 - assert recon.data_loader.batch_size == 100 + # Ensure that recon does not have sampler as an attribute (only used in + # multi-GPU) + assert not hasattr(recon, 'sampler') - # Test 3: Ensure recon.model points to the original model + # Ensure recon.model points to the original model assert id(model_recon) == id(recon.model) model_recon.tidy_probes() @@ -99,47 +84,38 @@ def test_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): if show_plot: model_recon.inspect(dataset) model_recon.compare(dataset) - - loss_recon = model_recon.loss_history[-1] + plt.show() # ******* Reconstructions with cdtools.CDIModel.Adam_optimize ******* print('Running reconstruction using CDIModel.Adam_optimize on provided' + ' reconstruction_device,', reconstruction_device) t.manual_seed(0) - for loss in model.Adam_optimize(20, dataset, lr=0.005, batch_size=50): - print(model.report()) - if show_plot and model.epoch % 10 == 0: - model.inspect(dataset) - - for loss in model.Adam_optimize(50, dataset, lr=0.002, batch_size=100): - print(model.report()) - if show_plot and model.epoch % 10 == 0: - model.inspect(dataset) - - for loss in model.Adam_optimize(100, dataset, lr=0.001, batch_size=100, - schedule=True): - print(model.report()) - if show_plot and model.epoch % 10 == 0: - model.inspect(dataset) + for i, iterations in enumerate(epoch_tup): + for loss in model.Adam_optimize(iterations, + dataset, + lr=lr_tup[i], + batch_size=batch_size_tup[i]): + 
print(model.report()) + if show_plot and model.epoch % 10 == 0: + model.inspect(dataset) model.tidy_probes() if show_plot: model.inspect(dataset) model.compare(dataset) - - loss_model = model.loss_history[-1] + plt.show() # Test 4: Ensure equivalency between the model reconstructions - assert np.allclose(loss_recon, loss_model) + assert np.allclose(model_recon.loss_history[-1], model.loss_history[-1]) # Test 5: Ensure reconstructions have reached a certain loss tolerance # This just comes from running a reconstruction when it was # working well and choosing a rough value. If it triggers this # assertion error, something changed to make the final quality # worse! - assert loss_model < 0.0001 + assert model.loss_history[-1] < 0.0001 @pytest.mark.slow From 01ebaf2b71dc90139bf2a34cab8d324fe03c4155 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Thu, 17 Jul 2025 20:34:03 +0000 Subject: [PATCH 110/115] Added pytest for the SGD Reconstructor --- tests/test_reconstructors.py | 120 +++++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 6 deletions(-) diff --git a/tests/test_reconstructors.py b/tests/test_reconstructors.py index 92e19ce0..4e6ee3c1 100644 --- a/tests/test_reconstructors.py +++ b/tests/test_reconstructors.py @@ -107,14 +107,13 @@ def test_Adam_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): model.compare(dataset) plt.show() - # Test 4: Ensure equivalency between the model reconstructions + # Ensure equivalency between the model reconstructions assert np.allclose(model_recon.loss_history[-1], model.loss_history[-1]) - # Test 5: Ensure reconstructions have reached a certain loss tolerance - # This just comes from running a reconstruction when it was - # working well and choosing a rough value. If it triggers this - # assertion error, something changed to make the final quality - # worse! + # Ensure reconstructions have reached a certain loss tolerance. 
This just + # comes from running a reconstruction when it was working well and + # choosing a rough value. If it triggers this assertion error, something + # changed to make the final quality worse! assert model.loss_history[-1] < 0.0001 @@ -207,3 +206,112 @@ def test_LBFGS_RPI(optical_data_ss_cxi, # a threshold of 2.3e-3 for the tested loss. If this value has been # exceeded, the reconstructions have gotten worse. assert model.loss_history[-1] < 0.0023 + + +@pytest.mark.slow +def test_SGD_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): + """ + This test checks out several things with the Au particle dataset + 1) Calls to Reconstructor.adjust_optimizer is updating the + hyperparameters + 3) Ensure `recon.model` points to the original `model` + 4) Reconstructions performed by `SGD.optimize` and + `model.SGD_optimize` calls produce identical results. + 5) The quality of the reconstruction remains below a specified + threshold. + 5) Ensure that the FancyPtycho model works fine and dandy with the + Reconstructors. + + The hyperparameters used in this test are not optimized to produce + a super-high-quality reconstruction. Instead, I just need A reconstruction + to do some kind of comparative assessment. 
+ """ + print('\nTesting performance on the standard gold balls dataset ' + + 'with reconstructors.SGD') + + dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(gold_ball_cxi) + pad = 10 + dataset.pad(pad) + model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + probe_support_radius=50, + propagation_distance=2e-6, + units='um', + probe_fourier_crop=pad + ) + + model.translation_offsets.data += 0.7 * \ + t.randn_like(model.translation_offsets) + model.weights.requires_grad = False + + # Make a copy of the model + model_recon = deepcopy(model) + model.to(device=reconstruction_device) + model_recon.to(device=reconstruction_device) + dataset.get_as(device=reconstruction_device) + + # ******* Reconstructions with cdtools.reconstructors.SGD.optimize ******* + print('Running reconstruction using cdtools.reconstructors.SGD.optimize' + + ' on provided reconstruction_device,', reconstruction_device) + + recon = cdtools.reconstructors.SGD(model=model_recon, dataset=dataset) + t.manual_seed(0) + + # Run a reconstruction + epochs = 50 + lr = 0.00000005 + batch_size = 40 + + for loss in recon.optimize(epochs, + lr=lr, + batch_size=batch_size): + print(model_recon.report()) + if show_plot and model_recon.epoch % 10 == 0: + model_recon.inspect(dataset) + + # Check hyperparameter update + assert recon.optimizer.param_groups[0]['lr'] == lr + assert recon.data_loader.batch_size == batch_size + + # Ensure that recon does not have sampler as an attribute (only used in + # multi-GPU) + assert not hasattr(recon, 'sampler') + + # Ensure recon.model points to the original model + assert id(model_recon) == id(recon.model) + + model_recon.tidy_probes() + + if show_plot: + model_recon.inspect(dataset) + model_recon.compare(dataset) + plt.show() + + # ******* Reconstructions with cdtools.CDIModel.SGD_optimize ******* + print('Running reconstruction using CDIModel.SGD_optimize on provided' + + ' reconstruction_device,', reconstruction_device) + t.manual_seed(0) + + for loss 
in model.SGD_optimize(epochs, + dataset, + lr=lr, + batch_size=batch_size): + print(model.report()) + if show_plot and model.epoch % 10 == 0: + model.inspect(dataset) + + model.tidy_probes() + + if show_plot: + model.inspect(dataset) + model.compare(dataset) + plt.show() + + # Ensure equivalency between the model reconstructions + assert np.allclose(model_recon.loss_history[-1], model.loss_history[-1]) + + # The final loss when testing this was 7.12188e-4. Based on this, we set + # a threshold of 7.2e-4 for the tested loss. If this value has been + # exceeded, the reconstructions have gotten worse. + assert model.loss_history[-1] < 0.00072 From a0052fdb43f4763afaf70427e6f1e07c97b17707 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 18 Jul 2025 18:42:32 +0000 Subject: [PATCH 111/115] Updated Rank 0 multi gpu flagging for ptycho_2d_dataset and CDIModel --- src/cdtools/datasets/ptycho_2d_dataset.py | 2 + src/cdtools/models/base.py | 45 ++++++++++++----------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/cdtools/datasets/ptycho_2d_dataset.py b/src/cdtools/datasets/ptycho_2d_dataset.py index 2350e5c2..9d719275 100644 --- a/src/cdtools/datasets/ptycho_2d_dataset.py +++ b/src/cdtools/datasets/ptycho_2d_dataset.py @@ -198,6 +198,8 @@ def to_cxi(self, cxi_file): cxi_file : str, pathlib.Path, or h5py.File The .cxi file to write to """ + if self.multi_gpu_used and self.rank != 0: + return # If a bare string is passed if isinstance(cxi_file, str) or isinstance(cxi_file, pathlib.Path): diff --git a/src/cdtools/models/base.py b/src/cdtools/models/base.py index 19948df2..0e5e345d 100644 --- a/src/cdtools/models/base.py +++ b/src/cdtools/models/base.py @@ -212,10 +212,8 @@ def save_to_h5(self, filename, *args): Accepts any additional args that model.save_results needs, for this model """ # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU - if self.multi_gpu_used and self.rank != 0: - return - - return nested_dict_to_h5(filename, 
self.save_results(*args)) + if not (self.multi_gpu_used and self.rank != 0): + return nested_dict_to_h5(filename, self.save_results(*args)) @contextmanager @@ -235,18 +233,19 @@ def save_on_exit(self, filename, *args, exception_filename=None): exception_filename : str Optional, a separate filename to use if an exception is raised during execution. Default is equal to filename """ - # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU - if self.multi_gpu_used and self.rank != 0: - return - try: yield - self.save_to_h5(filename, *args) - except: - if exception_filename is None: - exception_filename = filename - self.save_to_h5(exception_filename, *args) - raise + + # Only let the Rank 0 GPU handle saving in multi-GPU + if not (self.multi_gpu_used and self.rank != 0): + self.save_to_h5(filename, *args) + + except Exception as e: + if not (self.multi_gpu_used and self.rank != 0): + if exception_filename is None: + exception_filename = filename + self.save_to_h5(exception_filename, *args) + raise e @contextmanager def save_on_exception(self, filename, *args): @@ -265,16 +264,14 @@ def save_on_exception(self, filename, *args): Accepts any additional args that model.save_results needs, for this model """ # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU - if self.multi_gpu_used and self.rank != 0: - return - try: yield - except: - self.save_to_h5(filename, *args) - print('Intermediate results saved under name:') - print(filename, flush=True) - raise + except Exception as e: + if not (self.multi_gpu_used and self.rank != 0): + self.save_to_h5(filename, *args) + print('Intermediate results saved under name:') + print(filename, flush=True) + raise e def use_checkpoints(self, job_id, checkpoint_file_stem): @@ -296,6 +293,10 @@ def skip_computation(self): return False def save_checkpoint(self, *args, checkpoint_file=None): + # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU + if self.multi_gpu_used and self.rank != 0: 
+ return + checkpoint = self.save_results(*args) if (hasattr(self, 'current_optimizer') and self.current_optimizer is not None): From 30eeb431248c55f3126bcd68bfb7eec6e21180c5 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 18 Jul 2025 20:33:24 +0000 Subject: [PATCH 112/115] Added plotting and saving test and got rid of plt show statements --- src/cdtools/tools/distributed/distributed.py | 1 - tests/conftest.py | 6 ++ .../multi_gpu_script_plot_and_save.py | 77 ++++++++++++++++++ tests/multi_gpu/test_multi_gpu.py | 79 ++++++++++++++++++- tests/test_reconstructors.py | 6 -- 5 files changed, 161 insertions(+), 8 deletions(-) create mode 100644 tests/multi_gpu/multi_gpu_script_plot_and_save.py diff --git a/src/cdtools/tools/distributed/distributed.py b/src/cdtools/tools/distributed/distributed.py index 9e8d11d8..d956343b 100644 --- a/src/cdtools/tools/distributed/distributed.py +++ b/src/cdtools/tools/distributed/distributed.py @@ -657,7 +657,6 @@ def run_speed_test(world_sizes: List[int], ax2.set_xlabel('Epochs') ax3.set_xlabel('Number of GPUs') ax3.set_ylabel('Speed-up relative to single GPU') - plt.show() print('[INFO]: Multi-GPU speed test completed.') diff --git a/tests/conftest.py b/tests/conftest.py index b973e99e..d45e4f55 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -433,3 +433,9 @@ def example_nested_dicts(pytestconfig): def multigpu_script_1(pytestconfig): return str(pytestconfig.rootpath) + \ '/tests/multi_gpu/multi_gpu_script_quality.py' + + +@pytest.fixture(scope='module') +def multigpu_script_2(pytestconfig): + return str(pytestconfig.rootpath) + \ + '/tests/multi_gpu/multi_gpu_script_plot_and_save.py' diff --git a/tests/multi_gpu/multi_gpu_script_plot_and_save.py b/tests/multi_gpu/multi_gpu_script_plot_and_save.py new file mode 100644 index 00000000..456fae29 --- /dev/null +++ b/tests/multi_gpu/multi_gpu_script_plot_and_save.py @@ -0,0 +1,77 @@ +import cdtools +import os +from matplotlib import pyplot as plt + +filename = 
os.environ.get('CDTOOLS_TESTING_DATA_PATH') +savedir = os.environ.get('CDTOOLS_TESTING_TMP_PATH') +SHOW_PLOT = bool(int(os.environ.get('CDTOOLS_TESTING_SHOW_PLOT'))) +dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename) + +model = cdtools.models.FancyPtycho.from_dataset( + dataset, + n_modes=3, + oversampling=2, + probe_support_radius=120, + propagation_distance=5e-3, + units='mm', + obj_view_crop=-50, +) + +device = 'cuda' +model.to(device=device) +dataset.get_as(device=device) + +# Test Ptycho2DDataset.inspect +if SHOW_PLOT: + dataset.inspect() + +# Test Ptycho2DDataset.to_cxi +filename_to_cxi = os.path.join(savedir, + f'RANK_{model.rank}_test_to_cxi.h5') +dataset.to_cxi(filename_to_cxi) + +# Test CDIModel.save_to_h5 +filename_save_to_h5 = os.path.join(savedir, + f'RANK_{model.rank}_test_save_to.h5') +model.save_to_h5(filename_save_to_h5, dataset) + +# Test CDIModel.save_on_exit(), CDIModel.inspect() +filename_save_on_exit = os.path.join(savedir, + f'RANK_{model.rank}_test_save_on_exit.h5') + +with model.save_on_exit(filename_save_on_exit, dataset): + for loss in model.Adam_optimize(5, dataset, lr=0.02, batch_size=40): + if model.rank == 0: + print(model.report()) + if SHOW_PLOT: + model.inspect(dataset) + + +if SHOW_PLOT: + # Test CDIModel.compare(dataset) + model.compare(dataset) + + # Test CDIModel.save_figures() + filename_save_figures = os.path.join(savedir, + f'RANK_{model.rank}_test_plot_') + model.save_figures(prefix=filename_save_figures, + extension='.png') + + plt.close('all') + +# Test CDIModel.save_checkpoint +filename_save_checkpoint = \ + os.path.join(savedir, f'RANK_{model.rank}_test_save_checkpoint.pt') +model.save_checkpoint(dataset, checkpoint_file=filename_save_checkpoint) + +# Test CDIModel.save_on_exception() +filename_save_on_except = \ + os.path.join(savedir, f'RANK_{model.rank}_test_save_on_except.h5') + +with model.save_on_exception(filename_save_on_except, dataset): + for loss in model.Adam_optimize(10, dataset, lr=0.02, 
batch_size=40): + if model.rank == 0 and model.epoch <= 10: + print(model.report()) + elif model.epoch > 10: + raise Exception('This is a deliberate exception raised to ' + + 'test save on exception') diff --git a/tests/multi_gpu/test_multi_gpu.py b/tests/multi_gpu/test_multi_gpu.py index 389cc5bb..93563ffd 100644 --- a/tests/multi_gpu/test_multi_gpu.py +++ b/tests/multi_gpu/test_multi_gpu.py @@ -1,6 +1,7 @@ import cdtools.tools.distributed as dist import pytest import os +import subprocess """ This file contains several tests that are relevant to running multi-GPU @@ -8,6 +9,82 @@ """ +@pytest.mark.multigpu +def test_plotting_and_saving(lab_ptycho_cxi, + multigpu_script_2, + tmp_path, + show_plot): + """ + Run a multi-GPU test on a script that executes several plotting and + file-saving methods from CDIModel and ensure they run without failure. + + Also, make sure that only 1 GPU is generating the plots. + + If this test fails, one of three things happened: + 1) Either something failed while multigpu_script_2 was called + 2) Somehow, something aside from Rank 0 saved results + 3) multigpu_script_2 was not able to save all the data files + we asked it to save. + """ + # Pass the cxi directory to the reconstruction script + # Define a temporary directory + + # Run the test script, which generates several files that either have + # the prefix + cmd = ['torchrun', + '--standalone', + '--nnodes=1', + '--nproc_per_node=2', + '-m', + 'cdtools.tools.distributed.single_to_multi_gpu', + '--backend=nccl', + '--timeout=30', + '--nccl_p2p_disable=1', + multigpu_script_2] + + child_env = os.environ.copy() + child_env['CDTOOLS_TESTING_DATA_PATH'] = lab_ptycho_cxi + child_env['CDTOOLS_TESTING_TMP_PATH'] = str(tmp_path) + child_env['CDTOOLS_TESTING_SHOW_PLOT'] = str(int(show_plot)) + + try: + subprocess.run(cmd, check=True, env=child_env) + except subprocess.CalledProcessError: + # The called script is designed to throw an exception. 
+ # TODO: Figure out how to distinguish between the engineered error + # in the script versus any other error. + pass + + # Check if all the generated file names only have the prefix 'RANK_0' + filelist = [f for f in os.listdir(tmp_path) + if os.path.isfile(os.path.join(tmp_path, f))] + + assert all([file.startswith('RANK_0') for file in filelist]) + print('All files have the RANK_0 prefix.') + + # Check if plots have been saved + if show_plot: + print('Plots generated: ' + + f'{sum([file.startswith('RANK_0_test_plot') + for file in filelist])}') + assert any([file.startswith('RANK_0_test_plot') for file in filelist]) + else: + print('--plot not enabled. Checks on plotting and figure saving' + + ' will not be conducted.') + + # Check if we have all five data files saved + file_output_suffix = ('test_save_checkpoint.pt', + 'test_save_on_exit.h5', + 'test_save_on_except.h5', + 'test_save_to.h5', + 'test_to_cxi.h5') + + print(f'{sum([file.endswith(file_output_suffix) for file in filelist])}' + + ' out of 5 data files have been generated.') + assert sum([file.endswith(file_output_suffix) for file in filelist]) \ + == len(file_output_suffix) + + @pytest.mark.multigpu def test_reconstruction_quality(lab_ptycho_cxi, multigpu_script_1, @@ -33,7 +110,7 @@ def test_reconstruction_quality(lab_ptycho_cxi, file_prefix = 'speed_test' # Define a temporary directory - temp_dir = str(tmp_path / "reconstruction") + temp_dir = str(tmp_path) results = dist.run_speed_test(world_sizes=world_sizes, runs=runs, diff --git a/tests/test_reconstructors.py b/tests/test_reconstructors.py index 4e6ee3c1..0f46239c 100644 --- a/tests/test_reconstructors.py +++ b/tests/test_reconstructors.py @@ -84,7 +84,6 @@ def test_Adam_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): if show_plot: model_recon.inspect(dataset) model_recon.compare(dataset) - plt.show() # ******* Reconstructions with cdtools.CDIModel.Adam_optimize ******* print('Running reconstruction using CDIModel.Adam_optimize on 
provided' + @@ -105,7 +104,6 @@ def test_Adam_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): if show_plot: model.inspect(dataset) model.compare(dataset) - plt.show() # Ensure equivalency between the model reconstructions assert np.allclose(model_recon.loss_history[-1], model.loss_history[-1]) @@ -176,7 +174,6 @@ def test_LBFGS_RPI(optical_data_ss_cxi, if show_plot: model_recon.inspect(dataset) model_recon.compare(dataset) - plt.show() # Check model pointing assert id(model_recon) == id(recon.model) @@ -197,7 +194,6 @@ def test_LBFGS_RPI(optical_data_ss_cxi, if show_plot: model.inspect(dataset) model.compare(dataset) - plt.show() # Check loss equivalency between the two reconstructions assert np.allclose(model.loss_history[-1], model_recon.loss_history[-1]) @@ -286,7 +282,6 @@ def test_SGD_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): if show_plot: model_recon.inspect(dataset) model_recon.compare(dataset) - plt.show() # ******* Reconstructions with cdtools.CDIModel.SGD_optimize ******* print('Running reconstruction using CDIModel.SGD_optimize on provided' + @@ -306,7 +301,6 @@ def test_SGD_gold_balls(gold_ball_cxi, reconstruction_device, show_plot): if show_plot: model.inspect(dataset) model.compare(dataset) - plt.show() # Ensure equivalency between the model reconstructions assert np.allclose(model_recon.loss_history[-1], model.loss_history[-1]) From a2f5956a3e3691db5a364bc67322bdba13369c71 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 18 Jul 2025 22:22:11 +0000 Subject: [PATCH 113/115] ReduceLROnPlateau works with multi-GPU --- src/cdtools/reconstructors/base.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/cdtools/reconstructors/base.py b/src/cdtools/reconstructors/base.py index ec5e198a..017218d6 100644 --- a/src/cdtools/reconstructors/base.py +++ b/src/cdtools/reconstructors/base.py @@ -236,6 +236,16 @@ def closure(): if self.scheduler is not None: self.scheduler.step(loss) + # Broadcast the learning rate 
based on Rank 0 for multi-GPU
+            if self.model.multi_gpu_used:
+                for param_group in self.optimizer.param_groups:
+                    # Make sure we broadcast over whatever device type
+                    # we're using. Only tested over cuda.
+                    lr_tensor = t.tensor(param_group['lr'],
+                                         device=self.model.obj.device)
+                    dist.broadcast(lr_tensor, src=0)
+                    param_group['lr'] = lr_tensor.item()
+
         self.model.loss_history.append(loss)
         self.model.loss_times.append(time.time() - self.model.INITIAL_TIME)
         self.model.epoch = len(self.model.loss_history)

From 219bce78d79afa7c3b548844112e96f2f8776543 Mon Sep 17 00:00:00 2001
From: yoshikisd
Date: Fri, 18 Jul 2025 22:31:06 +0000
Subject: [PATCH 114/115] Changed single to double quote in
 test_plotting_and_saving

---
 tests/multi_gpu/test_multi_gpu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/multi_gpu/test_multi_gpu.py b/tests/multi_gpu/test_multi_gpu.py
index 93563ffd..0134d01b 100644
--- a/tests/multi_gpu/test_multi_gpu.py
+++ b/tests/multi_gpu/test_multi_gpu.py
@@ -65,8 +65,8 @@
     # Check if plots have been saved
     if show_plot:
         print('Plots generated: ' +
-              f'{sum([file.startswith('RANK_0_test_plot')
-                      for file in filelist])}')
+              f"{sum([file.startswith('RANK_0_test_plot')
+                      for file in filelist])}")
         assert any([file.startswith('RANK_0_test_plot') for file in filelist])
     else:
         print('--plot not enabled. 
Checks on plotting and figure saving' + From 95988d3aa81ec8d6f8bcdae5080cb3506a29bc83 Mon Sep 17 00:00:00 2001 From: yoshikisd Date: Fri, 18 Jul 2025 22:36:21 +0000 Subject: [PATCH 115/115] Make the print statement in test_plotting_and_saving a single line --- tests/multi_gpu/test_multi_gpu.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/multi_gpu/test_multi_gpu.py b/tests/multi_gpu/test_multi_gpu.py index 0134d01b..f26db2cc 100644 --- a/tests/multi_gpu/test_multi_gpu.py +++ b/tests/multi_gpu/test_multi_gpu.py @@ -65,8 +65,7 @@ def test_plotting_and_saving(lab_ptycho_cxi, # Check if plots have been saved if show_plot: print('Plots generated: ' + - f"{sum([file.startswith('RANK_0_test_plot') - for file in filelist])}") + f"{sum([file.startswith('RANK_0_test_plot') for file in filelist])}") # noqa assert any([file.startswith('RANK_0_test_plot') for file in filelist]) else: print('--plot not enabled. Checks on plotting and figure saving' +