Closed

125 commits
2ad6db2
Import torch packages to handle distributed computing in models
yoshikisd Feb 23, 2025
c393f47
Duplicated fancy ptycho example for multi gpu demo with DistributedDa…
yoshikisd Feb 23, 2025
fc40b77
Reworked fancy ptycho example for multi GPU testing with DDP
yoshikisd Feb 23, 2025
e8ed010
Added a name-main block to the fancy ptycho multi gpu example
yoshikisd Feb 23, 2025
94dcc3f
Fixed a double-period typo
yoshikisd Feb 23, 2025
ad3d816
Added basic multi GPU support for AD_optimize and Adam_optimize in CD…
yoshikisd Feb 23, 2025
efe2ac1
Fixes for fancy_ptycho_multi_gpu_ddp.py
yoshikisd Feb 23, 2025
1eb8704
Created fancy_ptycho_multi_gpu_ddp_speed_test.py to compare reconstru…
yoshikisd Feb 24, 2025
7c76626
Moved dataset import outside of function
yoshikisd Feb 27, 2025
9831b19
Disabled NCCL peer-2-peer communication in fancy_ptycho_multi_gpu_ddp.py
yoshikisd Feb 27, 2025
057233c
Disabled NCCL P2P for the multi gpu speed test
yoshikisd Feb 27, 2025
d6a053b
Merge branch 'cdtools-developers:master' into feature/multi-gpu
yoshikisd Feb 27, 2025
902d272
Created new module cdtools.tools.distributed for multi-GPU applications
yoshikisd Mar 1, 2025
aedc035
Spawning handled by cdtools.tools.distributed module in multi-gpu exa…
yoshikisd Mar 1, 2025
4521f35
Created a wrapper for managing process groups in reconstruction scripts
yoshikisd Mar 2, 2025
00ee707
Removed process group managing functions from fancy_ptycho_multi_gpu_…
yoshikisd Mar 2, 2025
db01ac8
Changed type hint for multi_gpu_reconstruct from array to ndarray
yoshikisd Mar 2, 2025
5be1770
Fixed dumb typo for ndarray
yoshikisd Mar 2, 2025
ad3c1e3
Device loading of the model and dataset is now handled by process_man…
yoshikisd Mar 2, 2025
c9165ea
Refactor: process_manager renamed to reconstructor_wrapper. reconstru…
yoshikisd Mar 2, 2025
c965001
DDP is now handled by distributed_wrapper. reconstructor_wrapper was …
yoshikisd Mar 3, 2025
dafddf7
Added description to cdtools.tools.distributed.distributed
yoshikisd Mar 3, 2025
5d6739a
Changed type annotation for multi_gpu_reconstruct
yoshikisd Mar 3, 2025
a5c3095
CDIModel methods can just use model._ when using cdtools.tools.distri…
yoshikisd Mar 8, 2025
553f70d
Removed DistributedDataParallel from CDIModel
yoshikisd Mar 8, 2025
c74aaf4
Models calculated on multiple GPUs automatically perform plotting met…
yoshikisd Mar 8, 2025
7278a37
model.Adam_optimize no longer need rank or world_size parameters for …
yoshikisd Mar 8, 2025
72b4d6a
Edit file name path for fancy_ptycho_multi_gpu_ddp.py
yoshikisd Mar 8, 2025
0ac1121
Added type hints to distributed.py
yoshikisd Mar 8, 2025
57dca98
cdtools.tools.distributed.distributed methods can take Connection obj…
yoshikisd Mar 8, 2025
e6e8130
Updated the multi-gpu speed test
yoshikisd Mar 8, 2025
7514ca6
Updated function description and removed os dependency for the multi-…
yoshikisd Mar 8, 2025
6e4145f
Fixed dataset path for multi gpu speed test
yoshikisd Mar 8, 2025
436696f
Moved data_loader.sampler.set_epoch() to the inside of run_epoch stat…
yoshikisd Mar 8, 2025
4783915
Removed scheduler from the multi gpu speed test script
yoshikisd Mar 10, 2025
77eb21a
Cleaned up the multi gpu speed test
yoshikisd Mar 10, 2025
3e1bb17
Fixed the path in the multi GPU speed test (again)
yoshikisd Mar 10, 2025
5b11dc5
Cleaned up fancy_ptycho_multi_gpu_ddp.py
yoshikisd Mar 11, 2025
00e0413
Removed barrier statement from CDIModel
yoshikisd Mar 11, 2025
dd07c4b
Fix for unintended GPU usage; added ability to define which GPUs to u…
yoshikisd Mar 12, 2025
464ed5a
Fixed discrepancy between user-specified batch_size and effective bat…
yoshikisd Mar 13, 2025
8f1ee06
Fixed list type hint error for Python 3.8
yoshikisd Mar 13, 2025
a82e9fc
Merge branch 'cdtools-developers:master' into feature/multi-gpu
yoshikisd Apr 13, 2025
c29d28c
Switched DDP with all_reduce implementation for distributive computing
yoshikisd Apr 28, 2025
10e4d74
Changed name of multi-gpu speed test. Added gold balls example to the…
yoshikisd Apr 28, 2025
7763520
Created Reconstructor class to enable separation of optimization loop…
yoshikisd May 9, 2025
c610e87
Added comment to adam.py to highlight similarities to CDIModel.AD_opt…
yoshikisd May 9, 2025
3650f3b
Created a script to compare performance of the old and new methods fo…
yoshikisd May 10, 2025
30d17b3
MultiGPU fancy_ptycho example now works with the Reconstructor class
yoshikisd May 10, 2025
30c1f77
Distributed speed test now works with the Reconstructor class
yoshikisd May 10, 2025
854c907
Cleaned up the descriptions and removed unneccessary attributes for t…
yoshikisd May 12, 2025
64f986f
Scheduler is now defined in __init__
yoshikisd May 12, 2025
1b7d094
Moved scheduler back to optimize. Moved setup_dataloader to the Recon…
yoshikisd May 12, 2025
140c65d
Added type annotations to Reconstructor.optimize
yoshikisd May 12, 2025
f690237
Merge branch 'cdtools-developers:master' into feature/multi-gpu
yoshikisd May 28, 2025
81f9f5f
Renamed the Reconstructor class to the Optimizer class
yoshikisd Jun 10, 2025
60737e6
Refactor Optimizer to separate run_epoch from optimize. Also created …
yoshikisd Jun 10, 2025
cfa86b1
Refactor Optimizer to separate run_epoch from optimize. Also created …
yoshikisd Jun 10, 2025
81e5cd9
Removed DDP dependency from distributed.py. Tidied up the docs.
yoshikisd Jun 10, 2025
dfb9d14
CDIModel.Adam_optimize refactored to only use cdtools.optimizer.Adam
yoshikisd Jun 10, 2025
2b8d1ca
Merge branch 'cdtools-developers:master' into feature/multi-gpu
yoshikisd Jun 10, 2025
ac3373c
Revert "Renamed the Reconstructor class to the Optimizer class"
yoshikisd Jun 10, 2025
7fb2b96
Updated __init__.py in optimizer
yoshikisd Jun 10, 2025
2958aff
Renamed the optimizer module to reconstructors
yoshikisd Jun 10, 2025
9341355
Separated LBFGS from CDIModel into a Reconstructor subclass
yoshikisd Jun 12, 2025
4232906
Separated SGD from CDIModel into a Reconstructor subclass
yoshikisd Jun 12, 2025
ce67800
Removed CDIModel.AD_optimize
yoshikisd Jun 12, 2025
29198bd
Merge branch 'cdtools-developers:master' into feature/multi-gpu
yoshikisd Jun 13, 2025
a2967d4
Cleaned up model and reconstructor docs
yoshikisd Jun 13, 2025
4ec8399
Removed model and dataset dependencies from distributed
yoshikisd Jun 14, 2025
bc2669e
Fixed CDIModel rank and world_size assignment for single GPU use
yoshikisd Jun 14, 2025
ca28900
Created working implementation of distributing single-GPU scripts to …
yoshikisd Jun 14, 2025
3b17f10
Fixed bug in CDatasets which uses cuda:0 when t.cuda.set_device is ut…
yoshikisd Jun 15, 2025
588b150
Revert "Fixed bug in CDatasets which uses cuda:0 when t.cuda.set_devi…
yoshikisd Jun 15, 2025
50a8b01
fancy_ptycho example can now be run on several GPUs with no modificat…
yoshikisd Jun 15, 2025
f80989d
rank==0 check implemented for several CDIModel file and figure saving…
yoshikisd Jun 16, 2025
3f4fe57
Removed rank==0 check for CDIModel.report
yoshikisd Jun 16, 2025
742d1dd
rank==0 check implemented for CDataset inspect
yoshikisd Jun 17, 2025
efeaf2e
Created custom console script cdt-torchrun to launch single-GPU scrip…
yoshikisd Jun 17, 2025
83aaa80
Removed several multi-gpu example scripts using depracated methods
yoshikisd Jun 17, 2025
f2d618b
Distributive methods support GPU ID selection
yoshikisd Jun 20, 2025
664b0b2
Single-node multi-worker enforced in cdt-torchrun
yoshikisd Jun 20, 2025
400e5ef
refactored cdt-torchrun to use runpy
yoshikisd Jun 27, 2025
526afdb
tidied up wrap_single_gpu_script
yoshikisd Jun 27, 2025
f7e4c2c
Fix to synchronize RNG seed for multi-gpu
yoshikisd Jun 30, 2025
dd72143
Modified speed test to work with cdt-torchrun
yoshikisd Jul 4, 2025
e5cfd25
Corrected script name in distributed_speed_test.py
yoshikisd Jul 4, 2025
e5689c3
Multi-gpu-related model attributes are no longer stored as reconstruc…
yoshikisd Jul 4, 2025
037151d
Depracated spawn-based distributed methods
yoshikisd Jul 5, 2025
7a16bbc
Depracated torchrunner from distributed
yoshikisd Jul 5, 2025
c5ac40b
Merge branch 'cdtools-developers:master' into feature/multi-gpu
yoshikisd Jul 7, 2025
47bbc14
Merge remote-tracking branch 'refs/remotes/origin/feature/multi-gpu' …
yoshikisd Jul 7, 2025
442f869
Merge branch 'cdtools-developers:master' into feature/multi-gpu
yoshikisd Jul 7, 2025
8906eb3
Force pytest to only run stuff in the tests directory
yoshikisd Jul 7, 2025
0699a8b
Reconstructors are imported in CDIModel only when self.reconstructor …
yoshikisd Jul 7, 2025
245e28a
Created test for the Adam reconstructor
yoshikisd Jul 7, 2025
9f1b89e
Removed au particle test from test_fancy_ptycho (already in test_reco…
yoshikisd Jul 7, 2025
9b0731b
Removed depracated methods from the __all__ in distributed.py
yoshikisd Jul 7, 2025
564f499
Altered the seed synchronization step and reorganized bits of the scr…
yoshikisd Jul 11, 2025
2dbb307
Record the time when each CDIModel.loss_history value is stored
yoshikisd Jul 11, 2025
54691fb
Created speed test decorator, changed speed test environ variables, a…
yoshikisd Jul 11, 2025
7c59d66
Rearranged distributed.py
yoshikisd Jul 11, 2025
ce9541a
Updated the documentation in distributed.py
yoshikisd Jul 11, 2025
5a523cf
Got rid of unused imports in CDIModel
yoshikisd Jul 11, 2025
6bd7187
Added CDIModel import to distributed
yoshikisd Jul 11, 2025
13cfee5
Optional plotting and result saving/deleting added to speed test. Als…
yoshikisd Jul 16, 2025
496e8a3
Created pytest to assess multi-gpu reconstruction quality
yoshikisd Jul 16, 2025
08b8619
Merge branch 'master' into feature/multi-gpu
yoshikisd Jul 16, 2025
3cc184e
Fixed bug that lets the slow and multigpu tests run without setting t…
yoshikisd Jul 16, 2025
919c236
Linted and updated documentation on distributed.py
yoshikisd Jul 17, 2025
5ab3ab6
Got rid of some print statements from distributed.py
yoshikisd Jul 17, 2025
5de4e37
Linted Reconstructors
yoshikisd Jul 17, 2025
be6abcd
Linted single_to_multi_gpu.py
yoshikisd Jul 17, 2025
d4ec9ee
Linted test_reconstructors.py
yoshikisd Jul 17, 2025
27f2d54
Linted multi gpu tests and test scripts
yoshikisd Jul 17, 2025
976e94c
Linted and cleaned up the distributed speed test examples
yoshikisd Jul 17, 2025
7e850fe
Added LBFGS and RPI pytest
yoshikisd Jul 17, 2025
5a6c66f
Cleaned up and refactored parts of test_Adam_gold_balls
yoshikisd Jul 17, 2025
01ebaf2
Added pytest for the SGD Reconstructor
yoshikisd Jul 17, 2025
a0052fd
Updated Rank 0 multi gpu flagging for ptycho_2d_dataset and CDIModel
yoshikisd Jul 18, 2025
30eeb43
Added plotting and saving test and got rid of plt show statements
yoshikisd Jul 18, 2025
a2f5956
ReduceLROnPlateau works with multi-GPU
yoshikisd Jul 18, 2025
219bce7
Changed single to double quote in test_plotting_and_saving
yoshikisd Jul 18, 2025
95988d3
Make the print statement in test_plotting_and_saving a single line
yoshikisd Jul 18, 2025
22db614
Merge branch 'master' into feature/multi-gpu
yoshikisd Nov 4, 2025
46 changes: 46 additions & 0 deletions examples/distributed_speed_test.py
@@ -0,0 +1,46 @@
from cdtools.tools.distributed import run_speed_test

# Define the number of GPUs to use for the test. A single-GPU run must
# always be included, since it provides the baseline for the speed-up
# measurements.
#
# Here, we will run trials with 1 and 2 GPUs.
world_sizes = [1, 2]

# We will run 3 trials per GPU to collect statistics on loss-versus-epoch/time
# data as well as runtime speedup.
runs = 3

# We will run the speed test on a reconstruction script that has been
# modified for speed testing (see fancy_ptycho_speed_test.py)
script_path = 'fancy_ptycho_speed_test.py'

# When we run the modified script with the speed test, a pickle dump file
# will be generated after each trial. The file contains the loss-versus-time
# data measured for that trial at the given GPU count.
output_dir = 'example_loss_data'

# Define the file name prefix. The file will have the following name:
# `<file_prefix>_nGPUs_<world_size>_TRIAL_<run number>.pkl`
file_prefix = 'speed_test'

# We can plot several curves showing what the loss-versus-epoch curves look
# like for each GPU count. The plot will also show the runtime speed-up
# relative to the single-GPU runtime.
show_plot = True

# We can also have `run_speed_test` delete the pickle dump files once each
# trial's data has been read and stored.
delete_output_file = True

# Run the test. The speed test returns several lists containing the means
# and standard deviations of the final recorded losses and runtime
# speed-ups, calculated over the trial runs. Each entry index maps to the
# GPU count specified by `world_sizes`.
final_loss_mean, final_loss_std, speed_up_mean, speed_up_std = \
    run_speed_test(world_sizes=world_sizes,
                   runs=runs,
                   script_path=script_path,
                   output_dir=output_dir,
                   file_prefix=file_prefix,
                   show_plot=show_plot,
                   delete_output_files=delete_output_file)
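Since each returned list is indexed by GPU count, a little post-processing turns the statistics into a readable table. A minimal sketch (the numeric values are placeholders, and `summarize` is a hypothetical helper, not part of cdtools):

```python
# Hypothetical post-processing of run_speed_test's return values. Each
# list entry corresponds to the GPU count at the same index in world_sizes;
# the numbers here are placeholders, not measured results.
world_sizes = [1, 2]
final_loss_mean = [0.0123, 0.0125]
final_loss_std = [0.0004, 0.0006]
speed_up_mean = [1.00, 1.62]
speed_up_std = [0.00, 0.05]


def summarize(world_sizes, loss_mu, loss_sd, speed_mu, speed_sd):
    """Format one summary line per GPU count."""
    return [
        f'{n} GPU(s): final loss {lm:.4f} +/- {ls:.4f}, '
        f'speed-up {sm:.2f}x +/- {ss:.2f}'
        for n, lm, ls, sm, ss in zip(world_sizes, loss_mu, loss_sd,
                                     speed_mu, speed_sd)
    ]


for line in summarize(world_sizes, final_loss_mean, final_loss_std,
                      speed_up_mean, speed_up_std):
    print(line)
```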
53 changes: 53 additions & 0 deletions examples/fancy_ptycho_speed_test.py
@@ -0,0 +1,53 @@
import cdtools


# To modify fancy_ptycho.py for a multi-GPU speed test, we need to enclose the
# entire reconstruction script in a function. The function then needs to be
# decorated with cdtools.tools.distributed.report_speed_test. The decorator
# allows data to be saved and read by the multi-GPU speed test function
# which we will use to run this script.
@cdtools.tools.distributed.report_speed_test
def main():
    filename = 'example_data/lab_ptycho_data.cxi'
    dataset = cdtools.datasets.Ptycho2DDataset.from_cxi(filename)

    model = cdtools.models.FancyPtycho.from_dataset(
        dataset,
        n_modes=3,
        oversampling=2,
        probe_support_radius=120,
        propagation_distance=5e-3,
        units='mm',
        obj_view_crop=-50
    )

    device = 'cuda'
    model.to(device=device)
    dataset.get_as(device=device)

    # Remove or comment out any existing plotting statements
    for loss in model.Adam_optimize(50, dataset, lr=0.02, batch_size=40):
        # Optional: ensure that only a single GPU prints a report by
        # adding an if statement. Without this, the print statement will
        # be called by all participating GPUs, resulting in multiple copies
        # of the printed model report.
        if model.rank == 0:
            print(model.report())

    for loss in model.Adam_optimize(25, dataset, lr=0.005, batch_size=40):
        if model.rank == 0:
            print(model.report())

    for loss in model.Adam_optimize(25, dataset, lr=0.001, batch_size=40):
        if model.rank == 0:
            print(model.report())

    model.tidy_probes()

    # We need to return the model so the data can be saved by the decorator.
    return model


# We also need to include this if-name-main block at the end
if __name__ == '__main__':
    main()
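The `model.rank == 0` guard works because a torchrun-style launcher exports `RANK` and `WORLD_SIZE` into each worker's environment, and the model picks them up at construction. The gating logic can be sketched standalone (the `is_rank_zero` helper is hypothetical, not a cdtools function):

```python
import os


def is_rank_zero():
    """True when running single-process, or on rank 0 of a multi-GPU launch.

    Launchers in the torchrun family export RANK and WORLD_SIZE into each
    worker's environment. A plain `python script.py` run sets neither, so
    the default of 0 makes every side effect (reports, plots, saves) fire
    exactly as in single-GPU use.
    """
    return int(os.environ.get('RANK', 0)) == 0


if is_rank_zero():
    print('this process handles reports, plots, and saves')
```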
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -1,3 +1,6 @@
[tool.ruff]
# Decrease the maximum line length to 79 characters.
line-length = 79
line-length = 79

[tool.pytest.ini_options]
testpaths = 'tests'
5 changes: 5 additions & 0 deletions setup.py
@@ -47,5 +47,10 @@
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
    entry_points={
        'console_scripts': {
            'cdt-torchrun = cdtools.tools.distributed.distributed:run_single_to_multi_gpu'
        }
    }
)
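The `entry_points` table registers `cdt-torchrun` as a console script using the standard `name = module:function` spec format. A small sketch of how such a spec string decomposes (the `parse_entry_point` helper is hypothetical, for illustration only):

```python
def parse_entry_point(spec):
    """Split a console_scripts spec of the form 'name = module:attr'."""
    # partition('=') separates the script name from its target;
    # partition(':') separates the module path from the attribute.
    name, _, target = (part.strip() for part in spec.partition('='))
    module, _, attr = target.partition(':')
    return name, module, attr


spec = ('cdt-torchrun = '
        'cdtools.tools.distributed.distributed:run_single_to_multi_gpu')
print(parse_entry_point(spec))
```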

13 changes: 13 additions & 0 deletions src/cdtools/datasets/base.py
Expand Up @@ -19,6 +19,7 @@
import pathlib
from cdtools.tools import data as cdtdata
from torch.utils import data as torchdata
import os

__all__ = ['CDataset']

Expand Down Expand Up @@ -92,6 +93,18 @@ def __init__(

        self.get_as(device='cpu')

        # These attributes indicate to the CDataset methods whether or not
        # multi-GPU calculations are being performed. These flags are mostly
        # used to prevent duplicate plots when CDataset.inspect is called.
        rank = os.environ.get('RANK')
        world_size = os.environ.get('WORLD_SIZE')
        # Rank of the subprocess running the GPU (default: rank 0)
        self.rank = int(rank) if rank is not None else 0
        # Total number of GPUs being used.
        self.world_size = int(world_size) if world_size is not None else 1
        self.multi_gpu_used = self.world_size > 1

    def to(self, *args, **kwargs):
        """Sends the relevant data to the given device and dtype
6 changes: 5 additions & 1 deletion src/cdtools/datasets/ptycho_2d_dataset.py
@@ -198,6 +198,8 @@ def to_cxi(self, cxi_file):
        cxi_file : str, pathlib.Path, or h5py.File
            The .cxi file to write to
        """
        # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU
        if self.multi_gpu_used and self.rank != 0:
            return

        # If a bare string is passed
        if isinstance(cxi_file, (str, pathlib.Path)):
@@ -230,7 +232,9 @@ def inspect(
        can display a base-10 log plot of the detector readout at each
        position.
        """
        # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU
        if self.multi_gpu_used and self.rank != 0:
            return

        def get_images(idx):
            inputs, output = self[idx]
73 changes: 56 additions & 17 deletions src/cdtools/models/base.py
@@ -29,15 +29,11 @@
"""

import torch as t
from torch.utils import data as torchdata
from matplotlib import pyplot as plt
from matplotlib.widgets import Slider
from matplotlib import ticker
import numpy as np
import threading
import queue
import time
from scipy import io
from contextlib import contextmanager
from cdtools.tools.data import nested_dict_to_h5, h5_to_nested_dict, nested_dict_to_numpy, nested_dict_to_torch
from cdtools.reconstructors import AdamReconstructor, LBFGSReconstructor, SGDReconstructor
@@ -65,6 +61,25 @@ def __init__(self):
        self.training_history = ''
        self.epoch = 0

        # These attributes indicate to the CDIModel methods whether or not
        # multi-GPU calculations are being performed. These flags help
        # trigger multi-GPU-specific function calls (e.g., all_reduce) and
        # prevent redundant plots/reports/saves during multi-GPU use.
        rank = os.environ.get('RANK')
        world_size = os.environ.get('WORLD_SIZE')

        # Rank of the subprocess running the GPU (default: rank 0)
        self.rank = int(rank) if rank is not None else 0
        # Total number of GPUs being used.
        self.world_size = int(world_size) if world_size is not None else 1
        self.multi_gpu_used = self.world_size > 1

        # Keep track of the time each loss history point was taken relative
        # to the initialization of this model.
        self.INITIAL_TIME = time.time()
        self.loss_times = []
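The `INITIAL_TIME`/`loss_times` bookkeeping added above records, for each logged loss, the wall-clock time elapsed since model construction. The pattern in isolation (a standalone sketch, not the library class):

```python
import time


class LossClock:
    """Record the elapsed wall-clock time at which each loss is logged."""

    def __init__(self):
        # Reference point, mirroring the INITIAL_TIME attribute above
        self.initial_time = time.time()
        self.loss_history = []
        self.loss_times = []

    def log(self, loss):
        self.loss_history.append(loss)
        self.loss_times.append(time.time() - self.initial_time)


clock = LossClock()
clock.log(0.5)
clock.log(0.25)
print(len(clock.loss_times))  # one timestamp per logged loss
```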


    def from_dataset(self, dataset):
        raise NotImplementedError()

@@ -197,7 +212,9 @@ def save_to_h5(self, filename, *args):
        *args
            Accepts any additional args that model.save_results needs, for this model
        """
        return nested_dict_to_h5(filename, self.save_results(*args))
        # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU
        if not (self.multi_gpu_used and self.rank != 0):
            return nested_dict_to_h5(filename, self.save_results(*args))


    @contextmanager
@@ -219,12 +236,17 @@ def save_on_exit(self, filename, *args, exception_filename=None):
        """
        try:
            yield
            self.save_to_h5(filename, *args)
        except:
            if exception_filename is None:
                exception_filename = filename
            self.save_to_h5(exception_filename, *args)
            raise

            # Only let the Rank 0 GPU handle saving in multi-GPU
            if not (self.multi_gpu_used and self.rank != 0):
                self.save_to_h5(filename, *args)

        except Exception as e:
            if not (self.multi_gpu_used and self.rank != 0):
                if exception_filename is None:
                    exception_filename = filename
                self.save_to_h5(exception_filename, *args)
            raise e
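The `save_on_exit` context manager saves on both clean exit and on an exception; its control flow can be sketched independently of the model class (the `save` callable and file names are hypothetical stand-ins for `save_to_h5`):

```python
from contextlib import contextmanager


@contextmanager
def save_on_exit(save, filename, exception_filename=None):
    """Run the body; save on clean exit, or save to the fallback
    file and re-raise if the body throws."""
    try:
        yield
        save(filename)
    except Exception:
        save(exception_filename if exception_filename is not None
             else filename)
        raise


saved = []
with save_on_exit(saved.append, 'results.h5', exception_filename='crash.h5'):
    pass
print(saved)  # ['results.h5']
```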

    @contextmanager
    def save_on_exception(self, filename, *args):
@@ -242,13 +264,15 @@ def save_on_exception(self, filename, *args):
        *args
            Accepts any additional args that model.save_results needs, for this model
        """
        # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU
        try:
            yield
        except:
            self.save_to_h5(filename, *args)
            print('Intermediate results saved under name:')
            print(filename, flush=True)
            raise
        except Exception as e:
            if not (self.multi_gpu_used and self.rank != 0):
                self.save_to_h5(filename, *args)
                print('Intermediate results saved under name:')
                print(filename, flush=True)
            raise e


    def use_checkpoints(self, job_id, checkpoint_file_stem):
@@ -270,6 +294,10 @@ def skip_computation(self):
        return False

    def save_checkpoint(self, *args, checkpoint_file=None):
        # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU
        if self.multi_gpu_used and self.rank != 0:
            return

        checkpoint = self.save_results(*args)
        if (hasattr(self, 'current_optimizer')
                and self.current_optimizer is not None):
@@ -578,6 +606,10 @@ def inspect(self, dataset=None, update=True):
            Whether to update existing plots or plot new ones

        """
        # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU
        if self.multi_gpu_used and self.rank != 0:
            return

        # We find or create all the figures
        first_update = False
        if update and hasattr(self, 'figs') and self.figs:
@@ -660,7 +692,10 @@ def save_figures(self, prefix='', extension='.pdf'):
        extension : str
            Default is '.pdf', the file extension to save with.
        """
        # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU
        if self.multi_gpu_used and self.rank != 0:
            return

        if hasattr(self, 'figs') and self.figs:
            figs = self.figs
        else:
@@ -688,6 +723,10 @@ def compare(self, dataset, logarithmic=False):
            Whether to plot the diffraction on a logarithmic scale
        """
        # FOR MULTI-GPU: Only run this method if it's called by the rank 0 GPU
        if self.multi_gpu_used and self.rank != 0:
            return

        fig, axes = plt.subplots(1, 3, figsize=(12, 5.3))
        fig.tight_layout(rect=[0.02, 0.09, 0.98, 0.96])
        axslider = plt.axes([0.15, 0.06, 0.75, 0.03])