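Update test_distributed.py: run NCCL test workers via multiprocessing.Pool (map_async with a 60s timeout) instead of manually started mp.Process instances; workers now return True/False instead of calling sys.exit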

Mark Saroufim · Mark Saroufim · commit 9629cfe3ab2d · 2025-09-23T16:55:14.000-07:00
diff --git a/scripts/test_distributed.py b/scripts/test_distributed.py
@@ -1,80 +1,82 @@
-import os
-import signal
-import sys
-
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
-
+from multiprocessing import Pool
+import os
+import signal
+import sys
 
 def timeout_handler(signum, frame):
-    print("✗ TIMEOUT: Process hung")
+    print('✗ TIMEOUT: Process hung')
     sys.exit(1)
 
-
-def test_worker(rank, world_size, master_port):
+def test_worker(args):
+    rank, world_size, master_port = args
     try:
-        os.environ["MASTER_ADDR"] = "127.0.0.1"
-        os.environ["MASTER_PORT"] = str(master_port)
-        os.environ["RANK"] = str(rank)
-        os.environ["WORLD_SIZE"] = str(world_size)
-
+        os.environ['MASTER_ADDR'] = '127.0.0.1'
+        os.environ['MASTER_PORT'] = str(master_port)
+        os.environ['RANK'] = str(rank)
+        os.environ['WORLD_SIZE'] = str(world_size)
+        
         signal.signal(signal.SIGALRM, timeout_handler)
         signal.alarm(30)
-
-        print(f"Rank {rank}: Init NCCL...")
-        dist.init_process_group("nccl", rank=rank, world_size=world_size)
+        
+        print(f'Rank {rank}: Init NCCL...')
+        dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
         signal.alarm(0)
-
-        device = torch.device(f"cuda:{rank}")
+        
+        device = torch.device(f'cuda:{rank}')
         tensor = torch.ones(100, device=device) * rank
-
+        
         signal.alarm(15)
         dist.all_reduce(tensor)
         signal.alarm(0)
-
-        print(f"✓ Rank {rank}: sum = {tensor[0].item()}")
+        
+        print(f'✓ Rank {rank}: sum = {tensor[0].item()}')
         dist.destroy_process_group()
-
+        return True
+        
     except Exception as e:
         signal.alarm(0)
-        print(f"✗ Rank {rank}: {e}")
-        sys.exit(1)
-
+        print(f'✗ Rank {rank}: {e}')
+        return False
 
 def main():
     num_gpus = torch.cuda.device_count()
-    print(f"Testing {num_gpus} GPUs - 4 rounds")
-
+    print(f'Testing {num_gpus} GPUs - 4 rounds')
+    
     for round_num in range(4):
-        print(f"=== ROUND {round_num + 1} ===")
+        print(f'=== ROUND {round_num + 1} ===')
         master_port = 29500 + round_num
-
-        mp.set_start_method("spawn", force=True)
-        processes = []
-
-        for rank in range(num_gpus):
-            p = mp.Process(target=test_worker, args=(rank, num_gpus, master_port))
-            p.start()
-            processes.append(p)
-
-        for _, p in enumerate(processes):
-            p.join(timeout=60)
-            if p.exitcode != 0:
-                print(f"✗ ROUND {round_num + 1} FAILED")
-                for rp in processes:
-                    if rp.is_alive():
-                        rp.terminate()
+        
+        mp.set_start_method('spawn', force=True)
+        
+        # Prepare worker arguments
+        worker_args = [(rank, num_gpus, master_port) for rank in range(num_gpus)]
+        
+        with Pool(processes=num_gpus) as pool:
+            try:
+                # Use map_async with timeout
+                result = pool.map_async(test_worker, worker_args)
+                results = result.get(timeout=60)
+                
+                # Check if all workers succeeded
+                if not all(results):
+                    print(f'✗ ROUND {round_num + 1} FAILED')
+                    sys.exit(1)
+                    
+            except mp.TimeoutError:
+                print(f'✗ ROUND {round_num + 1} HUNG')
+                pool.terminate()
+                pool.join()
                 sys.exit(1)
-            elif p.is_alive():
-                print(f"✗ ROUND {round_num + 1} HUNG")
-                p.terminate()
+            except Exception as e:
+                print(f'✗ ROUND {round_num + 1} ERROR: {e}')
                 sys.exit(1)
+        
+        print(f'✓ ROUND {round_num + 1} PASSED')
+    
+    print('✓ ALL ROUNDS PASSED')
 
-        print(f"✓ ROUND {round_num + 1} PASSED")
-
-    print("✓ ALL ROUNDS PASSED")
-
-
-if __name__ == "__main__":
-    main()
+if __name__ == '__main__':
+    main()