From ca50bbf9ba9194465bf704fa7f7a711c33c5985b Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 19 May 2026 13:35:40 -0400 Subject: [PATCH 1/2] Add license to framework sdist builds (#3002) Signed-off-by: ksivamani --- transformer_engine/jax/setup.py | 8 ++++++++ transformer_engine/pytorch/setup.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/transformer_engine/jax/setup.py b/transformer_engine/jax/setup.py index 2d25242825..678062df91 100644 --- a/transformer_engine/jax/setup.py +++ b/transformer_engine/jax/setup.py @@ -42,6 +42,11 @@ shutil.rmtree(build_tools_copy) shutil.copytree(build_tools_dir, build_tools_copy) +license_src = current_file_path.parent.parent / "LICENSE" +license_dst = current_file_path / "LICENSE" +if license_src.is_file(): + shutil.copyfile(license_src, license_dst) + from build_tools.build_ext import get_build_ext from build_tools.utils import copy_common_headers, min_python_version_str @@ -131,7 +136,10 @@ def get_cuda_major_version() -> int: python_requires=f">={min_python_version_str()}", install_requires=install_requires, tests_require=test_requirements(), + license_files=("LICENSE",), ) if any(x in sys.argv for x in (".", "sdist", "bdist_wheel")): shutil.rmtree(common_headers_dir) shutil.rmtree("build_tools") + if license_dst.is_file(): + license_dst.unlink() diff --git a/transformer_engine/pytorch/setup.py b/transformer_engine/pytorch/setup.py index 99f6a99efa..593a3169d9 100644 --- a/transformer_engine/pytorch/setup.py +++ b/transformer_engine/pytorch/setup.py @@ -43,6 +43,11 @@ shutil.rmtree(build_tools_copy) shutil.copytree(build_tools_dir, build_tools_copy) +license_src = current_file_path.parent.parent / "LICENSE" +license_dst = current_file_path / "LICENSE" +if license_src.is_file(): + shutil.copyfile(license_src, license_dst) + from build_tools.build_ext import get_build_ext from build_tools.utils import copy_common_headers, min_python_version_str @@ -177,7 +182,10 @@ def run(self): python_requires=f">={min_python_version_str()}", install_requires=install_requires, tests_require=test_requirements(), + license_files=("LICENSE",), ) if any(x in sys.argv for x in (".", "sdist", "bdist_wheel")): shutil.rmtree(common_headers_dir) shutil.rmtree("build_tools") + if license_dst.is_file(): + license_dst.unlink() From b629e6e54cb3197927c0799c5aeca7537adebe68 Mon Sep 17 00:00:00 2001 From: Shaurya Singh Date: Tue, 19 May 2026 11:02:55 -0700 Subject: [PATCH 2/2] docs: fix comm GEMM overlap README typos (#3010) Signed-off-by: LeSingh1 --- examples/pytorch/comm_gemm_overlap/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/comm_gemm_overlap/README.md b/examples/pytorch/comm_gemm_overlap/README.md index fc8458844b..b7ecb2d069 100644 --- a/examples/pytorch/comm_gemm_overlap/README.md +++ b/examples/pytorch/comm_gemm_overlap/README.md @@ -6,7 +6,7 @@ - `CUDA_DEVICE_MAX_CONNECTIONS=1` must be enabled in the environment. - For best performance, point-to-point communication via _CUDA Multicast_ needs CUDA Toolkit 12.0+ and CUDA driver 535+ on devices with compute capability 9.0 or newer. -- Devices older than compute capability 9.0 require `UB_SKIPMC=1` in the environment in order fall +- Devices older than compute capability 9.0 require `UB_SKIPMC=1` in the environment in order to fall back on a less performant implementation based on CUDA Inter-Process Communication (IPC) handles. ## Examples @@ -22,7 +22,7 @@ $ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) te_layer_with_ov # [rank0:node0] |-- Created tensor-parallel group: [0, 1, 2, 3, 4, 5, 6, 7] # !!! [UB] Create UbufP2PCommOverlap Communicator # UB_TIMEOUT is set to 110 sec, 217800000000 cycles, freq: 1980000khz -# MC initialized succesfully, window size = 549755813888 +# MC initialized successfully, window size = 549755813888 # !!! [UBP2P] Register UBuf 1 # !!! [UBP2P] Register UBuf 2 # !!! [UBP2P] Register UBuf 3 @@ -66,7 +66,7 @@ $ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) te_layer_with_ov ``` ### Single node, mixed data- and tensor-parallel LayerNormMLP: -Uses `torch.nn.parallel.DistributedDataParallel` for replicatin the model across 2 tensor-parallel +Uses `torch.nn.parallel.DistributedDataParallel` for replicating the model across 2 tensor-parallel groups in a single node. ```bash @@ -81,7 +81,7 @@ $ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) te_layer_with_ov # [rank2:node0] |-- Created data-parallel group: [2, 6] # !!! [UB] Create UbufP2PCommOverlap Communicator # UB_TIMEOUT is set to 110 sec, 217800000000 cycles, freq: 1980000khz -# MC initialized succesfully, window size = 549755813888 +# MC initialized successfully, window size = 549755813888 # !!! [UBP2P] Register UBuf 1 # !!! [UBP2P] Register UBuf 2 # !!! [UBP2P] Register UBuf 3