From 018186b1fd776b99f604a71e6be9aa176e2d1022 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Mon, 2 Feb 2026 18:28:50 -0800 Subject: [PATCH 01/10] Refactor CUDA workflow to better use test-infra --- .github/workflows/linux_cuda.yml | 190 ++++++++++--------------------- 1 file changed, 63 insertions(+), 127 deletions(-) diff --git a/.github/workflows/linux_cuda.yml b/.github/workflows/linux_cuda.yml index f3670de49..7bed3d21a 100644 --- a/.github/workflows/linux_cuda.yml +++ b/.github/workflows/linux_cuda.yml @@ -6,133 +6,69 @@ on: - main pull_request: -# Use TorchBench's docker image which has all basic dependencies. -env: - DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest" - -jobs: - pr-test: - # AWS A10G GPU instance label: linux.g5.4xlarge.nvidia.gpu - # OS version: Amazon Linux 2 - runs-on: linux.g5.4xlarge.nvidia.gpu - timeout-minutes: 180 # 3 hours - steps: - - name: Checkout Kineto - uses: actions/checkout@v5 - with: - path: kineto - submodules: recursive - - - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ env.DOCKER_IMAGE }} - - - name: Install NVIDIA Driver, docker runtime, set GPU_FLAG - uses: pytorch/test-infra/.github/actions/setup-nvidia@main - - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - - - name: Build libkineto (static and shared library) - run: | - container_name=$(docker run \ - ${GPU_FLAG:-} \ - --tty \ - --detach \ - --shm-size=32gb \ - -v "${PWD}/kineto:/kineto" \ - -w / \ - "${{ env.DOCKER_IMAGE }}" - ) - echo "Container name: ${container_name}" - - docker exec -t -w "/kineto" "${container_name}" 
bash -c " - set -eux - mkdir build_static - mkdir build_shared - pip install cmake - " - - docker exec -t -w "/kineto/build_static" "${container_name}" bash -c " - set -eux - cmake -DKINETO_LIBRARY_TYPE=static ../libkineto/ - make -j - " - - docker exec -t -w "/kineto/build_shared" "${container_name}" bash -c " - set -eux - cmake -DKINETO_LIBRARY_TYPE=shared ../libkineto/ - make -j - " - - - name: Run libkineto tests - run: | - container_name=$(docker ps -lq) - docker exec -t -w "/kineto/build_static" "${container_name}" bash -c "make test" - - - name: Clone PyTorch - run: | - container_name=$(docker ps -lq) - docker exec -t -w "/" "${container_name}" bash -c " - set -eux - git clone --recursive --branch viable/strict https://github.com/pytorch/pytorch.git - " - - - name: Replace PyTorch's Kineto with PR version - run: | - container_name=$(docker ps -lq) - docker exec -t -w "/pytorch" "${container_name}" bash -c " - set -eux - rm -rf third_party/kineto - ln -s /kineto third_party/kineto - " - - - name: Build PyTorch from source - run: | - container_name=$(docker ps -lq) - docker exec -t -w "/pytorch" "${container_name}" bash -c " - set -eux - pip install -r requirements.txt - export USE_CUDA=1 - export BUILD_TEST=1 - python setup.py develop - " - - - name: Run PyTorch profiler tests - run: | - container_name=$(docker ps -lq) - docker exec -t -w "/pytorch" "${container_name}" bash -c " - set -eux - # TODO: Dynamically add/remove tests to the exclusion list based on their - # status on trunk instead of maintaining a hardcoded list of known failures. - # This will prevent the list from becoming stale as tests get fixed upstream. 
- python -m pytest test/profiler/ -v \ - --deselect=test/profiler/test_memory_profiler.py::TestDataFlow::test_data_flow_graph_complicated \ - --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_sequential_fwd_bwd \ - --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_simple_fwd_bwd \ - --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_simple_fwd_bwd_step \ - --deselect=test/profiler/test_profiler.py::TestProfiler::test_kineto \ - --deselect=test/profiler/test_profiler.py::TestProfiler::test_user_annotation \ - --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_fuzz_symbolize \ - --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_debug_autotuner \ - --deselect=test/profiler/test_torch_tidy.py::TestTorchTidyProfiler::test_tensorimpl_invalidation_scalar_args - " - - - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main - if: always() - concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true + +jobs: + pr-test: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + job-name: linux-cuda-build-and-test + # AWS A10G GPU instance + runner: linux.g5.4xlarge.nvidia.gpu + timeout: 180 + submodules: recursive + gpu-arch-type: cuda + gpu-arch-version: "12.6" + script: | + set -eux + + # Build libkineto (static and shared library) + pip install cmake + mkdir -p build_static build_shared + + pushd build_static + cmake -DKINETO_LIBRARY_TYPE=static ../libkineto/ + make -j + popd + + pushd build_shared + cmake -DKINETO_LIBRARY_TYPE=shared ../libkineto/ + make -j + popd + + # Run libkineto tests + pushd build_static + make test + popd + + # Clone PyTorch and replace its Kineto with PR version + PYTORCH_DIR="${RUNNER_TEMP}/pytorch" + git clone 
--recursive --branch viable/strict https://github.com/pytorch/pytorch.git "${PYTORCH_DIR}" + + pushd "${PYTORCH_DIR}" + rm -rf third_party/kineto + ln -s "${GITHUB_WORKSPACE}/${GITHUB_REPOSITORY}" third_party/kineto + + # Build PyTorch from source + pip install -r requirements.txt + export USE_CUDA=1 + export BUILD_TEST=1 + python setup.py develop + + # Run PyTorch profiler tests + # TODO: Dynamically add/remove tests to the exclusion list based on their + # status on trunk instead of maintaining a hardcoded list of known failures. + # This will prevent the list from becoming stale as tests get fixed upstream. + python -m pytest test/profiler/ -v \ + --deselect=test/profiler/test_memory_profiler.py::TestDataFlow::test_data_flow_graph_complicated \ + --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_sequential_fwd_bwd \ + --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_simple_fwd_bwd \ + --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_simple_fwd_bwd_step \ + --deselect=test/profiler/test_profiler.py::TestProfiler::test_kineto \ + --deselect=test/profiler/test_profiler.py::TestProfiler::test_user_annotation \ + --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_fuzz_symbolize \ + --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_debug_autotuner \ + --deselect=test/profiler/test_torch_tidy.py::TestTorchTidyProfiler::test_tensorimpl_invalidation_scalar_args + popd From bc2aaa0acc3bea1934bebd1ad9e31fe4cf14bfc1 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Mon, 2 Feb 2026 18:50:05 -0800 Subject: [PATCH 02/10] Add checkout; upgrade pip --- .github/workflows/linux_cuda.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/linux_cuda.yml b/.github/workflows/linux_cuda.yml index 7bed3d21a..8a12b7321 100644 --- a/.github/workflows/linux_cuda.yml +++ 
b/.github/workflows/linux_cuda.yml @@ -18,12 +18,18 @@ jobs: # AWS A10G GPU instance runner: linux.g5.4xlarge.nvidia.gpu timeout: 180 + # Checkout the Kineto repo at the PR ref with submodules + repository: ${{ github.repository }} + ref: ${{ github.ref }} submodules: recursive gpu-arch-type: cuda gpu-arch-version: "12.6" script: | set -eux + # Upgrade pip + python -m pip install --upgrade pip + # Build libkineto (static and shared library) pip install cmake mkdir -p build_static build_shared From f524b30d0da96d581d023e5ba3becfa93fed77b3 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Tue, 3 Feb 2026 06:15:06 -0800 Subject: [PATCH 03/10] Ensure cmake is minimum version --- .github/workflows/linux_cuda.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux_cuda.yml b/.github/workflows/linux_cuda.yml index 8a12b7321..a771f3c9a 100644 --- a/.github/workflows/linux_cuda.yml +++ b/.github/workflows/linux_cuda.yml @@ -30,8 +30,10 @@ jobs: # Upgrade pip python -m pip install --upgrade pip + # Ensure cmake is minimum version Kineto requires + pip install cmake>=3.22 + # Build libkineto (static and shared library) - pip install cmake mkdir -p build_static build_shared pushd build_static From 4fe47861c405b95411335f3d6d874fbde8f3454a Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Tue, 3 Feb 2026 06:30:34 -0800 Subject: [PATCH 04/10] Conda install cmake? 
--- .github/workflows/linux_cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux_cuda.yml b/.github/workflows/linux_cuda.yml index a771f3c9a..628ac283a 100644 --- a/.github/workflows/linux_cuda.yml +++ b/.github/workflows/linux_cuda.yml @@ -31,7 +31,7 @@ jobs: python -m pip install --upgrade pip # Ensure cmake is minimum version Kineto requires - pip install cmake>=3.22 + conda install -y cmake>=3.22 # Build libkineto (static and shared library) mkdir -p build_static build_shared From 68af7bb01339cf32dc080d2eaa6ac04069d0229a Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Tue, 3 Feb 2026 06:58:54 -0800 Subject: [PATCH 05/10] Progress markers --- .github/workflows/linux_cuda.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/linux_cuda.yml b/.github/workflows/linux_cuda.yml index 628ac283a..8ccd92776 100644 --- a/.github/workflows/linux_cuda.yml +++ b/.github/workflows/linux_cuda.yml @@ -29,9 +29,11 @@ jobs: # Upgrade pip python -m pip install --upgrade pip + echo "Installed pip version: $(python -m pip --version)" # Ensure cmake is minimum version Kineto requires conda install -y cmake>=3.22 + echo "Installed cmake version: $(cmake --version)" # Build libkineto (static and shared library) mkdir -p build_static build_shared @@ -40,30 +42,36 @@ jobs: cmake -DKINETO_LIBRARY_TYPE=static ../libkineto/ make -j popd + echo "Compiled static libkineto" pushd build_shared cmake -DKINETO_LIBRARY_TYPE=shared ../libkineto/ make -j popd + echo "Compiled shared libkineto" # Run libkineto tests pushd build_static make test popd + echo "Ran static libkineto tests" # Clone PyTorch and replace its Kineto with PR version PYTORCH_DIR="${RUNNER_TEMP}/pytorch" git clone --recursive --branch viable/strict https://github.com/pytorch/pytorch.git "${PYTORCH_DIR}" + echo "Cloned PyTorch" pushd "${PYTORCH_DIR}" rm -rf third_party/kineto ln -s "${GITHUB_WORKSPACE}/${GITHUB_REPOSITORY}" third_party/kineto + echo 
"Linked PR version of Kineto to PyTorch" # Build PyTorch from source pip install -r requirements.txt export USE_CUDA=1 export BUILD_TEST=1 python setup.py develop + echo "Built PyTorch from source" # Run PyTorch profiler tests # TODO: Dynamically add/remove tests to the exclusion list based on their @@ -80,3 +88,4 @@ jobs: --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_debug_autotuner \ --deselect=test/profiler/test_torch_tidy.py::TestTorchTidyProfiler::test_tensorimpl_invalidation_scalar_args popd + echo "Ran PyTorch profiler tests" From ae2dc78d54a5188ebb45f2627bfcddcc207eeb85 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Tue, 3 Feb 2026 07:00:50 -0800 Subject: [PATCH 06/10] Better progress markers --- .github/workflows/linux_cuda.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/linux_cuda.yml b/.github/workflows/linux_cuda.yml index 8ccd92776..d6411dd13 100644 --- a/.github/workflows/linux_cuda.yml +++ b/.github/workflows/linux_cuda.yml @@ -29,11 +29,15 @@ jobs: # Upgrade pip python -m pip install --upgrade pip + echo "=================================================" echo "Installed pip version: $(python -m pip --version)" + echo "=================================================" # Ensure cmake is minimum version Kineto requires conda install -y cmake>=3.22 + echo "===========================================" echo "Installed cmake version: $(cmake --version)" + echo "===========================================" # Build libkineto (static and shared library) mkdir -p build_static build_shared @@ -42,36 +46,48 @@ jobs: cmake -DKINETO_LIBRARY_TYPE=static ../libkineto/ make -j popd + echo "=========================" echo "Compiled static libkineto" + echo "=========================" pushd build_shared cmake -DKINETO_LIBRARY_TYPE=shared ../libkineto/ make -j popd + echo "=========================" echo "Compiled shared libkineto" + echo "=========================" # Run libkineto 
tests pushd build_static make test popd + echo "===========================" echo "Ran static libkineto tests" + echo "===========================" # Clone PyTorch and replace its Kineto with PR version PYTORCH_DIR="${RUNNER_TEMP}/pytorch" git clone --recursive --branch viable/strict https://github.com/pytorch/pytorch.git "${PYTORCH_DIR}" + echo "==============" echo "Cloned PyTorch" + echo "==============" pushd "${PYTORCH_DIR}" rm -rf third_party/kineto ln -s "${GITHUB_WORKSPACE}/${GITHUB_REPOSITORY}" third_party/kineto + echo "======================================" echo "Linked PR version of Kineto to PyTorch" + echo "======================================" # Build PyTorch from source pip install -r requirements.txt export USE_CUDA=1 export BUILD_TEST=1 python setup.py develop + echo "=========================" echo "Built PyTorch from source" + echo "=========================" # Run PyTorch profiler tests # TODO: Dynamically add/remove tests to the exclusion list based on their @@ -88,4 +104,6 @@ jobs: --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_debug_autotuner \ --deselect=test/profiler/test_torch_tidy.py::TestTorchTidyProfiler::test_tensorimpl_invalidation_scalar_args popd + echo "==========================" echo "Ran PyTorch profiler tests" + echo "==========================" From eebbe4dc19da45463f411746fe746092664cdae1 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Tue, 3 Feb 2026 09:29:00 -0800 Subject: [PATCH 07/10] Plain pytorch directory --- .github/workflows/linux_cuda.yml | 43 ++++++++++---------------------- 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/.github/workflows/linux_cuda.yml b/.github/workflows/linux_cuda.yml index d6411dd13..3c5f178d7 100644 --- a/.github/workflows/linux_cuda.yml +++ b/.github/workflows/linux_cuda.yml @@ -27,17 +27,15 @@ jobs: script: | set -eux + echo "====: Working directory: $(pwd)" + # Upgrade pip python -m pip install --upgrade pip - echo 
"=================================================" - echo "Installed pip version: $(python -m pip --version)" - echo "=================================================" + echo "====: Installed pip version: $(python -m pip --version)" # Ensure cmake is minimum version Kineto requires conda install -y cmake>=3.22 - echo "===========================================" - echo "Installed cmake version: $(cmake --version)" - echo "===========================================" + echo "====: Installed cmake version: $(cmake --version)" # Build libkineto (static and shared library) mkdir -p build_static build_shared @@ -46,48 +44,35 @@ jobs: cmake -DKINETO_LIBRARY_TYPE=static ../libkineto/ make -j popd - echo "=========================" - echo "Compiled static libkineto" - echo "=========================" + echo "====: Compiled static libkineto" pushd build_shared cmake -DKINETO_LIBRARY_TYPE=shared ../libkineto/ make -j popd - echo "=========================" - echo "Compiled shared libkineto" - echo "=========================" + echo "====: Compiled shared libkineto" # Run libkineto tests pushd build_static make test popd - echo "===========================" - echo "Ran static libkineto tests" - echo "===========================" + echo "====: Ran static libkineto tests" # Clone PyTorch and replace its Kineto with PR version - PYTORCH_DIR="${RUNNER_TEMP}/pytorch" - git clone --recursive --branch viable/strict https://github.com/pytorch/pytorch.git "${PYTORCH_DIR}" - echo "==============" - echo "Cloned PyTorch" - echo "==============" + git clone --recursive --branch viable/strict https://github.com/pytorch/pytorch.git + echo "====: Cloned PyTorch" - pushd "${PYTORCH_DIR}" + pushd pytorch rm -rf third_party/kineto ln -s "${GITHUB_WORKSPACE}/${GITHUB_REPOSITORY}" third_party/kineto - echo "======================================" - echo "Linked PR version of Kineto to PyTorch" - echo "======================================" + echo "====: Linked PR version of Kineto to PyTorch" 
# Build PyTorch from source pip install -r requirements.txt export USE_CUDA=1 export BUILD_TEST=1 python setup.py develop - echo "=========================" - echo "Built PyTorch from source" - echo "=========================" + echo "====: Built PyTorch from source" # Run PyTorch profiler tests # TODO: Dynamically add/remove tests to the exclusion list based on their @@ -104,6 +89,4 @@ jobs: --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_debug_autotuner \ --deselect=test/profiler/test_torch_tidy.py::TestTorchTidyProfiler::test_tensorimpl_invalidation_scalar_args popd - echo "==========================" - echo "Ran PyTorch profiler tests" - echo "==========================" + echo "====: Ran PyTorch profiler tests" From 30d6d0a7899cd773b85b4344d31adcbe898bec5b Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Tue, 3 Feb 2026 10:24:20 -0800 Subject: [PATCH 08/10] Relative directory --- .github/workflows/linux_cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux_cuda.yml b/.github/workflows/linux_cuda.yml index 3c5f178d7..c2f396189 100644 --- a/.github/workflows/linux_cuda.yml +++ b/.github/workflows/linux_cuda.yml @@ -64,7 +64,7 @@ jobs: pushd pytorch rm -rf third_party/kineto - ln -s "${GITHUB_WORKSPACE}/${GITHUB_REPOSITORY}" third_party/kineto + ln -s ../kineto third_party/kineto echo "====: Linked PR version of Kineto to PyTorch" # Build PyTorch from source From aa1eca7ec8d55abbe453a8906fbd681abbd9107c Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Fri, 6 Feb 2026 13:52:45 -0800 Subject: [PATCH 09/10] Fix directory management --- .github/workflows/linux_cuda.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linux_cuda.yml b/.github/workflows/linux_cuda.yml index c2f396189..c51210cba 100644 --- a/.github/workflows/linux_cuda.yml +++ b/.github/workflows/linux_cuda.yml @@ -58,14 +58,18 @@ jobs: popd echo "====: Ran static libkineto 
tests" + # Save kineto directory path before cloning PyTorch + KINETO_DIR=$(pwd) + echo "====: Kineto directory: ${KINETO_DIR}" + # Clone PyTorch and replace its Kineto with PR version git clone --recursive --branch viable/strict https://github.com/pytorch/pytorch.git echo "====: Cloned PyTorch" pushd pytorch rm -rf third_party/kineto - ln -s ../kineto third_party/kineto - echo "====: Linked PR version of Kineto to PyTorch" + ln -s "${KINETO_DIR}" third_party/kineto + echo "====: Linked PR version of Kineto to PyTorch (${KINETO_DIR} -> third_party/kineto)" # Build PyTorch from source pip install -r requirements.txt From bce28f119171f4458ca6bb6be7733c8bbead710c Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Fri, 6 Feb 2026 16:40:13 -0800 Subject: [PATCH 10/10] Install pytest --- .github/workflows/linux_cuda.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux_cuda.yml b/.github/workflows/linux_cuda.yml index c51210cba..d15a95b1d 100644 --- a/.github/workflows/linux_cuda.yml +++ b/.github/workflows/linux_cuda.yml @@ -29,14 +29,14 @@ jobs: echo "====: Working directory: $(pwd)" + # Ensure cmake is minimum version Kineto requires (spec is quoted: an unquoted >= is a shell redirect) + conda install -y "cmake>=3.27" + echo "====: Installed cmake version: $(cmake --version)" + # Upgrade pip python -m pip install --upgrade pip echo "====: Installed pip version: $(python -m pip --version)" - # Ensure cmake is minimum version Kineto requires - conda install -y cmake>=3.22 - echo "====: Installed cmake version: $(cmake --version)" - # Build libkineto (static and shared library) mkdir -p build_static build_shared @@ -82,6 +82,7 @@ jobs: # TODO: Dynamically add/remove tests to the exclusion list based on their # status on trunk instead of maintaining a hardcoded list of known failures. # This will prevent the list from becoming stale as tests get fixed upstream.
+ pip install pytest python -m pytest test/profiler/ -v \ --deselect=test/profiler/test_memory_profiler.py::TestDataFlow::test_data_flow_graph_complicated \ --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_sequential_fwd_bwd \