.github/workflows/linux_cuda.yml: 213 changes (86 additions, 127 deletions)
@@ -6,133 +6,92 @@ on:
      - main
  pull_request:

# Use TorchBench's docker image which has all basic dependencies.
env:
  DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest"

jobs:
  pr-test:
    # AWS A10G GPU instance label: linux.g5.4xlarge.nvidia.gpu
    # OS version: Amazon Linux 2
    runs-on: linux.g5.4xlarge.nvidia.gpu
    timeout-minutes: 180 # 3 hours
    steps:
      - name: Checkout Kineto
        uses: actions/checkout@v5
        with:
          path: kineto
          submodules: recursive

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ env.DOCKER_IMAGE }}

      - name: Install NVIDIA Driver, docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main

      - name: Get env vars
        run: |
          echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW
          echo HOME = $HOME
          echo GITHUB_ACTION = $GITHUB_ACTION
          echo GITHUB_ACTIONS = $GITHUB_ACTIONS
          echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY
          echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME
          echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH
          echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE
          echo GITHUB_SHA = $GITHUB_SHA
          echo GITHUB_REF = $GITHUB_REF

      - name: Build libkineto (static and shared library)
        run: |
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            --tty \
            --detach \
            --shm-size=32gb \
            -v "${PWD}/kineto:/kineto" \
            -w / \
            "${{ env.DOCKER_IMAGE }}"
          )
          echo "Container name: ${container_name}"

          docker exec -t -w "/kineto" "${container_name}" bash -c "
            set -eux
            mkdir build_static
            mkdir build_shared
            pip install cmake
          "

          docker exec -t -w "/kineto/build_static" "${container_name}" bash -c "
            set -eux
            cmake -DKINETO_LIBRARY_TYPE=static ../libkineto/
            make -j
          "

          docker exec -t -w "/kineto/build_shared" "${container_name}" bash -c "
            set -eux
            cmake -DKINETO_LIBRARY_TYPE=shared ../libkineto/
            make -j
          "

      - name: Run libkineto tests
        run: |
          container_name=$(docker ps -lq)
          docker exec -t -w "/kineto/build_static" "${container_name}" bash -c "make test"

      - name: Clone PyTorch
        run: |
          container_name=$(docker ps -lq)
          docker exec -t -w "/" "${container_name}" bash -c "
            set -eux
            git clone --recursive --branch viable/strict https://github.com/pytorch/pytorch.git
          "

      - name: Replace PyTorch's Kineto with PR version
        run: |
          container_name=$(docker ps -lq)
          docker exec -t -w "/pytorch" "${container_name}" bash -c "
            set -eux
            rm -rf third_party/kineto
            ln -s /kineto third_party/kineto
          "

      - name: Build PyTorch from source
        run: |
          container_name=$(docker ps -lq)
          docker exec -t -w "/pytorch" "${container_name}" bash -c "
            set -eux
            pip install -r requirements.txt
            export USE_CUDA=1
            export BUILD_TEST=1
            python setup.py develop
          "

      - name: Run PyTorch profiler tests
        run: |
          container_name=$(docker ps -lq)
          docker exec -t -w "/pytorch" "${container_name}" bash -c "
            set -eux
            # TODO: Dynamically add/remove tests to the exclusion list based on their
            # status on trunk instead of maintaining a hardcoded list of known failures.
            # This will prevent the list from becoming stale as tests get fixed upstream.
            python -m pytest test/profiler/ -v \
              --deselect=test/profiler/test_memory_profiler.py::TestDataFlow::test_data_flow_graph_complicated \
              --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_sequential_fwd_bwd \
              --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_simple_fwd_bwd \
              --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_simple_fwd_bwd_step \
              --deselect=test/profiler/test_profiler.py::TestProfiler::test_kineto \
              --deselect=test/profiler/test_profiler.py::TestProfiler::test_user_annotation \
              --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_fuzz_symbolize \
              --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_debug_autotuner \
              --deselect=test/profiler/test_torch_tidy.py::TestTorchTidyProfiler::test_tensorimpl_invalidation_scalar_args
          "

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always()

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
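# Pushing new commits to the same pull request cancels any in-flight run of this
# workflow for that PR (non-PR events fall back to grouping by commit SHA).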

jobs:
  pr-test:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      job-name: linux-cuda-build-and-test
      # AWS A10G GPU instance
      runner: linux.g5.4xlarge.nvidia.gpu
      timeout: 180
      # Checkout the Kineto repo at the PR ref with submodules
      repository: ${{ github.repository }}
      ref: ${{ github.ref }}
      submodules: recursive
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
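      # linux_job_v2.yml (a reusable workflow in pytorch/test-infra) provisions the
      # GPU runner and CUDA container and then runs `script`; the script below
      # assumes it starts inside the Kineto checkout.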
      script: |
        set -eux

        echo "====: Working directory: $(pwd)"

        # Ensure cmake meets the minimum version Kineto requires; quote the spec so
        # the shell does not treat '>' as output redirection.
        conda install -y "cmake>=3.27"
        echo "====: Installed cmake version: $(cmake --version)"

        # Upgrade pip
        python -m pip install --upgrade pip
        echo "====: Installed pip version: $(python -m pip --version)"

        # Build libkineto (static and shared library)
        mkdir -p build_static build_shared

        pushd build_static
        cmake -DKINETO_LIBRARY_TYPE=static ../libkineto/
        make -j
        popd
        echo "====: Compiled static libkineto"

        pushd build_shared
        cmake -DKINETO_LIBRARY_TYPE=shared ../libkineto/
        make -j
        popd
        echo "====: Compiled shared libkineto"

        # Run libkineto tests
        pushd build_static
        make test
        popd
        echo "====: Ran static libkineto tests"

        # Save kineto directory path before cloning PyTorch
        KINETO_DIR=$(pwd)
        echo "====: Kineto directory: ${KINETO_DIR}"

        # Clone PyTorch and replace its Kineto with PR version
        git clone --recursive --branch viable/strict https://github.com/pytorch/pytorch.git
        echo "====: Cloned PyTorch"

        pushd pytorch
        rm -rf third_party/kineto
        ln -s "${KINETO_DIR}" third_party/kineto
        echo "====: Linked PR version of Kineto to PyTorch (${KINETO_DIR} -> third_party/kineto)"

        # Build PyTorch from source
        pip install -r requirements.txt
        export USE_CUDA=1
        export BUILD_TEST=1
        python setup.py develop
        echo "====: Built PyTorch from source"

        # Run PyTorch profiler tests
        # TODO: Dynamically add tests to (and remove them from) the exclusion list
        # based on their status on trunk instead of maintaining a hardcoded list of
        # known failures. This will prevent the list from becoming stale as tests get
        # fixed upstream.
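        # One possible shape for that TODO (hypothetical, not wired up here): keep the
        # known failures in a file in the Kineto repo, e.g.
        # .github/ci/profiler_known_failures.txt (assumed path), one pytest node ID per
        # line, and expand it into --deselect flags:
        #
        #   DESELECT_ARGS=()
        #   while IFS= read -r test_id; do
        #     # Skip blank lines and comment lines in the failure list.
        #     [[ -z "${test_id}" || "${test_id}" == \#* ]] && continue
        #     DESELECT_ARGS+=("--deselect=${test_id}")
        #   done < "${KINETO_DIR}/.github/ci/profiler_known_failures.txt"
        #   python -m pytest test/profiler/ -v "${DESELECT_ARGS[@]}"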
        pip install pytest
        python -m pytest test/profiler/ -v \
          --deselect=test/profiler/test_memory_profiler.py::TestDataFlow::test_data_flow_graph_complicated \
          --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_sequential_fwd_bwd \
          --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_simple_fwd_bwd \
          --deselect=test/profiler/test_memory_profiler.py::TestMemoryProfilerE2E::test_categories_e2e_simple_fwd_bwd_step \
          --deselect=test/profiler/test_profiler.py::TestProfiler::test_kineto \
          --deselect=test/profiler/test_profiler.py::TestProfiler::test_user_annotation \
          --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_fuzz_symbolize \
          --deselect=test/profiler/test_profiler.py::TestExperimentalUtils::test_profiler_debug_autotuner \
          --deselect=test/profiler/test_torch_tidy.py::TestTorchTidyProfiler::test_tensorimpl_invalidation_scalar_args
        popd
        echo "====: Ran PyTorch profiler tests"