Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions .github/workflows/iris-nightly-triton-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# Nightly workflow: builds an Iris container against Triton's main branch and
# runs the test matrix inside it.
name: Iris Nightly Triton Test

on:
  schedule:
    # Every day at 02:00 UTC
    - cron: "0 2 * * *"
  # Also runnable by hand from the Actions tab
  workflow_dispatch:

# Runs are grouped per workflow and ref; a newer run in the same group cancels
# the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read

jobs:
  # Builds the Apptainer image from apptainer/iris.def after rewriting its
  # pinned commit checkout to the main branch, and stores the image under
  # ${HOME}/iris-apptainer-nightly/.
  # NOTE(review): the test job reads the image from the same ${HOME} path, so
  # this presumably relies on both jobs running on a runner that shares that
  # directory - confirm against the runner setup.
  build-container-with-latest-triton:
    runs-on: [self-hosted, mi3xx]
    timeout-minutes: 120  # Building with latest Triton may take longer

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Apptainer (if not available)
        run: |
          # The build step below always invokes "apptainer build", so only
          # Apptainer itself satisfies it; an installed Docker does not.
          if ! command -v apptainer &> /dev/null; then
            echo "Apptainer not found, installing Apptainer..."
            apt-get update && apt-get install -y software-properties-common
            add-apt-repository -y ppa:apptainer/ppa
            apt-get update && apt-get install -y apptainer
          else
            echo "Apptainer already available"
          fi

      - name: Modify Apptainer def file to use latest Triton
        run: |
          echo "Modifying apptainer/iris.def to use latest Triton from main branch"
          # Replace any 40-character hex commit checkout with main branch
          sed -i 's/git checkout [0-9a-f]\{40\}/git checkout main/' apptainer/iris.def
          # sed exits 0 even when nothing matched; without this guard the
          # nightly would silently build the pinned revision instead of main.
          if ! grep -q "git checkout main" apptainer/iris.def; then
            echo "❌ ERROR: apptainer/iris.def has no 'git checkout main' after substitution"
            echo "The pinned checkout is probably no longer a 40-character commit hash"
            exit 1
          fi
          echo "Modified iris.def:"
          grep -A2 -B2 "git checkout" apptainer/iris.def

      - name: Build Iris container with latest Triton
        run: |
          set -e

          # Check /dev/shm size (df -k reports KiB; convert to whole GiB)
          shm_size_gb=$(df -k /dev/shm | tail -1 | awk '{print int($2/1024/1024)}')
          if [ "${shm_size_gb:-0}" -lt 64 ]; then
            echo "❌ ERROR: /dev/shm is too small (${shm_size_gb}GB < 64GB)"
            echo "Fix: mount -o remount,size=64G /dev/shm"
            exit 1
          fi
          echo "✅ /dev/shm size OK (${shm_size_gb}GB)"

          # Build with Apptainer, forcing rebuild
          DEF_FILE=apptainer/iris.def
          IMAGE_PATH="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif"

          mkdir -p "${HOME}/iris-apptainer-nightly"

          echo "Building Apptainer image with latest Triton..."
          # --force overwrites the image left behind by the previous nightly
          apptainer build --force "$IMAGE_PATH" "$DEF_FILE"

          echo "Built image: $IMAGE_PATH"

  # Runs every tests/<test_dir>/test_*.py file inside the nightly image, once
  # per (test_dir, num_ranks) matrix entry. fail-fast is off so one failing
  # combination does not cancel the others.
  test-nightly-triton:
    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks)
    needs: build-container-with-latest-triton
    runs-on: [self-hosted, mi3xx]
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        include:
          # Test each subdirectory with different rank counts
          - test_dir: examples
            num_ranks: 1
            gpu_devices: "0,1"
          - test_dir: examples
            num_ranks: 2
            gpu_devices: "2,3"
          - test_dir: examples
            num_ranks: 4
            gpu_devices: "4,5,6,7"
          - test_dir: examples
            num_ranks: 8
            gpu_devices: "0,1,2,3,4,5,6,7"
          - test_dir: unittests
            num_ranks: 1
            gpu_devices: "0,1"
          - test_dir: unittests
            num_ranks: 2
            gpu_devices: "2,3"
          - test_dir: unittests
            num_ranks: 4
            gpu_devices: "4,5,6,7"
          - test_dir: unittests
            num_ranks: 8
            gpu_devices: "0,1,2,3,4,5,6,7"
          - test_dir: ccl
            num_ranks: 1
            gpu_devices: "0,1"
          - test_dir: ccl
            num_ranks: 2
            gpu_devices: "2,3"
          - test_dir: ccl
            num_ranks: 4
            gpu_devices: "4,5,6,7"
          - test_dir: ccl
            num_ranks: 8
            gpu_devices: "0,1,2,3,4,5,6,7"
          - test_dir: x
            num_ranks: 1
            gpu_devices: "0,1"
          - test_dir: x
            num_ranks: 2
            gpu_devices: "2,3"
          - test_dir: x
            num_ranks: 4
            gpu_devices: "4,5,6,7"
          - test_dir: x
            num_ranks: 8
            gpu_devices: "0,1,2,3,4,5,6,7"
          - test_dir: ops
            num_ranks: 1
            gpu_devices: "0,1"
          - test_dir: ops
            num_ranks: 2
            gpu_devices: "2,3"
          - test_dir: ops
            num_ranks: 4
            gpu_devices: "4,5,6,7"
          - test_dir: ops
            num_ranks: 8
            gpu_devices: "0,1,2,3,4,5,6,7"

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Cleanup lingering ports before tests
        run: |
          bash .github/scripts/cleanup_ports.sh

      - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks
        run: |
          set -e
          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)"

          # Use nightly image path (written by the build job)
          NIGHTLY_IMAGE="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif"

          # Build GPU argument
          GPU_ARG=""
          if [ -n "${{ matrix.gpu_devices }}" ]; then
            GPU_ARG="--gpus ${{ matrix.gpu_devices }}"
          fi

          # Run tests in container using the nightly image.
          # GPU_ARG is left unquoted on purpose so it splits into two words.
          # The script below is one double-quoted string, so every quote and
          # dollar sign meant for the inner shell is backslash-escaped.
          .github/scripts/container_exec.sh $GPU_ARG --image "$NIGHTLY_IMAGE" "
            set -e

            # tritonBLAS version to use
            TRITONBLAS_COMMIT=\"47768c93acb7f89511d797964b84544c30ab81ad\"

            # Install tritonBLAS if not already installed
            echo \"Checking for tritonBLAS...\"
            if ! python -c 'import tritonblas' 2>/dev/null; then
              echo \"Installing tritonBLAS...\"
              TRITONBLAS_DIR=\"./tritonblas_install\"
              if [ ! -d \"\$TRITONBLAS_DIR\" ]; then
                git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\"
                cd \"\$TRITONBLAS_DIR\"
                git checkout \"\$TRITONBLAS_COMMIT\"
              else
                cd \"\$TRITONBLAS_DIR\"
                git fetch
                git checkout \"\$TRITONBLAS_COMMIT\"
              fi
              pip install -e .
              cd ..
              echo \"tritonBLAS installed successfully\"
            else
              echo \"tritonBLAS already installed\"
            fi

            echo \"Installing iris in editable mode\"
            pip install -e .

            # Run tests in the specified directory
            found_tests=0
            for test_file in tests/${{ matrix.test_dir }}/test_*.py; do
              # An unmatched glob stays a literal pattern; -f filters it out
              if [ -f \"\$test_file\" ]; then
                found_tests=1
                echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\"
                python tests/run_tests_distributed.py --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10
              fi
            done

            # Surface an empty test directory instead of reporting a silent pass
            if [ \"\$found_tests\" -eq 0 ]; then
              echo \"::warning::No test files matched tests/${{ matrix.test_dir }}/test_*.py\"
            fi
          "
          echo "::endgroup::"
          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!"
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
<a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License: MIT"></a>
<a href="https://github.com/ROCm/iris/blob/main/.github/workflows/lint.yml"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json" alt="Ruff"></a>
<a href="https://github.com/ROCm/iris/actions/workflows/iris-tests.yml"><img src="https://github.com/ROCm/iris/actions/workflows/iris-tests.yml/badge.svg" alt="Iris Tests"></a>
<a href="https://github.com/ROCm/iris/actions/workflows/iris-nightly-triton-test.yml"><img src="https://github.com/ROCm/iris/actions/workflows/iris-nightly-triton-test.yml/badge.svg" alt="Nightly Triton Tests"></a>
<a href="https://doi.org/10.5281/zenodo.17382307"><img src="https://zenodo.org/badge/DOI/10.5281/zenodo.17382307.svg" alt="DOI"></a>
<a href="https://doi.org/10.48550/arXiv.2511.12500"><img src="https://img.shields.io/badge/cs.DC%2C%20cs.LG-arXiv%3A2511.12500-B31B1B.svg" alt="DOI"></a>
</p>
Expand Down