diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml
new file mode 100644
index 00000000..c9d1d698
--- /dev/null
+++ b/.github/workflows/iris-nightly-triton-test.yml
@@ -0,0 +1,177 @@
+name: Iris Nightly Triton Test
+
+on:
+  schedule:
+    # Run nightly at 2 AM UTC
+    - cron: '0 2 * * *'
+  workflow_dispatch:  # Allow manual triggering
+
+# One run per workflow and ref; a newer run cancels the one in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  # Builds the Iris Apptainer image with Triton checked out from main.
+  build-container-with-latest-triton:
+    runs-on: [self-hosted, mi3xx]
+    timeout-minutes: 120  # Building with latest Triton may take longer
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Apptainer (if not available)
+        run: |
+          # The build step below calls apptainer directly, so Docker alone is
+          # not enough; install Apptainer whenever it is missing.
+          if ! command -v apptainer &> /dev/null; then
+            echo "Apptainer not found, installing Apptainer..."
+            apt-get update && apt-get install -y software-properties-common
+            add-apt-repository -y ppa:apptainer/ppa
+            apt-get update && apt-get install -y apptainer
+          else
+            echo "Container runtime already available"
+          fi
+
+      - name: Modify Apptainer def file to use latest Triton
+        run: |
+          echo "Modifying apptainer/iris.def to use latest Triton from main branch"
+          # Replace any 40-character hex commit checkout with main branch
+          sed -i 's/git checkout [0-9a-f]\{40\}/git checkout main/' apptainer/iris.def
+          # Fail fast when nothing was rewritten (for example the pin is not a
+          # 40-character SHA); otherwise the pinned Triton would be tested.
+          if ! grep -q "git checkout main" apptainer/iris.def; then
+            echo "❌ ERROR: apptainer/iris.def has no 'git checkout main' after rewrite"
+            exit 1
+          fi
+          echo "Modified iris.def:"
+          grep -A2 -B2 "git checkout" apptainer/iris.def
+
+      - name: Build Iris container with latest Triton
+        run: |
+          set -e
+
+          # Check /dev/shm size
+          shm_size_gb=$(df -k /dev/shm | tail -1 | awk '{print int($2/1024/1024)}')
+          if [ "${shm_size_gb:-0}" -lt 64 ]; then
+            echo "❌ ERROR: /dev/shm is too small (${shm_size_gb}GB < 64GB)"
+            echo "Fix: mount -o remount,size=64G /dev/shm"
+            exit 1
+          fi
+          echo "✅ /dev/shm size OK (${shm_size_gb}GB)"
+
+          # Build with Apptainer, forcing rebuild
+          DEF_FILE=apptainer/iris.def
+          IMAGE_PATH="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif"
+
+          mkdir -p "${HOME}/iris-apptainer-nightly"
+
+          echo "Building Apptainer image with latest Triton..."
+          apptainer build --force "$IMAGE_PATH" "$DEF_FILE"
+
+          echo "Built image: $IMAGE_PATH"
+
+  # Runs each test directory at each rank count inside the nightly image.
+  test-nightly-triton:
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks)
+    needs: build-container-with-latest-triton
+    runs-on: [self-hosted, mi3xx]
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        # Every test directory runs at every rank count (5 x 4 = 20 jobs)
+        test_dir: [examples, unittests, ccl, x, ops]
+        num_ranks: [1, 2, 4, 8]
+        include:
+          # Attach the GPU set for each rank count to the matching combinations
+          - num_ranks: 1
+            gpu_devices: "0,1"
+          - num_ranks: 2
+            gpu_devices: "2,3"
+          - num_ranks: 4
+            gpu_devices: "4,5,6,7"
+          - num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Cleanup lingering ports before tests
+        run: |
+          bash .github/scripts/cleanup_ports.sh
+
+      - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks
+        run: |
+          set -e
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)"
+
+          # Use nightly image path
+          NIGHTLY_IMAGE="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif"
+
+          # The image is stored on the runner's local disk by the build job;
+          # stop with a clear message if it is not present on this runner.
+          if [ ! -f "$NIGHTLY_IMAGE" ]; then
+            echo "❌ ERROR: nightly image not found: $NIGHTLY_IMAGE"
+            exit 1
+          fi
+
+          # Build GPU argument
+          GPU_ARG=""
+          if [ -n "${{ matrix.gpu_devices }}" ]; then
+            GPU_ARG="--gpus ${{ matrix.gpu_devices }}"
+          fi
+
+          # Run tests in container using the nightly image
+          .github/scripts/container_exec.sh $GPU_ARG --image "$NIGHTLY_IMAGE" "
+            set -e
+
+            # tritonBLAS version to use
+            TRITONBLAS_COMMIT=\"47768c93acb7f89511d797964b84544c30ab81ad\"
+
+            # Install tritonBLAS if not already installed
+            echo \"Checking for tritonBLAS...\"
+            if ! python -c 'import tritonblas' 2>/dev/null; then
+              echo \"Installing tritonBLAS...\"
+              TRITONBLAS_DIR=\"./tritonblas_install\"
+              if [ ! -d \"\$TRITONBLAS_DIR\" ]; then
+                git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\"
+                cd \"\$TRITONBLAS_DIR\"
+                git checkout \"\$TRITONBLAS_COMMIT\"
+              else
+                cd \"\$TRITONBLAS_DIR\"
+                git fetch
+                git checkout \"\$TRITONBLAS_COMMIT\"
+              fi
+              pip install -e .
+              cd ..
+              echo \"tritonBLAS installed successfully\"
+            else
+              echo \"tritonBLAS already installed\"
+            fi
+
+            echo \"Installing iris in editable mode\"
+            pip install -e .
+
+            # Run tests in the specified directory
+            tests_run=0
+            for test_file in tests/${{ matrix.test_dir }}/test_*.py; do
+              if [ -f \"\$test_file\" ]; then
+                echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\"
+                python tests/run_tests_distributed.py --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10
+                tests_run=\$((tests_run + 1))
+              fi
+            done
+
+            # Warn when the glob matched nothing, so a green job is not mistaken for a tested one
+            if [ \"\$tests_run\" -eq 0 ]; then
+              echo \"::warning::No test files matched tests/${{ matrix.test_dir }}/test_*.py\"
+            fi
+          "
+          echo "::endgroup::"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!"
diff --git a/README.md b/README.md index 9fa2d55b..6f9a921c 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. License: MIT Ruff Iris Tests + Nightly Triton Tests DOI DOI