From a17acd5280de80b423db438846f4ab32189971bc Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:22:55 +0000
Subject: [PATCH 1/5] Initial plan

From c06a6e0a3fd8c17cec9facc4be2208b14bca7ed3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:25:19 +0000
Subject: [PATCH 2/5] Add nightly Triton testing workflow

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
Review notes (below the three-dash line, so not part of the commit message):

* Purpose: adds a workflow that runs on a nightly cron (02:00 UTC) and on
  manual dispatch. Job 1 rewrites the pinned Triton checkout in
  apptainer/iris.def to `main` and builds a .sif image with Apptainer.
  Job 2 runs every tests/<test_dir>/test_*.py file at 1, 2, 4 and 8 ranks
  inside that image.
* Changed in review: the "Setup Apptainer" step now probes for `apptainer`
  only. The build step always calls `apptainer build` and the test job passes
  a .sif path to container_exec.sh, so a runner that had Docker but no
  Apptainer used to skip the install and then fail at the build step.
* The edit replaces three added lines one-for-one, so the hunk length (195)
  and the line numbers used by later patches in this series are unchanged.
  The blob id on the `index` line below predates the edit.

 .../workflows/iris-nightly-triton-test.yml    | 195 ++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 .github/workflows/iris-nightly-triton-test.yml

diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml
new file mode 100644
index 00000000..2f533bb5
--- /dev/null
+++ b/.github/workflows/iris-nightly-triton-test.yml
@@ -0,0 +1,195 @@
+name: Iris Nightly Triton Test
+
+on:
+  schedule:
+    # Run nightly at 2 AM UTC
+    - cron: '0 2 * * *'
+  workflow_dispatch:  # Allow manual triggering
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-container-with-latest-triton:
+    runs-on: [self-hosted, mi3xx]
+    timeout-minutes: 120  # Building with latest Triton may take longer
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Apptainer (if not available)
+        run: |
+          if ! command -v apptainer &> /dev/null; then
+            echo "Apptainer not found, installing Apptainer..."
+            apt-get update && apt-get install -y software-properties-common
+            add-apt-repository -y ppa:apptainer/ppa
+            apt-get update && apt-get install -y apptainer
+          else
+            echo "Apptainer already available"
+          fi
+
+      - name: Modify Apptainer def file to use latest Triton
+        run: |
+          echo "Modifying apptainer/iris.def to use latest Triton from main branch"
+          # Replace the specific commit checkout with main branch
+          sed -i 's/git checkout bcbcabdd0cff6539c7168299075992b2a23ff38e/git checkout main/' apptainer/iris.def
+          echo "Modified iris.def:"
+          grep -A2 -B2 "git checkout" apptainer/iris.def
+
+      - name: Build Iris container with latest Triton
+        run: |
+          set -e
+
+          # Check /dev/shm size
+          shm_size_gb=$(df -k /dev/shm | tail -1 | awk '{print int($2/1024/1024)}')
+          if [ "${shm_size_gb:-0}" -lt 64 ]; then
+            echo "❌ ERROR: /dev/shm is too small (${shm_size_gb}GB < 64GB)"
+            echo "Fix: mount -o remount,size=64G /dev/shm"
+            exit 1
+          fi
+          echo "✅ /dev/shm size OK (${shm_size_gb}GB)"
+
+          # Build with Apptainer, forcing rebuild
+          DEF_FILE=apptainer/iris.def
+          IMAGE_PATH="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif"
+
+          mkdir -p "${HOME}/iris-apptainer-nightly"
+
+          echo "Building Apptainer image with latest Triton..."
+          apptainer build --force "$IMAGE_PATH" "$DEF_FILE"
+
+          echo "Built image: $IMAGE_PATH"
+
+  test-nightly-triton:
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks)
+    needs: build-container-with-latest-triton
+    runs-on: [self-hosted, mi3xx]
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # Test each subdirectory with different rank counts
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+          - test_dir: examples
+            num_ranks: 2
+            gpu_devices: "2,3"
+          - test_dir: examples
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+          - test_dir: examples
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+          - test_dir: unittests
+            num_ranks: 2
+            gpu_devices: "2,3"
+          - test_dir: unittests
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+          - test_dir: unittests
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+          - test_dir: ccl
+            num_ranks: 2
+            gpu_devices: "2,3"
+          - test_dir: ccl
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+          - test_dir: ccl
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+          - test_dir: x
+            num_ranks: 2
+            gpu_devices: "2,3"
+          - test_dir: x
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+          - test_dir: x
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+          - test_dir: ops
+            num_ranks: 2
+            gpu_devices: "2,3"
+          - test_dir: ops
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+          - test_dir: ops
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Cleanup lingering ports before tests
+        run: |
+          bash .github/scripts/cleanup_ports.sh
+
+      - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks
+        run: |
+          set -e
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)"
+
+          # Use nightly image path
+          NIGHTLY_IMAGE="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif"
+
+          # Build GPU argument
+          GPU_ARG=""
+          if [ -n "${{ matrix.gpu_devices }}" ]; then
+            GPU_ARG="--gpus ${{ matrix.gpu_devices }}"
+          fi
+
+          # Run tests in container using the nightly image
+          .github/scripts/container_exec.sh $GPU_ARG --image "$NIGHTLY_IMAGE" "
+            set -e
+
+            # Install tritonBLAS if not already installed
+            echo \"Checking for tritonBLAS...\"
+            if ! python -c 'import tritonblas' 2>/dev/null; then
+              echo \"Installing tritonBLAS...\"
+              TRITONBLAS_DIR=\"./tritonblas_install\"
+              if [ ! -d \"\$TRITONBLAS_DIR\" ]; then
+                git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\"
+                cd \"\$TRITONBLAS_DIR\"
+                git checkout 47768c93acb7f89511d797964b84544c30ab81ad
+              else
+                cd \"\$TRITONBLAS_DIR\"
+                git fetch
+                git checkout 47768c93acb7f89511d797964b84544c30ab81ad
+              fi
+              pip install -e .
+              cd ..
+              echo \"tritonBLAS installed successfully\"
+            else
+              echo \"tritonBLAS already installed\"
+            fi
+
+            echo \"Installing iris in editable mode\"
+            pip install -e .
+
+            # Run tests in the specified directory
+            for test_file in tests/${{ matrix.test_dir }}/test_*.py; do
+              if [ -f \"\$test_file\" ]; then
+                echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\"
+                python tests/run_tests_distributed.py --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10
+              fi
+            done
+          "
+          echo "::endgroup::"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!"

From 8679c4861e58384a49fe73457aad264d20f1d173 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Feb 2026 14:25:56 +0000 Subject: [PATCH 3/5] Add nightly Triton test badge to README Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9fa2d55b..6f9a921c 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. License: MIT Ruff Iris Tests + Nightly Triton Tests DOI DOI

From 8126f9244cb69b47e285329fa337e19d7fa8a144 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:27:07 +0000
Subject: [PATCH 4/5] Improve nightly workflow robustness per code review

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
Notes (below the three-dash line, so not part of the commit message):

* Hunk 1: the sed expression now matches `git checkout` followed by any
  40 lowercase-hex characters in apptainer/iris.def, instead of one
  hard-coded commit id.
* Hunks 2 and 3: the tritonBLAS commit id is assigned once to
  TRITONBLAS_COMMIT inside the container script, and both `git checkout`
  calls read that variable.

 .github/workflows/iris-nightly-triton-test.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml
index 2f533bb5..7317759e 100644
--- a/.github/workflows/iris-nightly-triton-test.yml
+++ b/.github/workflows/iris-nightly-triton-test.yml
@@ -33,8 +33,8 @@ jobs:
       - name: Modify Apptainer def file to use latest Triton
         run: |
           echo "Modifying apptainer/iris.def to use latest Triton from main branch"
-          # Replace the specific commit checkout with main branch
-          sed -i 's/git checkout bcbcabdd0cff6539c7168299075992b2a23ff38e/git checkout main/' apptainer/iris.def
+          # Replace any 40-character hex commit checkout with main branch
+          sed -i 's/git checkout [0-9a-f]\{40\}/git checkout main/' apptainer/iris.def
           echo "Modified iris.def:"
           grep -A2 -B2 "git checkout" apptainer/iris.def
 
@@ -159,6 +159,9 @@ jobs:
           .github/scripts/container_exec.sh $GPU_ARG --image "$NIGHTLY_IMAGE" "
             set -e
 
+            # tritonBLAS version to use
+            TRITONBLAS_COMMIT=\"47768c93acb7f89511d797964b84544c30ab81ad\"
+
             # Install tritonBLAS if not already installed
             echo \"Checking for tritonBLAS...\"
             if ! python -c 'import tritonblas' 2>/dev/null; then
@@ -167,11 +170,11 @@ jobs:
               if [ ! -d \"\$TRITONBLAS_DIR\" ]; then
                 git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\"
                 cd \"\$TRITONBLAS_DIR\"
-                git checkout 47768c93acb7f89511d797964b84544c30ab81ad
+                git checkout \"\$TRITONBLAS_COMMIT\"
               else
                 cd \"\$TRITONBLAS_DIR\"
                 git fetch
-                git checkout 47768c93acb7f89511d797964b84544c30ab81ad
+                git checkout \"\$TRITONBLAS_COMMIT\"
               fi
               pip install -e .
               cd ..

From a3a843f001e5e3ea9eb6e35c5a1220a028df42b0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:28:38 +0000
Subject: [PATCH 5/5] Add explicit permissions to nightly workflow for security

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
Notes (below the three-dash line, so not part of the commit message):

* Inserts a workflow-level `permissions` mapping with the single entry
  `contents: read` between the `concurrency` and `jobs` keys.

 .github/workflows/iris-nightly-triton-test.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml
index 7317759e..c9d1d698 100644
--- a/.github/workflows/iris-nightly-triton-test.yml
+++ b/.github/workflows/iris-nightly-triton-test.yml
@@ -10,6 +10,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+permissions:
+  contents: read
+
 jobs:
   build-container-with-latest-triton:
     runs-on: [self-hosted, mi3xx]