Skip to content
Merged

Mi355 #455

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
2156f84
Add MI355X GPU support for AMD GitHub runner
Mar 4, 2026
b463bab
Use amd-runner Docker container for MI355X workflow
Mar 4, 2026
bdc4523
Update AMD Dockerfile: ROCm 7.2, latest aiter, remove multi-GPU deps
Mar 4, 2026
bb5f2ee
Update AMD_REQUIREMENTS to use ROCm 7.2 nightly index
Mar 4, 2026
de5f1eb
Fix container permissions: run as root for GitHub Actions compatibility
Mar 4, 2026
c01bab1
Revert "Update AMD_REQUIREMENTS to use ROCm 7.2 nightly index"
Mar 4, 2026
e09a2cd
Revert "Update AMD Dockerfile: ROCm 7.2, latest aiter, remove multi-GPU deps"
Mar 4, 2026
b3dfd32
Simplify AMD workflow for MI355X: use container deps, skip requiremen…
Mar 4, 2026
a98fda0
Reapply "Update AMD Dockerfile: ROCm 7.2, latest aiter, remove multi-GPU deps"
Mar 4, 2026
babcddd
Update AMD Dockerfile to ROCm 7.1 stable, latest aiter, remove multi-…
Mar 4, 2026
5f292e1
Use mia1-p02-g29 runner to build AMD Docker image
Mar 4, 2026
6a9f61b
Add workspace cleanup step before checkout in AMD Docker build
Mar 4, 2026
ae849ef
Remove workspace cleanup step from AMD Docker build
Mar 4, 2026
75e15c3
Use GITHUB_TOKEN instead of PUBLISH_TOKEN for ghcr.io login
Mar 4, 2026
9b359e9
Fix Dockerfile for Ubuntu 24.04 (Noble) base image
Mar 4, 2026
dbd319c
Remove pip upgrade step (incompatible with Noble system pip)
Mar 4, 2026
5157d8c
Use amd-runner:mi355 Docker image with working aiter + ROCm
Mar 4, 2026
e308b88
Fix pip install: add --break-system-packages for container environment
Mar 4, 2026
b394db8
Update amd-docker.Dockerfile
Mar 4, 2026
2155b05
Set minimum GitHub timeout to DEFAULT_GITHUB_TIMEOUT_MINUTES
Mar 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 6 additions & 23 deletions .github/workflows/amd_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ on:
runner:
description: 'AMD runner to run workflow on'
required: true
default: "amdgpu-mi300-x86-64"
default: "mia1-p02-g29"
type: string
requirements:
description: 'Contents for a requirements.txt file'
Expand All @@ -25,6 +25,9 @@ run-name: 'AMD Job - ${{ github.event.inputs.run_id }}'
jobs:
run:
runs-on: ${{ github.event.inputs.runner }}
container:
image: ghcr.io/gpu-mode/amd-runner:mi355
options: --user root --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 64G
strategy:
fail-fast: false
timeout-minutes: 20
Expand All @@ -42,34 +45,14 @@ jobs:
# Now write to file (won't be logged since it's masked)
echo "$PAYLOAD" > payload.json

- name: Set venv directory based on runner
run: |
if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then
echo "VENV_DIR=/groups/aig_sharks/pytorch_venv" >> $GITHUB_ENV
fi

- name: Setup Virtual Environment and Install Dependencies
- name: Install kernelbot
shell: bash
run: |
if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then
python -m venv ${VENV_DIR}
source ${VENV_DIR}/bin/activate
fi
pip install --upgrade pip
if [[ -n "${{ github.event.inputs.requirements }}" ]]; then
cat > "requirements.txt" <<'EOL'
${{ github.event.inputs.requirements }}
EOL
pip install -r "requirements.txt"
fi
pip install -e .
pip install --break-system-packages -e .

- name: Run script
shell: bash
run: |
if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then
source ${VENV_DIR}/bin/activate
fi
python3 src/runners/github-runner.py

- name: Upload training artifacts
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/publish_amd_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ env:

jobs:
build-and-push-image:
runs-on: amd-docker
runs-on: mia1-p02-g29
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
permissions:
contents: read
Expand All @@ -23,7 +23,7 @@ jobs:
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.PUBLISH_TOKEN }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
Expand Down
89 changes: 14 additions & 75 deletions docker/amd-docker.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
FROM ghcr.io/actions/actions-runner:latest

ENV CXX=clang++
ENV UCX_CXX=g++
ENV UCX_CC=gcc

RUN sudo apt-get update -y \
&& sudo apt-get install -y software-properties-common \
&& sudo add-apt-repository -y ppa:git-core/ppa \
&& sudo apt-get update -y \
&& sudo apt-get install -y --no-install-recommends \
software-properties-common \
curl \
ca-certificates \
git \
Expand All @@ -22,100 +18,43 @@ RUN sudo apt-get update -y \
lld \
wget \
psmisc \
python3.10-venv \
python3-venv \
python3-pip \
python3-setuptools \
python3-wheel \
python3-dev \
&& sudo rm -rf /var/lib/apt/lists/*

RUN sudo apt-get update && sudo apt-get install -y python3.10 python3-pip python-is-python3 python3-setuptools python3-wheel libpython3.10

RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash && \
sudo apt-get install git-lfs

RUN sudo groupadd -g 109 render

RUN sudo apt update -y \
&& sudo apt install -y "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)" \
&& sudo usermod -a -G render,video runner \
&& wget https://repo.radeon.com/amdgpu-install/6.3.1/ubuntu/jammy/amdgpu-install_6.3.60301-1_all.deb \
&& sudo apt install -y ./amdgpu-install_6.3.60301-1_all.deb \
&& wget https://repo.radeon.com/amdgpu-install/7.1/ubuntu/noble/amdgpu-install_7.1.70100-1_all.deb \
&& sudo apt install -y ./amdgpu-install_7.1.70100-1_all.deb \
&& sudo apt update -y \
&& sudo apt install -y rocm

RUN sudo pip install --upgrade pip
ENV ROCM_PATH=/opt/rocm

RUN sudo pip install --no-cache-dir torch==2.10.0.dev20250916+rocm6.3 pytorch-triton-rocm --index-url https://download.pytorch.org/whl/nightly/rocm6.3
RUN sudo pip install --break-system-packages --no-cache-dir torch==2.10.0+rocm7.1 --index-url https://download.pytorch.org/whl/rocm7.1

RUN git clone --recursive https://github.com/ROCm/aiter.git \
&& cd aiter \
&& git checkout 1d88633958236e942cba3c283864282f7af3ebc5 \
&& sudo pip install -r requirements.txt \
&& git checkout f3be04a12a0cfd6b5e2c7a94edc774f1bc24460d \
&& sudo pip install --break-system-packages -r requirements.txt \
&& sudo python3 setup.py develop

RUN sudo mkdir -p /home/runner/aiter/aiter/jit/build \
&& sudo chown -R runner:runner /home/runner/aiter/aiter/jit/build

RUN sudo pip install \
RUN sudo pip install --break-system-packages \
ninja \
numpy \
packaging \
wheel \
tinygrad

RUN sudo pip install git+https://github.com/ROCm/iris.git

RUN sudo apt-get update -y \
&& sudo apt-get install -y --no-install-recommends \
autoconf \
automake \
libtool \
pkg-config \
build-essential \
gfortran \
flex \
bison \
libomp-dev \
libhwloc-dev \
libnuma-dev \
&& sudo rm -rf /var/lib/apt/lists/*

ENV UCX_INSTALL_DIR=/opt/ucx
ENV OMPI_INSTALL_DIR=/opt/openmpi
ENV ROCSHMEM_INSTALL_DIR=/opt/rocshmem
ENV ROCM_PATH=/opt/rocm

RUN cd /tmp \
&& git clone https://github.com/openucx/ucx.git -b v1.17.x \
&& cd ucx \
&& ./autogen.sh \
&& CC=gcc CXX=g++ ./configure --prefix=${UCX_INSTALL_DIR} --with-rocm=${ROCM_PATH} --enable-mt --disable-optimizations \
&& make -j$(nproc) \
&& sudo make install \
&& cd / \
&& sudo rm -rf /tmp/ucx

RUN cd /tmp \
&& git clone --recursive https://github.com/open-mpi/ompi.git -b v5.0.x \
&& cd ompi \
&& ./autogen.pl \
&& ./configure --prefix=${OMPI_INSTALL_DIR} --with-rocm=${ROCM_PATH} --with-ucx=${UCX_INSTALL_DIR} \
&& make -j$(nproc) \
&& sudo make install \
&& cd / \
&& sudo rm -rf /tmp/ompi

ENV PATH="${OMPI_INSTALL_DIR}/bin:${PATH}"
ENV LD_LIBRARY_PATH="${OMPI_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib:/opt/rocm/lib"


RUN cd /tmp \
&& git clone https://github.com/ROCm/rocSHMEM.git \
&& cd rocSHMEM \
&& mkdir build \
&& cd build \
&& MPI_ROOT=${OMPI_INSTALL_DIR} UCX_ROOT=${UCX_INSTALL_DIR} CMAKE_PREFIX_PATH="${ROCM_PATH}:$CMAKE_PREFIX_PATH" \
sudo ../scripts/build_configs/ipc_single -DCMAKE_INSTALL_PREFIX=/opt/rocshmem \
&& cd / \
&& sudo rm -rf /tmp/rocSHMEM


ENV ROCSHMEM_INSTALL_DIR=${ROCSHMEM_INSTALL_DIR}
ENV LD_LIBRARY_PATH="${ROCSHMEM_INSTALL_DIR}/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="/opt/rocm/lib"
3 changes: 3 additions & 0 deletions src/libkernelbot/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class GitHubGPU(Enum):
MI300 = "MI300"
MI250 = "MI250"
MI300x8 = "MI300x8"
MI355X = "MI355X"


class ModalGPU(Enum):
Expand Down Expand Up @@ -121,6 +122,7 @@ class RankCriterion(Enum):
"MI300": None,
"MI300x8": None,
"MI250": None,
"MI355X": None,
}


Expand Down Expand Up @@ -153,6 +155,7 @@ class RankCriterion(Enum):
AMD_REQUIREMENTS = """
--index-url https://download.pytorch.org/whl/rocm6.2.4
torch
numpy
"""

# A buffer for timeouts to account for github setup time
Expand Down
6 changes: 4 additions & 2 deletions src/libkernelbot/launchers/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def get_timeout(config: dict) -> int:
SubmissionMode.LEADERBOARD.value: config.get("ranked_timeout"),
}
seconds = sec_map.get(mode) or DEFAULT_GITHUB_TIMEOUT_MINUTES * 60
return math.ceil(seconds / 60)
minutes = math.ceil(seconds / 60)
return max(minutes, DEFAULT_GITHUB_TIMEOUT_MINUTES)


class GitHubLauncher(Launcher):
Expand Down Expand Up @@ -93,12 +94,13 @@ async def run_submission( # noqa: C901
self, config: dict, gpu_type: GPU, status: RunProgressReporter
) -> FullResult:
gpu_vendor = None
if gpu_type.value in ["MI300", "MI250", "MI300x8"]:
if gpu_type.value in ["MI300", "MI250", "MI300x8", "MI355X"]:
selected_workflow = "amd_workflow.yml"
runner_name = {
"MI300": "amdgpu-mi300-x86-64",
"MI250": "amdgpu-mi250-x86-64",
"MI300x8": "amdgpu-mi300-8-x86-64",
"MI355X": "mia1-p02-g29",
}[gpu_type.value]
gpu_vendor = "AMD"
requirements = AMD_REQUIREMENTS
Expand Down
Loading