diff --git a/.github/workflows/amd_workflow.yml b/.github/workflows/amd_workflow.yml index 39dc66c58..b41331e6f 100644 --- a/.github/workflows/amd_workflow.yml +++ b/.github/workflows/amd_workflow.yml @@ -13,7 +13,7 @@ on: runner: description: 'AMD runner to run workflow on' required: true - default: "amdgpu-mi300-x86-64" + default: "mia1-p02-g29" type: string requirements: description: 'Contents for a requirements.txt file' @@ -25,6 +25,9 @@ run-name: 'AMD Job - ${{ github.event.inputs.run_id }}' jobs: run: runs-on: ${{ github.event.inputs.runner }} + container: + image: ghcr.io/gpu-mode/amd-runner:mi355 + options: --user root --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 64G strategy: fail-fast: false timeout-minutes: 20 @@ -42,34 +45,14 @@ jobs: # Now write to file (won't be logged since it's masked) echo "$PAYLOAD" > payload.json - - name: Set venv directory based on runner - run: | - if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then - echo "VENV_DIR=/groups/aig_sharks/pytorch_venv" >> $GITHUB_ENV - fi - - - name: Setup Virtual Environment and Install Dependencies + - name: Install kernelbot shell: bash run: | - if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then - python -m venv ${VENV_DIR} - source ${VENV_DIR}/bin/activate - fi - pip install --upgrade pip - if [[ -n "${{ github.event.inputs.requirements }}" ]]; then - cat > "requirements.txt" <<'EOL' - ${{ github.event.inputs.requirements }} - EOL - pip install -r "requirements.txt" - fi - pip install -e . + pip install --break-system-packages -e . - name: Run script shell: bash run: | - if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then - source ${VENV_DIR}/bin/activate - fi python3 src/runners/github-runner.py - name: Upload training artifacts diff --git a/.github/workflows/publish_amd_docker.yml b/.github/workflows/publish_amd_docker.yml index 7c5a799a8..404e99f41 100644 --- a/.github/workflows/publish_amd_docker.yml +++ b/.github/workflows/publish_amd_docker.yml @@ -10,7 +10,7 @@ env: jobs: build-and-push-image: - runs-on: amd-docker + runs-on: mia1-p02-g29 # Sets the permissions granted to the `PUBLISH_TOKEN` for the actions in this job. permissions: contents: read @@ -23,7 +23,7 @@ jobs: with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} - password: ${{ secrets.PUBLISH_TOKEN }} + password: ${{ secrets.GITHUB_TOKEN }} - name: Extract metadata (tags, labels) for Docker id: meta uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 diff --git a/docker/amd-docker.Dockerfile b/docker/amd-docker.Dockerfile index 6482bdc79..f44e384de 100644 --- a/docker/amd-docker.Dockerfile +++ b/docker/amd-docker.Dockerfile @@ -1,14 +1,10 @@ FROM ghcr.io/actions/actions-runner:latest ENV CXX=clang++ -ENV UCX_CXX=g++ -ENV UCX_CC=gcc RUN sudo apt-get update -y \ - && sudo apt-get install -y software-properties-common \ - && sudo add-apt-repository -y ppa:git-core/ppa \ - && sudo apt-get update -y \ && sudo apt-get install -y --no-install-recommends \ + software-properties-common \ curl \ ca-certificates \ git \ @@ -22,100 +18,43 @@ RUN sudo apt-get update -y \ lld \ wget \ psmisc \ - python3.10-venv \ + python3-venv \ + python3-pip \ + python3-setuptools \ + python3-wheel \ + python3-dev \ && sudo rm -rf /var/lib/apt/lists/* -RUN sudo apt-get update && sudo apt-get install -y python3.10 python3-pip python-is-python3 python3-setuptools python3-wheel libpython3.10 - RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash && \ sudo apt-get install git-lfs RUN sudo groupadd -g 109 render RUN sudo apt update -y \ - && sudo apt install -y "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)" \ && sudo usermod -a -G render,video runner \ - && wget https://repo.radeon.com/amdgpu-install/6.3.1/ubuntu/jammy/amdgpu-install_6.3.60301-1_all.deb \ - && sudo apt install -y ./amdgpu-install_6.3.60301-1_all.deb \ + && wget https://repo.radeon.com/amdgpu-install/7.1/ubuntu/noble/amdgpu-install_7.1.70100-1_all.deb \ + && sudo apt install -y ./amdgpu-install_7.1.70100-1_all.deb \ && sudo apt update -y \ && sudo apt install -y rocm -RUN sudo pip install --upgrade pip +ENV ROCM_PATH=/opt/rocm -RUN sudo pip install --no-cache-dir torch==2.10.0.dev20250916+rocm6.3 pytorch-triton-rocm --index-url https://download.pytorch.org/whl/nightly/rocm6.3 +RUN sudo pip install --break-system-packages --no-cache-dir torch==2.10.0+rocm7.1 --index-url https://download.pytorch.org/whl/rocm7.1 RUN git clone --recursive https://github.com/ROCm/aiter.git \ && cd aiter \ - && git checkout 1d88633958236e942cba3c283864282f7af3ebc5 \ - && sudo pip install -r requirements.txt \ + && git checkout f3be04a12a0cfd6b5e2c7a94edc774f1bc24460d \ + && sudo pip install --break-system-packages -r requirements.txt \ && sudo python3 setup.py develop RUN sudo mkdir -p /home/runner/aiter/aiter/jit/build \ && sudo chown -R runner:runner /home/runner/aiter/aiter/jit/build -RUN sudo pip install \ +RUN sudo pip install --break-system-packages \ ninja \ numpy \ packaging \ wheel \ - tinygrad - -RUN sudo pip install git+https://github.com/ROCm/iris.git - -RUN sudo apt-get update -y \ - && sudo apt-get install -y --no-install-recommends \ - autoconf \ - automake \ - libtool \ - pkg-config \ - build-essential \ - gfortran \ - flex \ - bison \ - libomp-dev \ - libhwloc-dev \ - libnuma-dev \ - && sudo rm -rf /var/lib/apt/lists/* - -ENV UCX_INSTALL_DIR=/opt/ucx -ENV OMPI_INSTALL_DIR=/opt/openmpi -ENV ROCSHMEM_INSTALL_DIR=/opt/rocshmem -ENV ROCM_PATH=/opt/rocm - -RUN cd /tmp \ - && git clone https://github.com/openucx/ucx.git -b v1.17.x \ - && cd ucx \ - && ./autogen.sh \ - && CC=gcc CXX=g++ ./configure --prefix=${UCX_INSTALL_DIR} --with-rocm=${ROCM_PATH} --enable-mt --disable-optimizations \ - && make -j$(nproc) \ - && sudo make install \ - && cd / \ - && sudo rm -rf /tmp/ucx - -RUN cd /tmp \ - && git clone --recursive https://github.com/open-mpi/ompi.git -b v5.0.x \ - && cd ompi \ - && ./autogen.pl \ - && ./configure --prefix=${OMPI_INSTALL_DIR} --with-rocm=${ROCM_PATH} --with-ucx=${UCX_INSTALL_DIR} \ - && make -j$(nproc) \ - && sudo make install \ - && cd / \ - && sudo rm -rf /tmp/ompi - -ENV PATH="${OMPI_INSTALL_DIR}/bin:${PATH}" -ENV LD_LIBRARY_PATH="${OMPI_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib:/opt/rocm/lib" - - -RUN cd /tmp \ - && git clone https://github.com/ROCm/rocSHMEM.git \ - && cd rocSHMEM \ - && mkdir build \ - && cd build \ - && MPI_ROOT=${OMPI_INSTALL_DIR} UCX_ROOT=${UCX_INSTALL_DIR} CMAKE_PREFIX_PATH="${ROCM_PATH}:$CMAKE_PREFIX_PATH" \ - sudo ../scripts/build_configs/ipc_single -DCMAKE_INSTALL_PREFIX=/opt/rocshmem \ - && cd / \ - && sudo rm -rf /tmp/rocSHMEM -ENV ROCSHMEM_INSTALL_DIR=${ROCSHMEM_INSTALL_DIR} -ENV LD_LIBRARY_PATH="${ROCSHMEM_INSTALL_DIR}/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/opt/rocm/lib" diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index f60764deb..7b719622d 100644 --- a/src/libkernelbot/consts.py +++ b/src/libkernelbot/consts.py @@ -21,6 +21,7 @@ class GitHubGPU(Enum): MI300 = "MI300" MI250 = "MI250" MI300x8 = "MI300x8" + MI355X = "MI355X" class ModalGPU(Enum): @@ -121,6 +122,7 @@ class RankCriterion(Enum): "MI300": None, "MI300x8": None, "MI250": None, + "MI355X": None, } @@ -153,6 +155,7 @@ class RankCriterion(Enum): AMD_REQUIREMENTS = """ --index-url https://download.pytorch.org/whl/rocm6.2.4 torch +numpy """ # A buffer for timeouts to account for github setup time diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index e34883f8e..86b5be374 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -53,7 +53,8 @@ def get_timeout(config: dict) -> int: SubmissionMode.LEADERBOARD.value: config.get("ranked_timeout"), } seconds = sec_map.get(mode) or DEFAULT_GITHUB_TIMEOUT_MINUTES * 60 - return math.ceil(seconds / 60) + minutes = math.ceil(seconds / 60) + return max(minutes, DEFAULT_GITHUB_TIMEOUT_MINUTES) class GitHubLauncher(Launcher): @@ -93,12 +94,13 @@ async def run_submission( # noqa: C901 self, config: dict, gpu_type: GPU, status: RunProgressReporter ) -> FullResult: gpu_vendor = None - if gpu_type.value in ["MI300", "MI250", "MI300x8"]: + if gpu_type.value in ["MI300", "MI250", "MI300x8", "MI355X"]: selected_workflow = "amd_workflow.yml" runner_name = { "MI300": "amdgpu-mi300-x86-64", "MI250": "amdgpu-mi250-x86-64", "MI300x8": "amdgpu-mi300-8-x86-64", + "MI355X": "mia1-p02-g29", }[gpu_type.value] gpu_vendor = "AMD" requirements = AMD_REQUIREMENTS