diff --git a/.github/workflows/build-rpm.yml b/.github/workflows/build-rpm.yml
new file mode 100644
index 000000000..586f8371c
--- /dev/null
+++ b/.github/workflows/build-rpm.yml
@@ -0,0 +1,73 @@
+name: Build RPM Packages
+
+on:
+  push:
+    tags:
+      - 'v*'
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'flagcx/**'
+      - 'packaging/rpm/**'
+      - '.github/workflows/build-rpm.yml'
+  workflow_dispatch:
+
+# Least privilege for GITHUB_TOKEN: the job only needs to read the repository;
+# the Nexus upload authenticates with its own secrets.
+permissions:
+  contents: read
+
+jobs:
+  build-rpm-packages:
+    runs-on: h20
+
+    strategy:
+      fail-fast: false
+      matrix:
+        backend: [nvidia, metax, ascend]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Build ${{ matrix.backend }} RPM packages
+        run: ./packaging/rpm/build-flagcx-rpm.sh ${{ matrix.backend }}
+
+      - name: Upload ${{ matrix.backend }} RPM packages
+        uses: actions/upload-artifact@v4
+        with:
+          name: flagcx-${{ matrix.backend }}-rpm-packages
+          path: rpm-packages/${{ matrix.backend }}/**/*.rpm
+          retention-days: 7
+
+      # Publish straight from the locally-built artifacts on the same self-hosted
+      # runner: h20 can reach the internal Nexus but not codeload/api.github.com
+      # reliably, so we deliberately avoid any cross-workflow artifact round-trip.
+      - name: Publish ${{ matrix.backend }} RPMs to Nexus YUM repository
+        if: startsWith(github.ref, 'refs/tags/v')
+        env:
+          NEXUS_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
+          NEXUS_PASSWORD: ${{ secrets.CONTAINER_REGISTRY }}
+          NEXUS_REPO_URL: https://resource.flagos.net/repository/flagos-yum-hosted
+          BACKEND: ${{ matrix.backend }}
+        run: |
+          set -euo pipefail
+
+          uploaded=0
+          while IFS= read -r -d '' rpm; do
+            # rel keeps the RPMS/<arch>/ or SRPMS/ layout under the backend
+            rel="${rpm#rpm-packages/${BACKEND}/}"
+            echo "Uploading ${rpm} -> ${BACKEND}/${rel}"
+            curl -f -u "${NEXUS_USERNAME}:${NEXUS_PASSWORD}" \
+              --upload-file "$rpm" \
+              "${NEXUS_REPO_URL}/${BACKEND}/${rel}"
+            uploaded=$((uploaded + 1))
+          done < <(find "rpm-packages/${BACKEND}" -name '*.rpm' -print0)
+
+          if [ "$uploaded" -eq 0 ]; then
+            echo "No RPMs found for ${BACKEND}"
+            exit 1
+          fi
+          echo "Uploaded ${uploaded} ${BACKEND} RPM(s) to Nexus YUM repository"
diff --git a/.gitignore b/.gitignore
index f47f3f9f3..ac8e47dc9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ build
 plugin/*/build
 test/*/build
 debian-packages
+rpm-packages
 
 # Ignore compiled Python files and shared object files
 plugin/*/*.so
diff --git a/packaging/rpm/build-flagcx-rpm.sh b/packaging/rpm/build-flagcx-rpm.sh
new file mode 100755
index 000000000..a4216bfb0
--- /dev/null
+++ b/packaging/rpm/build-flagcx-rpm.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+set -e
+
+# FlagCX RPM package build script
+# Usage: ./build-flagcx-rpm.sh <backend> [base_image_version]
+# Supported backends: nvidia, metax, ascend
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
+BACKEND="${1:-}"
+BASE_IMAGE_VERSION="${2:-}"
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
+log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
+log_step() { echo -e "${BLUE}[STEP]${NC} $1"; }
+
+# Show usage (the backend is a required first argument)
+if [ -z "$BACKEND" ]; then
+    log_error "No backend specified"
+    echo ""
+    echo "Usage: $0 <backend> [base_image_version]"
+    echo ""
+    echo "Supported backends:"
+    echo " nvidia - Build RPM packages for NVIDIA GPUs"
+    echo " metax - Build RPM packages for MetaX accelerators"
+    echo " ascend - Build RPM packages for Ascend NPUs"
+    echo ""
+    echo "Examples:"
+    echo " $0 nvidia"
+    echo " $0 ascend 8.5.0-910-openeuler24.03-py3.11"
+    exit 1
+fi
+
+# Validate backend and set base image; an empty/omitted version falls back to the per-backend default
+case "$BACKEND" in
+    nvidia)
+        BASE_IMAGE="nvcr.io/nvidia/cuda"
+        BASE_IMAGE_VERSION="${BASE_IMAGE_VERSION:-12.4.1-devel-rockylinux8}"
+        DOCKERFILE="${SCRIPT_DIR}/dockerfiles/Dockerfile.rpm.nvidia"
+        ;;
+    metax)
+        BASE_IMAGE="rockylinux"
+        BASE_IMAGE_VERSION="${BASE_IMAGE_VERSION:-8}"
+        DOCKERFILE="${SCRIPT_DIR}/dockerfiles/Dockerfile.rpm.metax"
+        ;;
+    ascend)
+        BASE_IMAGE="ascendai/cann"
+        BASE_IMAGE_VERSION="${BASE_IMAGE_VERSION:-8.5.0-910-openeuler24.03-py3.11}"
+        DOCKERFILE="${SCRIPT_DIR}/dockerfiles/Dockerfile.rpm.ascend"
+        ;;
+    *)
+        log_error "Invalid backend: $BACKEND"
+        echo "Supported backends: nvidia, metax, ascend"
+        exit 1
+        ;;
+esac
+
+log_info "Building FlagCX RPM packages for $BACKEND backend"
+log_info "Using base image: ${BASE_IMAGE}:${BASE_IMAGE_VERSION}"
+
+# Sync changelog from CHANGELOG.md (best effort: a failure only warns)
+log_step "Synchronizing changelog..."
+if [ -f "${PROJECT_DIR}/packaging/sync-changelog.py" ]; then
+    python3 "${PROJECT_DIR}/packaging/sync-changelog.py" || log_warn "Failed to sync changelog"
+else
+    log_warn "sync-changelog.py not found, skipping changelog sync"
+fi
+
+# Build Docker image using backend-specific Dockerfile with shared RPM logic.
+log_step "Building Docker image..."
+docker build \
+    --network=host \
+    --build-arg BASE_IMAGE_VERSION="${BASE_IMAGE_VERSION}" \
+    -f "${DOCKERFILE}" \
+    -t "flagcx-rpm-${BACKEND}:${BASE_IMAGE_VERSION}" \
+    "${PROJECT_DIR}"
+
+# Extract RPM packages by copying them out of a created (never started) container
+log_step "Extracting RPM packages..."
+OUTPUT_DIR="${PROJECT_DIR}/rpm-packages/${BACKEND}"
+mkdir -p "${OUTPUT_DIR}"
+
+CONTAINER_ID=$(docker create "flagcx-rpm-${BACKEND}:${BASE_IMAGE_VERSION}")
+trap 'docker rm -f "${CONTAINER_ID}" >/dev/null 2>&1 || true' EXIT  # clean up even if a docker cp aborts under set -e
+docker cp "${CONTAINER_ID}:/root/rpmbuild/RPMS/" "${OUTPUT_DIR}/"
+docker cp "${CONTAINER_ID}:/root/rpmbuild/SRPMS/" "${OUTPUT_DIR}/"
+
+# Fail loudly if no RPMs were extracted, so CI doesn't silently upload empty artifacts.
+if ! find "${OUTPUT_DIR}" -name '*.rpm' | grep -q .; then
+    log_error "No RPM packages found under ${OUTPUT_DIR}"
+    exit 1
+fi
+
+log_info "✓ Packages built successfully for ${BACKEND}:"
+echo ""
+find "${OUTPUT_DIR}" -name "*.rpm" -exec ls -lh {} \;
+
+log_info "Build complete! Packages in: ${OUTPUT_DIR}"
diff --git a/packaging/rpm/dockerfiles/Dockerfile.rpm.ascend b/packaging/rpm/dockerfiles/Dockerfile.rpm.ascend
new file mode 100644
index 000000000..f709d0508
--- /dev/null
+++ b/packaging/rpm/dockerfiles/Dockerfile.rpm.ascend
@@ -0,0 +1,11 @@
+# Build FlagCX Ascend RPM packages on CANN OpenEuler images.
+
+ARG BASE_IMAGE_VERSION=8.5.0-910-openeuler24.03-py3.11
+FROM ascendai/cann:${BASE_IMAGE_VERSION}
+
+WORKDIR /workspace
+COPY . /workspace/
+
+RUN bash packaging/rpm/dockerfiles/build-rpm-common.sh ascend
+
+CMD ["/bin/bash"]
diff --git a/packaging/rpm/dockerfiles/Dockerfile.rpm.metax b/packaging/rpm/dockerfiles/Dockerfile.rpm.metax
new file mode 100644
index 000000000..a3191727e
--- /dev/null
+++ b/packaging/rpm/dockerfiles/Dockerfile.rpm.metax
@@ -0,0 +1,20 @@
+# Build FlagCX MetaX RPM packages on Rocky Linux.
+
+ARG BASE_IMAGE_VERSION=8
+FROM rockylinux:${BASE_IMAGE_VERSION}
+
+WORKDIR /workspace
+COPY . /workspace/
+
+# MetaX packages are served from the public MACA yum repository.
+# TODO: switch gpgcheck=1 once MetaX publishes a stable GPG key for
+# repos.metax-tech.com. Today this repo serves unsigned packages.
+RUN printf '[maca-sdk]\nname=MACA SDK Yum Repository\nbaseurl=https://repos.metax-tech.com/r/maca-sdk-rpm-x86_64/\nenabled=1\ngpgcheck=0\n' \
+    > /etc/yum.repos.d/maca-sdk-rpm.repo && \
+    yum makecache && \
+    yum install -y maca_sdk && \
+    yum clean all
+
+RUN bash packaging/rpm/dockerfiles/build-rpm-common.sh metax
+
+CMD ["/bin/bash"]
diff --git a/packaging/rpm/dockerfiles/Dockerfile.rpm.nvidia b/packaging/rpm/dockerfiles/Dockerfile.rpm.nvidia
new file mode 100644
index 000000000..f23cb0f15
--- /dev/null
+++ b/packaging/rpm/dockerfiles/Dockerfile.rpm.nvidia
@@ -0,0 +1,11 @@
+# Build FlagCX NVIDIA RPM packages on Rocky Linux based CUDA images.
+
+ARG BASE_IMAGE_VERSION=12.4.1-devel-rockylinux8
+FROM nvcr.io/nvidia/cuda:${BASE_IMAGE_VERSION}
+
+WORKDIR /workspace
+COPY . /workspace/
+
+RUN bash packaging/rpm/dockerfiles/build-rpm-common.sh nvidia
+
+CMD ["/bin/bash"]
diff --git a/packaging/rpm/dockerfiles/build-rpm-common.sh b/packaging/rpm/dockerfiles/build-rpm-common.sh
new file mode 100755
index 000000000..ed7104fec
--- /dev/null
+++ b/packaging/rpm/dockerfiles/build-rpm-common.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Shared in-container build driver used by the Dockerfile.rpm.* images.
+# Usage: build-rpm-common.sh <nvidia|metax|ascend>
+# Installs the build toolchain, packs the workspace into the rpmbuild source
+# tarball, then runs rpmbuild on flagcx.spec for the requested backend.
+set -euo pipefail
+
+WORKSPACE_DIR="/workspace"
+SPEC_FILE="${WORKSPACE_DIR}/packaging/rpm/specs/flagcx.spec"
+BACKEND="${1:-}"
+
+if [ -z "${BACKEND}" ]; then
+    echo "ERROR: backend is required" >&2
+    exit 1
+fi
+
+case "${BACKEND}" in
+    nvidia|metax|ascend)
+        ;;
+    *)
+        echo "ERROR: unsupported backend: ${BACKEND}" >&2
+        exit 1
+        ;;
+esac
+
+# Prefer dnf, fall back to yum; `|| true` keeps set -e from aborting before
+# the explicit error message below can be printed.
+PKG_MANAGER="$(command -v dnf || command -v yum || true)"
+if [ -z "${PKG_MANAGER}" ]; then
+    echo "ERROR: neither dnf nor yum is available in the base image" >&2
+    exit 1
+fi
+
+# EPEL is best effort: a failed install only logs a notice and continues.
+"${PKG_MANAGER}" install -y epel-release || \
+    echo "EPEL not available for this base image, continuing without it"
+
+"${PKG_MANAGER}" install -y \
+    rpm-build \
+    rpmdevtools \
+    gcc-c++ \
+    make \
+    cmake \
+    patchelf
+
+# The nlohmann-json package name differs by distro; try both before giving up.
+"${PKG_MANAGER}" install -y json-devel 2>/dev/null \
+    || "${PKG_MANAGER}" install -y nlohmann-json-devel 2>/dev/null \
+    || { echo "ERROR: neither json-devel nor nlohmann-json-devel is available; rpmbuild requires nlohmann::json headers" >&2; exit 1; }
+
+"${PKG_MANAGER}" clean all
+
+rpmdev-setuptree
+
+# The tarball name and its top-level directory are derived from the spec's
+# Version, so refuse to continue if it cannot be parsed.
+SPEC_VERSION="$(awk '/^Version:/ {print $2; exit}' "${SPEC_FILE}")"
+if [ -z "${SPEC_VERSION}" ]; then
+    echo "ERROR: could not read Version from ${SPEC_FILE}" >&2
+    exit 1
+fi
+
+# -C pins the archive root to the workspace regardless of the caller's cwd.
+# The trailing S flag keeps --transform from rewriting symlink targets
+# (a relative target such as ../x would otherwise become flagcx-<ver>./x).
+tar czf "/root/rpmbuild/SOURCES/flagcx-${SPEC_VERSION}.tar.gz" \
+    --transform "s,^\.,flagcx-${SPEC_VERSION},S" \
+    --exclude='.git' \
+    --exclude='build' \
+    --exclude='debian-packages' \
+    --exclude='rpm-packages' \
+    -C "${WORKSPACE_DIR}" \
+    .
+
+rpmbuild -ba \
+    --define "backend ${BACKEND}" \
+    "${SPEC_FILE}"
+
+ls -lh /root/rpmbuild/RPMS/*/*.rpm
diff --git a/packaging/rpm/specs/flagcx.spec b/packaging/rpm/specs/flagcx.spec
new file mode 100644
index 000000000..6cec195f5
--- /dev/null
+++ b/packaging/rpm/specs/flagcx.spec
@@ -0,0 +1,150 @@
+%global debug_package %{nil}
+%global _build_id_links none
+# Main "flagcx" package intentionally has no %%files of its own; all artifacts
+# live in the libflagcx-%%{backend}{,-devel} subpackages. Without this guard,
+# rpmbuild treats an empty main package manifest as an error.
+%global _empty_manifest_terminate_build 0
+
+# Backend must be specified via: rpmbuild --define 'backend nvidia|metax|ascend'
+%{!?backend: %{error: backend must be defined (nvidia, metax, or ascend)}}
+
+# Derive uppercase backend name for make flag (USE_NVIDIA=1, etc.)
+%global backend_upper %(echo %{backend} | tr a-z A-Z)
+
+# Pin build/install arch by backend. Ascend CANN images are available for
+# both x86_64 development hosts and aarch64 deployment hosts; NVIDIA and
+# MetaX RPM builds currently target x86_64. ExclusiveArch makes rpmbuild
+# refuse to start on unsupported hosts, avoiding CPU-arch-mislabeled RPMs.
+%if "%{backend}" != "ascend"
+ExclusiveArch:  x86_64
+%else
+ExclusiveArch:  x86_64 aarch64
+%endif
+
+Name:           flagcx
+Version:        0.8.0
+Release:        1%{?dist}
+Summary:        FlagCX scalable cross-chip communication library
+
+License:        Apache-2.0
+URL:            https://github.com/flagos-ai/FlagCX
+# The trailing '#/...' fragment gives the source the local file name
+# %%{name}-%%{version}.tar.gz, i.e. the tarball build-rpm-common.sh creates.
+Source0:        %{url}/archive/refs/tags/v%{version}.tar.gz#/%{name}-%{version}.tar.gz
+
+# Build toolchain; patchelf is run in the install section to fix RPATH/SONAME.
+BuildRequires:  gcc-c++
+BuildRequires:  make
+BuildRequires:  cmake
+BuildRequires:  patchelf
+# The nlohmann::json headers ship under different package names:
+#   - RHEL/Rocky 8 (via EPEL):       json-devel
+#   - RHEL/Rocky 9+ (via EPEL):      nlohmann-json-devel
+#   - non-RHEL (OpenEuler, others):  nlohmann-json-devel (assumed upstream name)
+# EL8 is the only outlier, so one conditional covers every case.
+# TODO: verify Rocky 9 / RHEL 9 build path end-to-end; the EPEL 9 package
+# name is nlohmann-json-devel, but this has only been smoke-tested.
+%if 0%{?rhel} == 8
+BuildRequires:  json-devel
+%else
+BuildRequires:  nlohmann-json-devel
+%endif
+
+%description
+FlagCX is a scalable and adaptive cross-chip communication library.
+It serves as a platform where developers, researchers, and AI engineers
+can collaborate on various projects.
+
+# Subpackages are generated for the selected backend only.
+%package -n libflagcx-%{backend}
+Summary:        FlagCX library for %{backend}
+%if "%{backend}" == "nvidia"
+# NCCL gained the group-call API in 2.10 and ncclConfig in 2.14. FlagCX's
+# adaptor needs at least 2.10 today; raise the floor to 2.14 once ncclConfig
+# is confirmed to be exercised.
+Requires:       libnccl >= 2.10
+%endif
+
+%description -n libflagcx-%{backend}
+FlagCX communication library built for %{backend} hardware.
+
+# The -devel subpackage is locked to the exact version-release of the runtime.
+%package -n libflagcx-%{backend}-devel
+Summary:        Development files for libflagcx-%{backend}
+Requires:       libflagcx-%{backend} = %{version}-%{release}
+
+%description -n libflagcx-%{backend}-devel
+Development files (headers and libraries) for libflagcx-%{backend}.
+
+%prep
+%setup -q
+
+%build
+make USE_%{backend_upper}=1 PREFIX=%{_prefix}
+
+%install
+rm -rf %{buildroot}
+
+# Install shared library as libflagcx.so.0 plus the unversioned dev symlink
+install -d %{buildroot}%{_libdir}
+install -m 755 build/lib/libflagcx.so %{buildroot}%{_libdir}/libflagcx.so.0
+ln -s libflagcx.so.0 %{buildroot}%{_libdir}/libflagcx.so
+
+# Install headers
+install -d %{buildroot}%{_includedir}/flagcx
+cp -r flagcx/include/* %{buildroot}%{_includedir}/flagcx/
+
+# Fix RPATH and set SONAME — fail loud if patchelf can't normalize the .so,
+# otherwise a misconfigured SONAME ships and crashes consumers at runtime.
+patchelf --remove-rpath %{buildroot}%{_libdir}/libflagcx.so.0
+patchelf --set-soname libflagcx.so.0 %{buildroot}%{_libdir}/libflagcx.so.0
+
+%files -n libflagcx-%{backend}
+%license LICENSE
+%{_libdir}/libflagcx.so.0
+
+%files -n libflagcx-%{backend}-devel
+%{_includedir}/flagcx/
+%{_libdir}/libflagcx.so
+
+%changelog
+* Sat Nov 01 2025 FlagOS Contributors - 0.7-1
+- Added support to TsingMicro, including device adaptor tsmicroAdaptor and CCL adaptor tcclAdaptor.
+- Implemented an experimental kernel-free non-reduce collective communication (SendRecv, AlltoAll, AlltoAllv, Broadcast, Gather, Scatter, AllGather) using device-buffer IPC/RDMA.
+- Enabled auto-tuning on NVIDIA, MetaX, and Hygon platforms, achieving 1.02×–1.26× speedups for AllReduce, AllGather, ReduceScatter, and AlltoAll.
+- Enhanced flagcxNetAdaptor with one-sided primitives (put, putSignal, waitValue) and added retransmission support for reliability improvement.
+
+* Wed Oct 01 2025 FlagOS Contributors - 0.6-1
+- Implemented device-buffer IPC communication to support intra-node SendRecv operations.
+- Introduced device-initiated, host-launched device-side primitives, enabling kernel-based communication directly from devices.
+- Enhanced auto-tuning with 50%% performance improvement on MetaX platforms for the AllReduce operations.
+ +* Mon Sep 01 2025 FlagOS Contributors - 0.5-1 +- Added support for AMD GPUs, including a device adaptor hipAdaptor and a CCL adaptor rcclAdaptor. +- Introduced flagcxNetAdaptor to unify network backends, currently supporting socket, IBRC, UCX and IBUC (experimental). +- Enabled zero-copy device-buffer RDMA (user-buffer RDMA) to boost performance for small messages. +- Supported auto-tuning in homogeneous scenarios via flagcxTuner. +- Added test automation in CI/CD for PyTorch APIs. + +* Fri Aug 01 2025 FlagOS Contributors - 0.4-1 +- Supported heterogeneous training of ERNIE4.5 (Baidu) on NVIDIA and Iluvatar GPUs with Paddle + FlagCX. +- Improved heterogeneous communication across arbitrary NIC configurations, with more robust and flexible deployments. +- Introduced an experimental network plugin interface with extended supports for IBRC and SOCKET. Device buffer registration now can be done via DMA-BUF. +- Added an InterOp-level DSL to enable customized C2C algorithm design. +- Provided user documentation under docs/. + +* Tue Jul 01 2025 FlagOS Contributors - 0.3-1 +- Integrated three additional native communication libraries: HCCL (Huawei), MUSACCL (Moore Threads) and MPI. +- Enhanced heterogeneous collective communication operations with pipeline optimizations. +- Introduced device-side functions to enable device-buffer RDMA, complementing the existing host-side functions. +- Delivered a full-stack open-source solution, FlagScale + FlagCX, for efficient heterogeneous prefilling-decoding disaggregation. + +* Thu May 01 2025 FlagOS Contributors - 0.2-1 +- Integrated 3 additional native communications libraries, including MCCL (Moore Threads), XCCL (Mellanox) and DUCCL (BAAI). +- Improved 11 heterogeneous collective communication operations with automatic topology detection and full support to single-NIC and multi-NIC environments. 
+ +* Tue Apr 01 2025 FlagOS Contributors - 0.1-1 +- Added 5 native communications libraries including CCL adaptors for NCCL (NVIDIA), IXCCL (Iluvatar), and CNCL (Cambricon), and Host CCL adaptors GLOO and Bootstrap. +- Supported 11 heterogeneous collective communication operations using the C2C (Cluster-to-Cluster) algorithm. +- Provided a full-stack open-source solution, FlagScale + FlagCX, for efficient heterogeneous training. +- Natively integrated into PaddlePaddle [v3.0.0](https://github.com/PaddlePaddle/Paddle/tree/v3.0.0), with support for both dynamic and static graphs.