Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/workflows/build-rpm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Builds the FlagCX RPM packages for every supported backend and publishes
# them as workflow artifacts. Runs on version tags, on pull requests against
# main that touch the library or the RPM packaging, and on manual dispatch.
name: Build RPM Packages

on:
  push:
    tags:
      - 'v*'
  pull_request:
    branches: [ main ]
    paths:
      - 'flagcx/**'
      - 'packaging/rpm/**'
      - '.github/workflows/build-rpm.yml'
  workflow_dispatch:

# The job only reads the repository; artifacts are uploaded through the
# runner's own runtime token, so no write scopes are needed.
permissions:
  contents: read

jobs:
  build-rpm-packages:
    runs-on: h20

    strategy:
      # Let the remaining backends finish even when one of them fails.
      fail-fast: false
      matrix:
        backend: [nvidia, metax, ascend]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Build ${{ matrix.backend }} RPM packages
        run: ./packaging/rpm/build-flagcx-rpm.sh ${{ matrix.backend }}

      - name: Upload ${{ matrix.backend }} RPM packages
        uses: actions/upload-artifact@v4
        with:
          name: flagcx-${{ matrix.backend }}-rpm-packages
          path: rpm-packages/${{ matrix.backend }}/**/*.rpm
          # The default is only a warning; an empty artifact means the build
          # produced nothing usable, so fail the job instead.
          if-no-files-found: error
          retention-days: 7
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ build
plugin/*/build
test/*/build
debian-packages
rpm-packages

# Ignore compiled Python files and shared object files
plugin/*/*.so
Expand Down
100 changes: 100 additions & 0 deletions packaging/rpm/build-flagcx-rpm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/bin/bash
# Strict mode: abort on command failure (-e), on use of an unset variable (-u)
# and on a failure in any stage of a pipeline (pipefail).
set -euo pipefail

# FlagCX RPM package build script
#
# Usage: ./build-flagcx-rpm.sh <backend> [base_image_version]
# Supported backends: nvidia, metax, ascend
#
# Arguments:
#   backend             required; selects the base image and the build flavour
#   base_image_version  optional; tag of the base image, a per-backend default
#                       is chosen later in the script when it is omitted

# Directory that holds this script, and the repository root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"

# Positional arguments default to empty so that -u does not trip when they are
# missing; the emptiness is checked explicitly further down.
BACKEND="${1:-}"
BASE_IMAGE_VERSION="${2:-}"

# Colors (kept as escape sequences in text form; the log helpers decode them)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Logging helpers. Each prints "[LEVEL] <message>" with the level tag wrapped
# in the matching color variable (GREEN/YELLOW/RED/BLUE, reset with NC).
#
# Arguments: $1 - message text
# Outputs:   log_info and log_step write to stdout;
#            log_warn and log_error write to stderr.
#
# Only the color variables are escape-decoded (%b); the message is printed
# verbatim (%s), so backslashes in user-supplied text are not interpreted.
log_info() { printf '%b[INFO]%b %s\n' "${GREEN:-}" "${NC:-}" "${1-}"; }
log_warn() { printf '%b[WARN]%b %s\n' "${YELLOW:-}" "${NC:-}" "${1-}" >&2; }
log_error() { printf '%b[ERROR]%b %s\n' "${RED:-}" "${NC:-}" "${1-}" >&2; }
log_step() { printf '%b[STEP]%b %s\n' "${BLUE:-}" "${NC:-}" "${1-}"; }

# The backend argument is mandatory: without it, print the usage summary and
# stop with a non-zero status.
if [ -z "$BACKEND" ]; then
  log_error "No backend specified"
  printf '%s\n' \
    "" \
    "Usage: $0 <backend> [base_image_version]" \
    "" \
    "Supported backends:" \
    " nvidia - Build RPM packages for NVIDIA GPUs" \
    " metax - Build RPM packages for MetaX accelerators" \
    " ascend - Build RPM packages for Ascend NPUs" \
    "" \
    "Examples:" \
    " $0 nvidia" \
    " $0 ascend 8.5.0-910-openeuler24.03-py3.11"
  exit 1
fi

# Map the backend onto its base image. A base image version passed on the
# command line wins; otherwise the per-backend default is filled in.
case "$BACKEND" in
  nvidia)
    BASE_IMAGE="nvcr.io/nvidia/cuda"
    : "${BASE_IMAGE_VERSION:=12.4.1-devel-rockylinux8}"
    ;;
  metax)
    BASE_IMAGE="rockylinux"
    : "${BASE_IMAGE_VERSION:=8}"
    ;;
  ascend)
    BASE_IMAGE="ascendai/cann"
    : "${BASE_IMAGE_VERSION:=8.5.0-910-openeuler24.03-py3.11}"
    ;;
  *)
    log_error "Invalid backend: $BACKEND"
    printf '%s\n' "Supported backends: nvidia, metax, ascend"
    exit 1
    ;;
esac

log_info "Building FlagCX RPM packages for $BACKEND backend"
log_info "Using base image: ${BASE_IMAGE}:${BASE_IMAGE_VERSION}"

# Refresh the packaged changelog from CHANGELOG.md. This step is best effort:
# a missing helper or a failing run only produces a warning.
log_step "Synchronizing changelog..."
changelog_sync="${PROJECT_DIR}/packaging/sync-changelog.py"
if [ ! -f "${changelog_sync}" ]; then
  log_warn "sync-changelog.py not found, skipping changelog sync"
elif ! python3 "${changelog_sync}"; then
  log_warn "Failed to sync changelog"
fi

# Build the packaging image from the unified Dockerfile. The whole repository
# is the build context; backend and base image are passed as build arguments.
DOCKERFILE="${SCRIPT_DIR}/dockerfiles/Dockerfile.rpm"
log_step "Building Docker image..."
docker_build_args=(
  --network=host
  --build-arg "BASE_IMAGE=${BASE_IMAGE}"
  --build-arg "BASE_IMAGE_VERSION=${BASE_IMAGE_VERSION}"
  --build-arg "BACKEND=${BACKEND}"
  -f "${DOCKERFILE}"
  -t "flagcx-rpm-${BACKEND}:${BASE_IMAGE_VERSION}"
)
docker build "${docker_build_args[@]}" "${PROJECT_DIR}"

# Extract RPM packages
#
# The packages live inside the image that was just built. A stopped container
# is created from it, the rpmbuild output directories are copied to
# rpm-packages/<backend>/ in the repository, and the container is removed.
log_step "Extracting RPM packages..."
OUTPUT_DIR="${PROJECT_DIR}/rpm-packages/${BACKEND}"
mkdir -p "${OUTPUT_DIR}"

CONTAINER_ID=$(docker create "flagcx-rpm-${BACKEND}:${BASE_IMAGE_VERSION}")

# Binary RPMs are the purpose of the build, so a failed copy is an error.
# The outcome is recorded rather than acted on immediately so that the
# container is still removed before the script bails out.
rpms_copied=true
docker cp "${CONTAINER_ID}:/root/rpmbuild/RPMS/" "${OUTPUT_DIR}/" || rpms_copied=false

# Source RPMs stay best effort, but a failure is reported instead of hidden.
docker cp "${CONTAINER_ID}:/root/rpmbuild/SRPMS/" "${OUTPUT_DIR}/" 2>/dev/null \
  || log_warn "Could not copy source RPMs from /root/rpmbuild/SRPMS/"

docker rm "${CONTAINER_ID}"

if [ "${rpms_copied}" != true ]; then
  log_error "Failed to copy /root/rpmbuild/RPMS/ out of the build container"
  exit 1
fi

# Never announce success with an empty output directory.
if [ -z "$(find "${OUTPUT_DIR}" -type f -name '*.rpm' -print -quit)" ]; then
  log_error "No .rpm files found in ${OUTPUT_DIR}"
  exit 1
fi

log_info "✓ Packages built successfully for ${BACKEND}:"
echo ""
find "${OUTPUT_DIR}" -name "*.rpm" -exec ls -lh {} +

log_info "Build complete! Packages in: ${OUTPUT_DIR}"
61 changes: 61 additions & 0 deletions packaging/rpm/dockerfiles/Dockerfile.rpm
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Unified Dockerfile to build RPM packages for FlagCX
# Supports multiple backends via build arguments
#
# Build arguments:
#   BASE_IMAGE          - repository of the base image
#   BASE_IMAGE_VERSION  - tag of the base image
#   BACKEND             - backend name handed to rpmbuild as the "backend" macro
# The build context is copied to /workspace, so it has to be the repository
# root (the spec is read from /workspace/packaging/rpm/specs/flagcx.spec).

# Declared before FROM: these two are only consumed by the FROM line below.
ARG BASE_IMAGE
ARG BASE_IMAGE_VERSION

FROM ${BASE_IMAGE}:${BASE_IMAGE_VERSION}

# Declared after FROM so that the RUN instructions of this stage can read it.
ARG BACKEND

# Install RPM build tools and dependencies
# Handles differences between RHEL/Rocky (epel + json-devel) and
# OpenEuler (nlohmann-json-devel), and MetaX SDK repo setup.
# Best effort: yum errors are hidden and a failure to install epel-release is ignored.
RUN yum install -y epel-release 2>/dev/null || true
RUN yum install -y \
rpm-build \
rpmdevtools \
gcc-c++ \
make \
cmake \
patchelf \
&& yum clean all
# Tries json-devel first, then nlohmann-json-devel. If neither installs, this
# layer still succeeds; the spec's BuildRequires are then left unsatisfied.
RUN yum install -y json-devel 2>/dev/null \
|| yum install -y nlohmann-json-devel 2>/dev/null \
|| true

# MetaX-specific: configure MACA SDK yum repository and install SDK
# NOTE(review): the repository is written with gpgcheck=0, i.e. packages from it
# are installed without signature verification - confirm this is intended.
RUN if [ "${BACKEND}" = "metax" ]; then \
printf '[maca-sdk]\nname=MACA SDK Yum Repository\nbaseurl=https://repos.metax-tech.com/r/maca-sdk-rpm-x86_64/\nenabled=1\ngpgcheck=0\n' \
> /etc/yum.repos.d/maca-sdk-rpm.repo && \
yum makecache && \
yum install -y maca_sdk && \
yum clean all; \
fi

# Setup RPM build environment
# (the later steps expect the resulting tree under /root/rpmbuild)
RUN rpmdev-setuptree

# Copy source code
WORKDIR /workspace
COPY . /workspace/

# Read version from spec and create source tarball
# The tarball is named flagcx-<version>.tar.gz and --transform rewrites the
# leading "." of every member to flagcx-<version>, giving the archive a single
# top-level directory. VCS data and build/packaging outputs are left out.
RUN SPEC_VERSION=$(grep '^Version:' /workspace/packaging/rpm/specs/flagcx.spec | awk '{print $2}') && \
tar czf /root/rpmbuild/SOURCES/flagcx-${SPEC_VERSION}.tar.gz \
--transform "s,^\.,flagcx-${SPEC_VERSION}," \
--exclude='.git' \
--exclude='build' \
--exclude='debian-packages' \
--exclude='rpm-packages' \
.

# Build RPM with specified backend
# -ba builds both the binary and the source packages.
RUN rpmbuild -ba \
--define "backend ${BACKEND}" \
/workspace/packaging/rpm/specs/flagcx.spec

# List built packages
# Doubles as a sanity check: ls exits non-zero when the glob matches nothing,
# which fails the image build.
RUN ls -lh /root/rpmbuild/RPMS/*/*.rpm

CMD ["/bin/bash"]
122 changes: 122 additions & 0 deletions packaging/rpm/specs/flagcx.spec
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# RPM spec for FlagCX. A single rpmbuild run packages one hardware backend and
# yields two subpackages: the runtime library and its -devel companion.
# The two globals below switch off the debuginfo subpackage and the build-id
# symlinks.
%global debug_package %{nil}
%global _build_id_links none

# Backend must be specified via: rpmbuild --define 'backend nvidia|metax|ascend'
%{!?backend: %{error: backend must be defined (nvidia, metax, or ascend)}}

# Derive uppercase backend name for make flag (USE_NVIDIA=1, etc.)
%global backend_upper %(echo %{backend} | tr a-z A-Z)

Name: flagcx
Version: 0.8.0
Release: 1%{?dist}
Summary: FlagCX scalable cross-chip communication library

License: ASL 2.0
URL: https://github.com/flagos-ai/FlagCX
Source0: %{name}-%{version}.tar.gz

BuildRequires: gcc-c++
BuildRequires: make
BuildRequires: cmake
BuildRequires: patchelf
# nlohmann-json package name varies by distro
%if 0%{?rhel} == 8
BuildRequires: json-devel
%else
BuildRequires: nlohmann-json-devel
%endif

%description
FlagCX is a scalable and adaptive cross-chip communication library.
It serves as a platform where developers, researchers, and AI engineers
can collaborate on various projects.

# Only the target backend's subpackages are defined
%package -n libflagcx-%{backend}
Summary: FlagCX library for %{backend}
%if "%{backend}" == "nvidia"
Requires: libnccl >= 2.0
%endif

%description -n libflagcx-%{backend}
FlagCX communication library built for %{backend} hardware.

%package -n libflagcx-%{backend}-devel
Summary: Development files for libflagcx-%{backend}
Requires: libflagcx-%{backend} = %{version}-%{release}

%description -n libflagcx-%{backend}-devel
Development files (headers and libraries) for libflagcx-%{backend}.

%prep
%setup -q

%build
# The backend is selected through a single USE_<BACKEND>=1 make variable.
make USE_%{backend_upper}=1 PREFIX=%{_prefix}

%install
rm -rf %{buildroot}

# Install shared library
# The build output is installed under a versioned file name (.so.0); the
# unversioned name is a relative symlink to it.
install -d %{buildroot}%{_libdir}
install -m 755 build/lib/libflagcx.so %{buildroot}%{_libdir}/libflagcx.so.0
ln -s libflagcx.so.0 %{buildroot}%{_libdir}/libflagcx.so

# Install headers
install -d %{buildroot}%{_includedir}/flagcx
cp -r flagcx/include/* %{buildroot}%{_includedir}/flagcx/

# Fix RPATH and set SONAME
# Both patchelf calls are best effort: a failure is ignored and the build
# continues with the library as it is.
patchelf --remove-rpath %{buildroot}%{_libdir}/libflagcx.so.0 || true
patchelf --set-soname libflagcx.so.0 %{buildroot}%{_libdir}/libflagcx.so.0 || true

# Runtime subpackage: the license file and the versioned shared library.
%files -n libflagcx-%{backend}
%license LICENSE
%{_libdir}/libflagcx.so.0

# Development subpackage: the headers and the unversioned library symlink.
%files -n libflagcx-%{backend}-devel
%{_includedir}/flagcx/
%{_libdir}/libflagcx.so

%changelog
* Sat Nov 01 2025 FlagOS Contributors <contact@flagos.io> - 0.7-1
- Added support for TsingMicro, including device adaptor tsmicroAdaptor and CCL adaptor tcclAdaptor.
- Implemented an experimental kernel-free non-reduce collective communication (SendRecv, AlltoAll, AlltoAllv, Broadcast, Gather, Scatter, AllGather) using device-buffer IPC/RDMA.
- Enabled auto-tuning on NVIDIA, MetaX, and Hygon platforms, achieving 1.02×–1.26× speedups for AllReduce, AllGather, ReduceScatter, and AlltoAll.
- Enhanced flagcxNetAdaptor with one-sided primitives (put, putSignal, waitValue) and added retransmission support for reliability improvement.

* Wed Oct 01 2025 FlagOS Contributors <contact@flagos.io> - 0.6-1
- Implemented device-buffer IPC communication to support intra-node SendRecv operations.
- Introduced device-initiated, host-launched device-side primitives, enabling kernel-based communication directly from devices.
- Enhanced auto-tuning with 50% performance improvement on MetaX platforms for the AllReduce operations.

* Mon Sep 01 2025 FlagOS Contributors <contact@flagos.io> - 0.5-1
- Added support for AMD GPUs, including a device adaptor hipAdaptor and a CCL adaptor rcclAdaptor.
- Introduced flagcxNetAdaptor to unify network backends, currently supporting socket, IBRC, UCX and IBUC (experimental).
- Enabled zero-copy device-buffer RDMA (user-buffer RDMA) to boost performance for small messages.
- Supported auto-tuning in homogeneous scenarios via flagcxTuner.
- Added test automation in CI/CD for PyTorch APIs.

* Fri Aug 01 2025 FlagOS Contributors <contact@flagos.io> - 0.4-1
- Supported heterogeneous training of ERNIE4.5 (Baidu) on NVIDIA and Iluvatar GPUs with Paddle + FlagCX.
- Improved heterogeneous communication across arbitrary NIC configurations, with more robust and flexible deployments.
- Introduced an experimental network plugin interface with extended support for IBRC and SOCKET. Device buffer registration can now be done via DMA-BUF.
- Added an InterOp-level DSL to enable customized C2C algorithm design.
- Provided user documentation under docs/.

* Tue Jul 01 2025 FlagOS Contributors <contact@flagos.io> - 0.3-1
- Integrated three additional native communication libraries: HCCL (Huawei), MUSACCL (Moore Threads) and MPI.
- Enhanced heterogeneous collective communication operations with pipeline optimizations.
- Introduced device-side functions to enable device-buffer RDMA, complementing the existing host-side functions.
- Delivered a full-stack open-source solution, FlagScale + FlagCX, for efficient heterogeneous prefilling-decoding disaggregation.

* Thu May 01 2025 FlagOS Contributors <contact@flagos.io> - 0.2-1
- Integrated 3 additional native communication libraries, including MCCL (Moore Threads), XCCL (Mellanox) and DUCCL (BAAI).
- Improved 11 heterogeneous collective communication operations with automatic topology detection and full support for single-NIC and multi-NIC environments.

* Tue Apr 01 2025 FlagOS Contributors <contact@flagos.io> - 0.1-1
- Added 5 native communication libraries including CCL adaptors for NCCL (NVIDIA), IXCCL (Iluvatar), and CNCL (Cambricon), and Host CCL adaptors GLOO and Bootstrap.
- Supported 11 heterogeneous collective communication operations using the C2C (Cluster-to-Cluster) algorithm.
- Provided a full-stack open-source solution, FlagScale + FlagCX, for efficient heterogeneous training.
- Natively integrated into PaddlePaddle [v3.0.0](https://github.com/PaddlePaddle/Paddle/tree/v3.0.0), with support for both dynamic and static graphs.