Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions .github/configs/metax.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# MetaX Hardware Configuration
# CI/CD settings for MetaX-based testing.
# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml
#
# NOTE: values tagged TODO below are placeholders carried over from another
# platform and still need to be replaced with real MetaX settings.

hardware_name: metax
display_name: "MetaX Tests"

# Docker images for this hardware
# TODO: Replace with actual MetaX Docker images
ci_image: localhost:5000/flagscale:metax-TODO
ci_train_image: localhost:5000/flagscale-train:metax-TODO
ci_inference_image: localhost:5000/flagscale-inference:metax-TODO

# Runner labels for this hardware
# TODO: Replace with actual MetaX runner labels
runner_labels:
  - self-hosted
  - Linux
  - X64
  - metax-0  # TODO: Update to actual MetaX runner label
  - gpus-8  # TODO: Update to actual GPU count

# Container volumes (hardware-specific host:container mount pairs)
# TODO: Update paths if MetaX uses different mount points
container_volumes:
  - /home/flagscale_cicd/flask/static:/workspace/report
  - /home/flagscale_cicd/flask/config:/workspace/config
  - /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data
  - /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers
  - /home/flagscale_cicd/sccache:/github/home/.cache/sccache

# Container options (hardware-specific settings)
# TODO: Update GPU runtime options for MetaX (e.g., replace --gpus all if needed)
container_options: "--gpus all --shm-size=500g --hostname flagscale_cicd --user root --ulimit nofile=65535:65535"

# =============================================================================
# Package Manager Configuration
# =============================================================================
# Supported package managers: pip, uv, conda
#   - pip:   Use pip directly (standard Python)
#   - uv:    Use uv pip (fast, modern package manager)
#   - conda: Use conda environment with pip for PyPI packages
#
# TODO: Update package manager settings for MetaX environment
pkg_mgr: "conda"

# Environment path (venv path for uv, conda installation path for conda)
# TODO: Update to actual MetaX environment path
env_path: "/root/miniconda3"

# Conda environment name per test type (for conda only)
# TODO: Update environment names for MetaX
env_names:
  train: "flagscale-train"
  hetero_train: "flagscale-train"
  inference: "flagscale-inference"
  rl: "flagscale-rl"
31 changes: 31 additions & 0 deletions .github/workflows/all_tests_metax.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
name: metax_tests

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]

# One in-flight run per workflow + PR number (or ref, for pushes) + actor;
# a newer run in the same group cancels the one already in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true

jobs:
  run_tests:
    # Package manager and environment settings are read from .github/configs/metax.yml
    uses: ./.github/workflows/all_tests_common.yml
    with:
      platform: metax

  # Gate job: `if: always()` makes it run even when run_tests did not succeed,
  # so the step below can turn any non-"success" result into a failure.
  all_tests:
    needs: run_tests
    runs-on: ubuntu-latest
    if: always()
    env:
      # Expression is resolved once here; the script reads it as a plain shell variable.
      RUN_TESTS_RESULT: ${{ needs.run_tests.result }}
    steps:
      - name: Verify workflow status
        run: |
          if [ "$RUN_TESTS_RESULT" != "success" ]; then
            echo "❌ Tests workflow failed"
            exit 1
          fi
          echo "✅ All tests passed!"
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
defaults:
- envs: cuda
- _self_
- train: dp2dp4_shared_embedding

Expand All @@ -13,19 +14,7 @@ experiment:
ssh_port: null
shell_cmds: null
envs:
HYDRA_FULL_ERROR: 1
CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5"
CUDA_DEVICE_MAX_CONNECTIONS: 1
CUBLAS_WORKSPACE_CONFIG: ":4096:8"
NCCL_ALGO: "Ring"
NVTE_APPLY_QK_LAYER_SCALING: 0
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NVTE_FLASH_ATTN: 0
NVTE_FUSED_ATTN: 0
CUDNN_BENCHMARK: "false"
CUDNN_DETERMINISTIC: "true"
cmds:
before_start: source /root/miniconda3/bin/activate flagscale-train
action: run

hydra:
Expand Down
17 changes: 17 additions & 0 deletions tests/functional_tests/hetero_train/aquila/conf/envs/cuda.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# @package _global_
# CUDA platform environment variables for hetero_train/aquila.
# Selected by the test configs through `defaults: - envs: cuda`; the
# `_global_` package directive above merges these keys at the config root.
experiment:
  envs:
    HYDRA_FULL_ERROR: 1

    # Device visibility and communication
    # NOTE(review): the dp2dp4_shared_embedding config previously set
    # "0,1,2,3,4,5" inline - confirm exposing all 8 devices is intended there.
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
    NCCL_ALGO: "Ring"

    # NVTE_* switches
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    NVTE_FLASH_ATTN: 0
    NVTE_FUSED_ATTN: 0

    # cuDNN settings
    CUDNN_BENCHMARK: "false"
    CUDNN_DETERMINISTIC: "true"
  cmds:
    # Activates the training conda environment before the job starts.
    before_start: source /root/miniconda3/bin/activate flagscale-train
11 changes: 11 additions & 0 deletions tests/functional_tests/hetero_train/aquila/conf/envs/metax.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# @package _global_
# MetaX platform environment variables for hetero_train/aquila.
# Placeholder file: only the platform-neutral settings are filled in.
# TODO: Replace with actual MetaX environment variables
experiment:
  envs:
    HYDRA_FULL_ERROR: 1
    # TODO: MetaX visible devices env var
    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
  cmds:
    # TODO: Update conda env activation for MetaX
    before_start: source /root/miniconda3/bin/activate flagscale-train
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
defaults:
- envs: cuda
- _self_
- train: tp2dp1pp1_tp2dp2pp1_tp1dp2pp1

Expand All @@ -12,20 +13,6 @@ experiment:
runner:
ssh_port: null
shell_cmds: null
envs:
HYDRA_FULL_ERROR: 1
CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
CUDA_DEVICE_MAX_CONNECTIONS: 1
CUBLAS_WORKSPACE_CONFIG: ":4096:8"
NCCL_ALGO: "Ring"
NVTE_APPLY_QK_LAYER_SCALING: 0
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NVTE_FLASH_ATTN: 0
NVTE_FUSED_ATTN: 0
CUDNN_BENCHMARK: "false"
CUDNN_DETERMINISTIC: "true"
cmds:
before_start: source /root/miniconda3/bin/activate flagscale-train
action: run

hydra:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
defaults:
- envs: cuda
- _self_
- train: tp2pp1_tp4pp1_tp2pp1

Expand All @@ -12,20 +13,6 @@ experiment:
runner:
ssh_port: null
shell_cmds: null
envs:
HYDRA_FULL_ERROR: 1
CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
CUDA_DEVICE_MAX_CONNECTIONS: 1
CUBLAS_WORKSPACE_CONFIG: ":4096:8"
NCCL_ALGO: "Ring"
NVTE_APPLY_QK_LAYER_SCALING: 0
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NVTE_FLASH_ATTN: 0
NVTE_FUSED_ATTN: 0
CUDNN_BENCHMARK: "false"
CUDNN_DETERMINISTIC: "true"
cmds:
before_start: source /root/miniconda3/bin/activate flagscale-train
action: run

hydra:
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
{"lm loss:": {"values": [11.55754, 11.56045, 11.3609, 11.22254, 11.10463, 11.01332, 10.95259, 10.9088, 10.88758, 10.86586]}}
{
"cuda": {
"a100": {
"lm loss:": {
"values": [11.55754, 11.56045, 11.3609, 11.22254, 11.10463, 11.01332, 10.95259, 10.9088, 10.88758, 10.86586]
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
{"lm loss:": {"values": [11.62049, 11.61899, 11.41389, 11.27374, 11.15958, 11.07645, 11.01809, 10.97522, 10.95196, 10.93447]}}
{
"cuda": {
"a100": {
"lm loss:": {
"values": [11.62049, 11.61899, 11.41389, 11.27374, 11.15958, 11.07645, 11.01809, 10.97522, 10.95196, 10.93447]
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
{"lm loss:": {"values": [11.60803, 11.60942, 11.39587, 11.24672, 11.12878, 11.03954, 10.97887, 10.93456, 10.91292, 10.89361]}}
{
"cuda": {
"a100": {
"lm loss:": {
"values": [11.60803, 11.60942, 11.39587, 11.24672, 11.12878, 11.03954, 10.97887, 10.93456, 10.91292, 10.89361]
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
defaults:
- envs: cuda
- _self_
- inference: 7b_tp2

Expand All @@ -11,36 +12,6 @@ experiment:
entrypoint: flagscale/inference/inference_llm.py
runner:
hostfile: null
cmds:
before_start:
source /root/miniconda3/bin/activate flagscale-inference
envs:
HYDRA_FULL_ERROR: 1
CUBLAS_WORKSPACE_CONFIG: ":4096:8"
CUDNN_BENCHMARK: "false"
CUDNN_DETERMINISTIC: "true"
# Quantitative perception training related
NVTE_APPLY_QK_LAYER_SCALING: 0
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NVTE_FLASH_ATTN: 0
NVTE_FUSED_ATTN: 0
# GPU parallel control
CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
CUDA_DEVICE_MAX_CONNECTIONS: 1
NCCL_ALGO: "Ring"
NCCL_PROTOCOL: LLC
# Basic randomness control
SEED: 1234
PYTHONHASHSEED: 0
MKL_NUM_THREADS: 1
OMP_NUM_THREADS: 1
NUMEXPR_NUM_THREADS: 1
SCIPY_RDRANDOM: 0
TF_DETERMINISTIC_OPS: 1
TORCH_CUDNN_DETERMINISM: true
CUDA_LAUNCH_BLOCKING: 1
NCCL_DEBUG: INFO
MAGIC_CACHE: disabled

action: run

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# @package _global_
# CUDA platform environment variables for inference/deepseek_r1_distill_qwen
#
# The `@package _global_` directive on the first line is required: this file
# lives in the `envs` config group, and without the directive Hydra would nest
# its content under `envs.` instead of merging `experiment.envs` and
# `experiment.cmds` at the config root (where the inline settings used to be).
experiment:
  envs:
    HYDRA_FULL_ERROR: 1
    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
    CUDNN_BENCHMARK: "false"
    CUDNN_DETERMINISTIC: "true"

    # NVTE_* switches
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    NVTE_FLASH_ATTN: 0
    NVTE_FUSED_ATTN: 0

    # GPU parallel control
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_ALGO: "Ring"
    NCCL_PROTOCOL: LLC

    # Basic randomness control
    SEED: 1234
    PYTHONHASHSEED: 0
    MKL_NUM_THREADS: 1
    OMP_NUM_THREADS: 1
    NUMEXPR_NUM_THREADS: 1
    SCIPY_RDRANDOM: 0
    TF_DETERMINISTIC_OPS: 1
    # Left unquoted as in the original inline config (parsed as a YAML boolean).
    TORCH_CUDNN_DETERMINISM: true
    CUDA_LAUNCH_BLOCKING: 1
    NCCL_DEBUG: INFO
    MAGIC_CACHE: disabled
  cmds:
    # Activates the inference conda environment before the job starts.
    before_start: source /root/miniconda3/bin/activate flagscale-inference
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# @package _global_
# MetaX platform environment variables for inference/deepseek_r1_distill_qwen
#
# The `@package _global_` directive on the first line is required: this file
# lives in the `envs` config group, and without the directive Hydra would nest
# its content under `envs.` instead of merging `experiment.envs` and
# `experiment.cmds` at the config root.
#
# TODO: Replace with actual MetaX environment variables
experiment:
  envs:
    HYDRA_FULL_ERROR: 1

    # Basic randomness control
    SEED: 1234
    PYTHONHASHSEED: 0
    MKL_NUM_THREADS: 1
    OMP_NUM_THREADS: 1
    NUMEXPR_NUM_THREADS: 1
    SCIPY_RDRANDOM: 0
    TF_DETERMINISTIC_OPS: 1
    MAGIC_CACHE: disabled
    # TODO: MetaX visible devices and platform-specific env vars
    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
  cmds:
    # TODO: Update conda env activation for MetaX
    before_start: source /root/miniconda3/bin/activate flagscale-inference
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
defaults:
- envs: cuda
- _self_
- inference: 7b_tp2

Expand All @@ -11,37 +12,6 @@ experiment:
entrypoint: flagscale/inference/inference_llm.py
runner:
hostfile: null
cmds:
before_start:
source /root/miniconda3/bin/activate flagscale-inference
envs:
HYDRA_FULL_ERROR: 1
CUBLAS_WORKSPACE_CONFIG: ":4096:8"
CUDNN_BENCHMARK: "false"
CUDNN_DETERMINISTIC: "true"
USE_FLAGGEMS: "true"
# Quantitative perception training related
NVTE_APPLY_QK_LAYER_SCALING: 0
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NVTE_FLASH_ATTN: 0
NVTE_FUSED_ATTN: 0
# GPU parallel control
CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
CUDA_DEVICE_MAX_CONNECTIONS: 1
NCCL_ALGO: "Ring"
NCCL_PROTOCOL: LLC
# Basic randomness control
SEED: 1234
PYTHONHASHSEED: 0
MKL_NUM_THREADS: 1
OMP_NUM_THREADS: 1
NUMEXPR_NUM_THREADS: 1
SCIPY_RDRANDOM: 0
TF_DETERMINISTIC_OPS: 1
TORCH_CUDNN_DETERMINISM: true
CUDA_LAUNCH_BLOCKING: 1
NCCL_DEBUG: INFO
MAGIC_CACHE: disabled

action: run

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# @package _global_
# CUDA platform environment variables for inference/deepseek_r1_distill_qwen_flaggems
#
# The `@package _global_` directive on the first line is required: this file
# lives in the `envs` config group, and without the directive Hydra would nest
# its content under `envs.` instead of merging `experiment.envs` and
# `experiment.cmds` at the config root (where the inline settings used to be).
experiment:
  envs:
    HYDRA_FULL_ERROR: 1
    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
    CUDNN_BENCHMARK: "false"
    CUDNN_DETERMINISTIC: "true"
    USE_FLAGGEMS: "true"

    # NVTE_* switches
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    NVTE_FLASH_ATTN: 0
    NVTE_FUSED_ATTN: 0

    # GPU parallel control
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_ALGO: "Ring"
    NCCL_PROTOCOL: LLC

    # Basic randomness control
    SEED: 1234
    PYTHONHASHSEED: 0
    MKL_NUM_THREADS: 1
    OMP_NUM_THREADS: 1
    NUMEXPR_NUM_THREADS: 1
    SCIPY_RDRANDOM: 0
    TF_DETERMINISTIC_OPS: 1
    # Left unquoted as in the original inline config (parsed as a YAML boolean).
    TORCH_CUDNN_DETERMINISM: true
    CUDA_LAUNCH_BLOCKING: 1
    NCCL_DEBUG: INFO
    MAGIC_CACHE: disabled
  cmds:
    # Activates the inference conda environment before the job starts.
    before_start: source /root/miniconda3/bin/activate flagscale-inference
Loading
Loading