diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml new file mode 100644 index 0000000000..14f2b4327e --- /dev/null +++ b/.github/configs/metax.yml @@ -0,0 +1,57 @@ +# MetaX Hardware Configuration +# This file defines CI/CD settings for MetaX-based testing +# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml + +hardware_name: metax +display_name: "MetaX Tests" + +# Docker image for this hardware +# TODO: Replace with actual MetaX Docker image +ci_image: localhost:5000/flagscale:metax-TODO +ci_train_image: localhost:5000/flagscale-train:metax-TODO +ci_inference_image: localhost:5000/flagscale-inference:metax-TODO + +# Runner labels for this hardware +# TODO: Replace with actual MetaX runner labels +runner_labels: + - self-hosted + - Linux + - X64 + - metax-0 # TODO: Update to actual MetaX runner label + - gpus-8 # TODO: Update to actual GPU count + +# Container volumes (hardware-specific paths) +# TODO: Update paths if MetaX uses different mount points +container_volumes: + - /home/flagscale_cicd/flask/static:/workspace/report + - /home/flagscale_cicd/flask/config:/workspace/config + - /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data + - /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers + - /home/flagscale_cicd/sccache:/github/home/.cache/sccache + +# Container options (hardware-specific settings) +# TODO: Update GPU runtime options for MetaX (e.g., replace --gpus all if needed) +container_options: "--gpus all --shm-size=500g --hostname flagscale_cicd --user root --ulimit nofile=65535:65535" + +# ============================================================================= +# Package Manager Configuration +# ============================================================================= +# Supported package managers: pip, uv, conda +# - pip: Use pip directly (standard Python) +# - uv: Use uv pip (fast, modern package manager) +# - conda: Use conda 
environment with pip for PyPI packages +# +# TODO: Update package manager settings for MetaX environment +pkg_mgr: "conda" + +# Environment path (venv path for uv, conda installation path for conda) +# TODO: Update to actual MetaX environment path +env_path: "/root/miniconda3" + +# Conda environment name (for conda only) +# TODO: Update environment names for MetaX +env_names: + train: "flagscale-train" + hetero_train: "flagscale-train" + inference: "flagscale-inference" + rl: "flagscale-rl" diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml new file mode 100644 index 0000000000..5d60d19c94 --- /dev/null +++ b/.github/workflows/all_tests_metax.yml @@ -0,0 +1,31 @@ +name: metax_tests + +on: + push: + branches: ["main"] + pull_request: + branches: ["main"] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }} + cancel-in-progress: true + +jobs: + run_tests: + # Package manager and environment settings are read from .github/configs/metax.yml + uses: ./.github/workflows/all_tests_common.yml + with: + platform: metax + + all_tests: + needs: run_tests + runs-on: ubuntu-latest + if: always() + steps: + - name: Verify workflow status + run: | + if [ "${{ needs.run_tests.result }}" != "success" ]; then + echo "❌ Tests workflow failed" + exit 1 + fi + echo "✅ All tests passed!" 
diff --git a/tests/functional_tests/hetero_train/aquila/conf/dp2dp4_shared_embedding.yaml b/tests/functional_tests/hetero_train/aquila/conf/dp2dp4_shared_embedding.yaml index e5e31421b6..3fdb1b46b9 100644 --- a/tests/functional_tests/hetero_train/aquila/conf/dp2dp4_shared_embedding.yaml +++ b/tests/functional_tests/hetero_train/aquila/conf/dp2dp4_shared_embedding.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - train: dp2dp4_shared_embedding @@ -13,19 +14,7 @@ experiment: ssh_port: null shell_cmds: null envs: - HYDRA_FULL_ERROR: 1 CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - NCCL_ALGO: "Ring" - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - cmds: - before_start: source /root/miniconda3/bin/activate flagscale-train action: run hydra: diff --git a/tests/functional_tests/hetero_train/aquila/conf/envs/cuda.yaml b/tests/functional_tests/hetero_train/aquila/conf/envs/cuda.yaml new file mode 100644 index 0000000000..6e2c0c5a1d --- /dev/null +++ b/tests/functional_tests/hetero_train/aquila/conf/envs/cuda.yaml @@ -0,0 +1,17 @@ +# @package _global_ +# CUDA platform environment variables for hetero_train/aquila +experiment: + envs: + HYDRA_FULL_ERROR: 1 + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + NCCL_ALGO: "Ring" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + CUDNN_BENCHMARK: "false" + CUDNN_DETERMINISTIC: "true" + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-train diff --git a/tests/functional_tests/hetero_train/aquila/conf/envs/metax.yaml b/tests/functional_tests/hetero_train/aquila/conf/envs/metax.yaml new file mode 100644 index 0000000000..6edf70969c --- /dev/null +++ 
b/tests/functional_tests/hetero_train/aquila/conf/envs/metax.yaml @@ -0,0 +1,11 @@ +# @package _global_ +# MetaX platform environment variables for hetero_train/aquila +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + # TODO: MetaX visible devices env var + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-train diff --git a/tests/functional_tests/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml b/tests/functional_tests/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml index 579f6cf28c..90581953e1 100644 --- a/tests/functional_tests/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml +++ b/tests/functional_tests/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - train: tp2dp1pp1_tp2dp2pp1_tp1dp2pp1 @@ -12,20 +13,6 @@ experiment: runner: ssh_port: null shell_cmds: null - envs: - HYDRA_FULL_ERROR: 1 - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - NCCL_ALGO: "Ring" - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - cmds: - before_start: source /root/miniconda3/bin/activate flagscale-train action: run hydra: diff --git a/tests/functional_tests/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml b/tests/functional_tests/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml index 8dbab54730..df67e12808 100644 --- a/tests/functional_tests/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml +++ b/tests/functional_tests/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - train: tp2pp1_tp4pp1_tp2pp1 @@ -12,20 +13,6 @@ experiment: runner: ssh_port: null shell_cmds: null - envs: - HYDRA_FULL_ERROR: 1 
- CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - NCCL_ALGO: "Ring" - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - cmds: - before_start: source /root/miniconda3/bin/activate flagscale-train action: run hydra: diff --git a/tests/functional_tests/hetero_train/aquila/gold_values/dp2dp4_shared_embedding.json b/tests/functional_tests/hetero_train/aquila/gold_values/dp2dp4_shared_embedding.json index 36e2b99379..3e4d573a7a 100644 --- a/tests/functional_tests/hetero_train/aquila/gold_values/dp2dp4_shared_embedding.json +++ b/tests/functional_tests/hetero_train/aquila/gold_values/dp2dp4_shared_embedding.json @@ -1 +1,9 @@ -{"lm loss:": {"values": [11.55754, 11.56045, 11.3609, 11.22254, 11.10463, 11.01332, 10.95259, 10.9088, 10.88758, 10.86586]}} +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [11.55754, 11.56045, 11.3609, 11.22254, 11.10463, 11.01332, 10.95259, 10.9088, 10.88758, 10.86586] + } + } + } +} diff --git a/tests/functional_tests/hetero_train/aquila/gold_values/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.json b/tests/functional_tests/hetero_train/aquila/gold_values/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.json index b30dd75dbc..d5c4cf4336 100644 --- a/tests/functional_tests/hetero_train/aquila/gold_values/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.json +++ b/tests/functional_tests/hetero_train/aquila/gold_values/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.json @@ -1 +1,9 @@ -{"lm loss:": {"values": [11.62049, 11.61899, 11.41389, 11.27374, 11.15958, 11.07645, 11.01809, 10.97522, 10.95196, 10.93447]}} +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [11.62049, 11.61899, 11.41389, 11.27374, 11.15958, 11.07645, 11.01809, 10.97522, 10.95196, 10.93447] + } + } + } +} diff --git a/tests/functional_tests/hetero_train/aquila/gold_values/tp2pp1_tp4pp1_tp2pp1.json 
b/tests/functional_tests/hetero_train/aquila/gold_values/tp2pp1_tp4pp1_tp2pp1.json index 17df314aa0..89e8a2c98b 100644 --- a/tests/functional_tests/hetero_train/aquila/gold_values/tp2pp1_tp4pp1_tp2pp1.json +++ b/tests/functional_tests/hetero_train/aquila/gold_values/tp2pp1_tp4pp1_tp2pp1.json @@ -1 +1,9 @@ -{"lm loss:": {"values": [11.60803, 11.60942, 11.39587, 11.24672, 11.12878, 11.03954, 10.97887, 10.93456, 10.91292, 10.89361]}} +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [11.60803, 11.60942, 11.39587, 11.24672, 11.12878, 11.03954, 10.97887, 10.93456, 10.91292, 10.89361] + } + } + } +} diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/7b_tp2.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/7b_tp2.yaml index c9ef51bd9b..c319571488 100644 --- a/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/7b_tp2.yaml +++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/7b_tp2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - inference: 7b_tp2 @@ -11,36 +12,6 @@ experiment: entrypoint: flagscale/inference/inference_llm.py runner: hostfile: null - cmds: - before_start: - source /root/miniconda3/bin/activate flagscale-inference - envs: - HYDRA_FULL_ERROR: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - # Quantitative perception training related - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - # GPU parallel control - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NCCL_ALGO: "Ring" - NCCL_PROTOCOL: LLC - # Basic randomness control - SEED: 1234 - PYTHONHASHSEED: 0 - MKL_NUM_THREADS: 1 - OMP_NUM_THREADS: 1 - NUMEXPR_NUM_THREADS: 1 - SCIPY_RDRANDOM: 0 - TF_DETERMINISTIC_OPS: 1 - TORCH_CUDNN_DETERMINISM: true - CUDA_LAUNCH_BLOCKING: 1 - NCCL_DEBUG: INFO - MAGIC_CACHE: disabled action: run diff --git 
a/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/cuda.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/cuda.yaml new file mode 100644 index 0000000000..cb88f22753 --- /dev/null +++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/cuda.yaml @@ -0,0 +1,29 @@ +# @package _global_ +# CUDA platform environment variables for inference/deepseek_r1_distill_qwen +experiment: + envs: + HYDRA_FULL_ERROR: 1 + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + CUDNN_BENCHMARK: "false" + CUDNN_DETERMINISTIC: "true" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NCCL_ALGO: "Ring" + NCCL_PROTOCOL: LLC + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + TORCH_CUDNN_DETERMINISM: true + CUDA_LAUNCH_BLOCKING: 1 + NCCL_DEBUG: INFO + MAGIC_CACHE: disabled + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/metax.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/metax.yaml new file mode 100644 index 0000000000..2567a8e2e1 --- /dev/null +++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/metax.yaml @@ -0,0 +1,19 @@ +# @package _global_ +# MetaX platform environment variables for inference/deepseek_r1_distill_qwen +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + MAGIC_CACHE: disabled + # TODO: MetaX visible devices and platform-specific env vars + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source 
/root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/7b_tp2.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/7b_tp2.yaml index e11a52e0c0..403d3fc8f5 100644 --- a/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/7b_tp2.yaml +++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/7b_tp2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - inference: 7b_tp2 @@ -11,37 +12,6 @@ experiment: entrypoint: flagscale/inference/inference_llm.py runner: hostfile: null - cmds: - before_start: - source /root/miniconda3/bin/activate flagscale-inference - envs: - HYDRA_FULL_ERROR: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - USE_FLAGGEMS: "true" - # Quantitative perception training related - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - # GPU parallel control - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NCCL_ALGO: "Ring" - NCCL_PROTOCOL: LLC - # Basic randomness control - SEED: 1234 - PYTHONHASHSEED: 0 - MKL_NUM_THREADS: 1 - OMP_NUM_THREADS: 1 - NUMEXPR_NUM_THREADS: 1 - SCIPY_RDRANDOM: 0 - TF_DETERMINISTIC_OPS: 1 - TORCH_CUDNN_DETERMINISM: true - CUDA_LAUNCH_BLOCKING: 1 - NCCL_DEBUG: INFO - MAGIC_CACHE: disabled action: run diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/cuda.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/cuda.yaml new file mode 100644 index 0000000000..dee0dc5f59 --- /dev/null +++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/cuda.yaml @@ -0,0 +1,30 @@ +# @package _global_ +# CUDA platform environment variables for inference/deepseek_r1_distill_qwen_flaggems +experiment: + envs: + HYDRA_FULL_ERROR: 1 + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + CUDNN_BENCHMARK: "false" + 
CUDNN_DETERMINISTIC: "true" + USE_FLAGGEMS: "true" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NCCL_ALGO: "Ring" + NCCL_PROTOCOL: LLC + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + TORCH_CUDNN_DETERMINISM: true + CUDA_LAUNCH_BLOCKING: 1 + NCCL_DEBUG: INFO + MAGIC_CACHE: disabled + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/metax.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/metax.yaml new file mode 100644 index 0000000000..1ed23b537c --- /dev/null +++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/metax.yaml @@ -0,0 +1,20 @@ +# @package _global_ +# MetaX platform environment variables for inference/deepseek_r1_distill_qwen_flaggems +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + USE_FLAGGEMS: "true" + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + MAGIC_CACHE: disabled + # TODO: MetaX visible devices and platform-specific env vars + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/inference/qwen3/conf/4b_tp2.yaml b/tests/functional_tests/inference/qwen3/conf/4b_tp2.yaml index ea940956de..eace560a8a 100644 --- a/tests/functional_tests/inference/qwen3/conf/4b_tp2.yaml +++ b/tests/functional_tests/inference/qwen3/conf/4b_tp2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - inference: 4b_tp2 @@ -11,35 +12,6 @@ experiment: entrypoint: 
flagscale/inference/inference_llm.py runner: hostfile: null - cmds: - before_start: - source /root/miniconda3/bin/activate flagscale-inference - envs: - HYDRA_FULL_ERROR: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - # Quantitative perception training related - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - # GPU parallel control - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NCCL_ALGO: "Ring" - NCCL_PROTOCOL: LLC - # Basic randomness control - SEED: 1234 - PYTHONHASHSEED: 0 - MKL_NUM_THREADS: 1 - OMP_NUM_THREADS: 1 - NUMEXPR_NUM_THREADS: 1 - SCIPY_RDRANDOM: 0 - TF_DETERMINISTIC_OPS: 1 - TORCH_CUDNN_DETERMINISM: true - CUDA_LAUNCH_BLOCKING: 1 - NCCL_DEBUG: INFO action: run diff --git a/tests/functional_tests/inference/qwen3/conf/envs/cuda.yaml b/tests/functional_tests/inference/qwen3/conf/envs/cuda.yaml new file mode 100644 index 0000000000..1032fa8d16 --- /dev/null +++ b/tests/functional_tests/inference/qwen3/conf/envs/cuda.yaml @@ -0,0 +1,28 @@ +# @package _global_ +# CUDA platform environment variables for inference/qwen3 +experiment: + envs: + HYDRA_FULL_ERROR: 1 + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + CUDNN_BENCHMARK: "false" + CUDNN_DETERMINISTIC: "true" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NCCL_ALGO: "Ring" + NCCL_PROTOCOL: LLC + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + TORCH_CUDNN_DETERMINISM: true + CUDA_LAUNCH_BLOCKING: 1 + NCCL_DEBUG: INFO + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/inference/qwen3/conf/envs/metax.yaml b/tests/functional_tests/inference/qwen3/conf/envs/metax.yaml new file mode 100644 index 
0000000000..3d54a300ef --- /dev/null +++ b/tests/functional_tests/inference/qwen3/conf/envs/metax.yaml @@ -0,0 +1,18 @@ +# @package _global_ +# MetaX platform environment variables for inference/qwen3 +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + # TODO: MetaX visible devices and platform-specific env vars + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/inference/qwen3_flaggems/conf/4b_tp2.yaml b/tests/functional_tests/inference/qwen3_flaggems/conf/4b_tp2.yaml index 47fe827f84..289ab240ef 100644 --- a/tests/functional_tests/inference/qwen3_flaggems/conf/4b_tp2.yaml +++ b/tests/functional_tests/inference/qwen3_flaggems/conf/4b_tp2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - inference: 4b_tp2 @@ -11,36 +12,6 @@ experiment: entrypoint: flagscale/inference/inference_llm.py runner: hostfile: null - cmds: - before_start: - source /root/miniconda3/bin/activate flagscale-inference - envs: - HYDRA_FULL_ERROR: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - USE_FLAGGEMS: "true" - # Quantitative perception training related - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - # GPU parallel control - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NCCL_ALGO: "Ring" - NCCL_PROTOCOL: LLC - # Basic randomness control - SEED: 1234 - PYTHONHASHSEED: 0 - MKL_NUM_THREADS: 1 - OMP_NUM_THREADS: 1 - NUMEXPR_NUM_THREADS: 1 - SCIPY_RDRANDOM: 0 - TF_DETERMINISTIC_OPS: 1 - TORCH_CUDNN_DETERMINISM: true - CUDA_LAUNCH_BLOCKING: 1 - NCCL_DEBUG: INFO action: run diff --git 
a/tests/functional_tests/inference/qwen3_flaggems/conf/envs/cuda.yaml b/tests/functional_tests/inference/qwen3_flaggems/conf/envs/cuda.yaml new file mode 100644 index 0000000000..61c066d60b --- /dev/null +++ b/tests/functional_tests/inference/qwen3_flaggems/conf/envs/cuda.yaml @@ -0,0 +1,29 @@ +# @package _global_ +# CUDA platform environment variables for inference/qwen3_flaggems +experiment: + envs: + HYDRA_FULL_ERROR: 1 + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + CUDNN_BENCHMARK: "false" + CUDNN_DETERMINISTIC: "true" + USE_FLAGGEMS: "true" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NCCL_ALGO: "Ring" + NCCL_PROTOCOL: LLC + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + TORCH_CUDNN_DETERMINISM: true + CUDA_LAUNCH_BLOCKING: 1 + NCCL_DEBUG: INFO + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/inference/qwen3_flaggems/conf/envs/metax.yaml b/tests/functional_tests/inference/qwen3_flaggems/conf/envs/metax.yaml new file mode 100644 index 0000000000..8260add4a2 --- /dev/null +++ b/tests/functional_tests/inference/qwen3_flaggems/conf/envs/metax.yaml @@ -0,0 +1,19 @@ +# @package _global_ +# MetaX platform environment variables for inference/qwen3_flaggems +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + USE_FLAGGEMS: "true" + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + # TODO: MetaX visible devices and platform-specific env vars + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git 
a/tests/functional_tests/inference/robobrain2/conf/7b_tp2.yaml b/tests/functional_tests/inference/robobrain2/conf/7b_tp2.yaml index f281dcc56b..628545bfe9 100644 --- a/tests/functional_tests/inference/robobrain2/conf/7b_tp2.yaml +++ b/tests/functional_tests/inference/robobrain2/conf/7b_tp2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - inference: 7b_tp2 @@ -11,39 +12,6 @@ experiment: entrypoint: flagscale/inference/inference_robobrain2.py runner: hostfile: null - cmds: - before_start: - source /root/miniconda3/bin/activate flagscale-inference - envs: - HYDRA_FULL_ERROR: 1 - VLLM_WORKER_MULTIPROC_METHOD: "spawn" - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - # Quantitative perception training related - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - # GPU parallel control - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NCCL_ALGO: "Ring" - NCCL_PROTOCOL: LLC - # Basic randomness control - SEED: 1234 - PYTHONHASHSEED: 0 - MKL_NUM_THREADS: 1 - OMP_NUM_THREADS: 1 - NUMEXPR_NUM_THREADS: 1 - SCIPY_RDRANDOM: 0 - TF_DETERMINISTIC_OPS: 1 - TORCH_CUDNN_DETERMINISM: true - CUDA_LAUNCH_BLOCKING: 1 - NCCL_DEBUG: INFO - MAGIC_CACHE: disabled - # Serve specific - USE_FS_SERVE: false action: run diff --git a/tests/functional_tests/inference/robobrain2/conf/envs/cuda.yaml b/tests/functional_tests/inference/robobrain2/conf/envs/cuda.yaml new file mode 100644 index 0000000000..b07c831c3d --- /dev/null +++ b/tests/functional_tests/inference/robobrain2/conf/envs/cuda.yaml @@ -0,0 +1,31 @@ +# @package _global_ +# CUDA platform environment variables for inference/robobrain2 +experiment: + envs: + HYDRA_FULL_ERROR: 1 + VLLM_WORKER_MULTIPROC_METHOD: "spawn" + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + CUDNN_BENCHMARK: "false" + CUDNN_DETERMINISTIC: "true" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 
+ CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NCCL_ALGO: "Ring" + NCCL_PROTOCOL: LLC + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + TORCH_CUDNN_DETERMINISM: true + CUDA_LAUNCH_BLOCKING: 1 + NCCL_DEBUG: INFO + MAGIC_CACHE: disabled + USE_FS_SERVE: false + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/inference/robobrain2/conf/envs/metax.yaml b/tests/functional_tests/inference/robobrain2/conf/envs/metax.yaml new file mode 100644 index 0000000000..d616a2654b --- /dev/null +++ b/tests/functional_tests/inference/robobrain2/conf/envs/metax.yaml @@ -0,0 +1,21 @@ +# @package _global_ +# MetaX platform environment variables for inference/robobrain2 +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + VLLM_WORKER_MULTIPROC_METHOD: "spawn" + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + MAGIC_CACHE: disabled + USE_FS_SERVE: false + # TODO: MetaX visible devices and platform-specific env vars + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/inference/robobrain2_flaggems/conf/7b_tp2.yaml b/tests/functional_tests/inference/robobrain2_flaggems/conf/7b_tp2.yaml index 53bdc6a5c8..733f9a0a94 100644 --- a/tests/functional_tests/inference/robobrain2_flaggems/conf/7b_tp2.yaml +++ b/tests/functional_tests/inference/robobrain2_flaggems/conf/7b_tp2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - inference: 7b_tp2 @@ -11,39 +12,6 @@ experiment: entrypoint: flagscale/inference/inference_robobrain2.py runner: hostfile: null - cmds: - before_start: - source /root/miniconda3/bin/activate 
flagscale-inference - envs: - HYDRA_FULL_ERROR: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - USE_FLAGGEMS: "true" - # Quantitative perception training related - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - # GPU parallel control - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NCCL_ALGO: "Ring" - NCCL_PROTOCOL: LLC - # Basic randomness control - SEED: 1234 - PYTHONHASHSEED: 0 - MKL_NUM_THREADS: 1 - OMP_NUM_THREADS: 1 - NUMEXPR_NUM_THREADS: 1 - SCIPY_RDRANDOM: 0 - TF_DETERMINISTIC_OPS: 1 - TORCH_CUDNN_DETERMINISM: true - CUDA_LAUNCH_BLOCKING: 1 - NCCL_DEBUG: INFO - MAGIC_CACHE: disabled - # Serve specific - USE_FS_SERVE: false action: run diff --git a/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/cuda.yaml b/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/cuda.yaml new file mode 100644 index 0000000000..ff36d3700a --- /dev/null +++ b/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/cuda.yaml @@ -0,0 +1,31 @@ +# @package _global_ +# CUDA platform environment variables for inference/robobrain2_flaggems +experiment: + envs: + HYDRA_FULL_ERROR: 1 + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + CUDNN_BENCHMARK: "false" + CUDNN_DETERMINISTIC: "true" + USE_FLAGGEMS: "true" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NCCL_ALGO: "Ring" + NCCL_PROTOCOL: LLC + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + TORCH_CUDNN_DETERMINISM: true + CUDA_LAUNCH_BLOCKING: 1 + NCCL_DEBUG: INFO + MAGIC_CACHE: disabled + USE_FS_SERVE: false + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git 
a/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/metax.yaml b/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/metax.yaml new file mode 100644 index 0000000000..b845adabae --- /dev/null +++ b/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/metax.yaml @@ -0,0 +1,21 @@ +# @package _global_ +# MetaX platform environment variables for inference/robobrain2_flaggems +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + USE_FLAGGEMS: "true" + SEED: 1234 + PYTHONHASHSEED: 0 + MKL_NUM_THREADS: 1 + OMP_NUM_THREADS: 1 + NUMEXPR_NUM_THREADS: 1 + SCIPY_RDRANDOM: 0 + TF_DETERMINISTIC_OPS: 1 + MAGIC_CACHE: disabled + USE_FS_SERVE: false + # TODO: MetaX visible devices and platform-specific env vars + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/rl/qwen2_5/conf/0_5b.yaml b/tests/functional_tests/rl/qwen2_5/conf/0_5b.yaml index 694e8a3707..ce898a758f 100644 --- a/tests/functional_tests/rl/qwen2_5/conf/0_5b.yaml +++ b/tests/functional_tests/rl/qwen2_5/conf/0_5b.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - rl: 0_5b - _self_ @@ -9,15 +10,10 @@ experiment: type: rl backend: verl entrypoint: verl.trainer.main_ppo - cmds: - before_start: source /root/miniconda3/bin/activate flagscale-RL runner: nnodes: 1 nproc_per_node: 8 hostfile: null - envs: - CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 - TORCH_DEVICE_BACKEND_AUTOLOAD: 0 action: run diff --git a/tests/functional_tests/rl/qwen2_5/conf/envs/cuda.yaml b/tests/functional_tests/rl/qwen2_5/conf/envs/cuda.yaml new file mode 100644 index 0000000000..7e61557b9d --- /dev/null +++ b/tests/functional_tests/rl/qwen2_5/conf/envs/cuda.yaml @@ -0,0 +1,8 @@ +# @package _global_ +# CUDA platform environment variables for rl/qwen2_5 +experiment: + envs: + CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + TORCH_DEVICE_BACKEND_AUTOLOAD: 0 + cmds: + 
before_start: source /root/miniconda3/bin/activate flagscale-RL diff --git a/tests/functional_tests/rl/qwen2_5/conf/envs/metax.yaml b/tests/functional_tests/rl/qwen2_5/conf/envs/metax.yaml new file mode 100644 index 0000000000..b3001a8fc9 --- /dev/null +++ b/tests/functional_tests/rl/qwen2_5/conf/envs/metax.yaml @@ -0,0 +1,11 @@ +# @package _global_ +# MetaX platform environment variables for rl/qwen2_5 +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + TORCH_DEVICE_BACKEND_AUTOLOAD: 0 + # TODO: MetaX visible devices env var + # MACA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-RL diff --git a/tests/functional_tests/serve/base/conf/envs/cuda.yaml b/tests/functional_tests/serve/base/conf/envs/cuda.yaml new file mode 100644 index 0000000000..4ba380c504 --- /dev/null +++ b/tests/functional_tests/serve/base/conf/envs/cuda.yaml @@ -0,0 +1,8 @@ +# @package _global_ +# CUDA platform environment variables for serve/base +experiment: + envs: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + no_proxy: "127.0.0.1,localhost" + cmds: + before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/serve/base/conf/envs/metax.yaml b/tests/functional_tests/serve/base/conf/envs/metax.yaml new file mode 100644 index 0000000000..8b524d4c23 --- /dev/null +++ b/tests/functional_tests/serve/base/conf/envs/metax.yaml @@ -0,0 +1,10 @@ +# @package _global_ +# MetaX platform environment variables for serve/base +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + no_proxy: "127.0.0.1,localhost" + # TODO: MetaX device connection env var + cmds: + # TODO: Update conda env activation for MetaX + before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/serve/base/conf/multiple_model.yaml b/tests/functional_tests/serve/base/conf/multiple_model.yaml index 727794f93a..019a4b6014 100644 
--- a/tests/functional_tests/serve/base/conf/multiple_model.yaml +++ b/tests/functional_tests/serve/base/conf/multiple_model.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - serve: multiple_model @@ -22,11 +23,6 @@ experiment: type: Optional[str] required: false default: "You are a helpful assistant." - envs: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - no_proxy: "127.0.0.1,localhost" - cmds: - before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference action: run diff --git a/tests/functional_tests/serve/qwen2_5/conf/0.5b.yaml b/tests/functional_tests/serve/qwen2_5/conf/0.5b.yaml index c23595fb65..5bcc6a1ed8 100644 --- a/tests/functional_tests/serve/qwen2_5/conf/0.5b.yaml +++ b/tests/functional_tests/serve/qwen2_5/conf/0.5b.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - serve: 0.5b @@ -15,10 +16,6 @@ experiment: use_fs_serve: false envs: CUDA_VISIBLE_DEVICES: 0 - CUDA_DEVICE_MAX_CONNECTIONS: 1 - no_proxy: "127.0.0.1,localhost" - cmds: - before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference action: run diff --git a/tests/functional_tests/serve/qwen2_5/conf/0.5b_multiple_instance.yaml b/tests/functional_tests/serve/qwen2_5/conf/0.5b_multiple_instance.yaml index e4f03ac2f5..cbd26eac07 100644 --- a/tests/functional_tests/serve/qwen2_5/conf/0.5b_multiple_instance.yaml +++ b/tests/functional_tests/serve/qwen2_5/conf/0.5b_multiple_instance.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - serve: 0.5b_multiple_instance @@ -9,7 +10,7 @@ experiment: type: serve entrypoint: null runner: - hostfile: null # /path/to/hostfile.txt + hostfile: null docker: ds ssh_port: 22 nnodes: 1 @@ -22,17 +23,14 @@ experiment: tensor_model_parallel_size: [1] pipeline_model_parallel_size: [1] instance: [2] - block_size: [16] # [8, 16, 32] + block_size: [16] max_num_batched_tokens: [512] - max_num_seqs: [128] # [128, 256] + max_num_seqs: [128] control: interval: 20 run_best: false - cmds: - before_start: ulimit -n 
65535 && source /root/miniconda3/bin/activate flagscale-inference envs: CUDA_VISIBLE_DEVICES: 0,1 - no_proxy: "127.0.0.1,localhost" RAY_DEDUP_LOGS: 0 action: auto_tune diff --git a/tests/functional_tests/serve/qwen2_5/conf/envs/cuda.yaml b/tests/functional_tests/serve/qwen2_5/conf/envs/cuda.yaml new file mode 100644 index 0000000000..2de8bcf2d2 --- /dev/null +++ b/tests/functional_tests/serve/qwen2_5/conf/envs/cuda.yaml @@ -0,0 +1,7 @@ +# CUDA platform environment variables for serve/qwen2_5 +experiment: + envs: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + no_proxy: "127.0.0.1,localhost" + cmds: + before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/serve/qwen2_5/conf/envs/metax.yaml b/tests/functional_tests/serve/qwen2_5/conf/envs/metax.yaml new file mode 100644 index 0000000000..9915bb30b8 --- /dev/null +++ b/tests/functional_tests/serve/qwen2_5/conf/envs/metax.yaml @@ -0,0 +1,9 @@ +# MetaX platform environment variables for serve/qwen2_5 +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + no_proxy: "127.0.0.1,localhost" + # TODO: MetaX device connection env var + cmds: + # TODO: Update conda env activation for MetaX + before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/serve/qwen3/conf/4b.yaml b/tests/functional_tests/serve/qwen3/conf/4b.yaml new file mode 100644 index 0000000000..248135d744 --- /dev/null +++ b/tests/functional_tests/serve/qwen3/conf/4b.yaml @@ -0,0 +1,24 @@ +defaults: + - envs: cuda + - _self_ + - serve: 4b + +experiment: + exp_name: qwen3 + exp_dir: tests/functional_tests/serve/qwen3/test_results/4b + task: + type: serve + entrypoint: null + runner: + hostfile: null + deploy: + port: 6704 + use_fs_serve: false + envs: + CUDA_VISIBLE_DEVICES: 0 + +action: run + +hydra: + run: + dir: ${experiment.exp_dir}/hydra diff --git 
a/tests/functional_tests/serve/qwen3/conf/4b_multiple_instance.yaml b/tests/functional_tests/serve/qwen3/conf/4b_multiple_instance.yaml new file mode 100644 index 0000000000..5d7be1771f --- /dev/null +++ b/tests/functional_tests/serve/qwen3/conf/4b_multiple_instance.yaml @@ -0,0 +1,40 @@ +defaults: + - envs: cuda + - _self_ + - serve: 4b_multiple_instance + +experiment: + exp_name: qwen3 + exp_dir: tests/functional_tests/serve/qwen3/test_results/4b_multiple_instance + task: + type: serve + entrypoint: null + runner: + hostfile: null + docker: ds + ssh_port: 22 + nnodes: 1 + nproc_per_node: 2 + deploy: + port: 6705 + use_fs_serve: true + auto_tuner: + space: + tensor_model_parallel_size: [1] + pipeline_model_parallel_size: [1] + instance: [2] + block_size: [16] + max_num_batched_tokens: [512] + max_num_seqs: [128] + control: + interval: 20 + run_best: false + envs: + CUDA_VISIBLE_DEVICES: 0,1 + RAY_DEDUP_LOGS: 0 + +action: auto_tune + +hydra: + run: + dir: ${experiment.exp_dir}/hydra diff --git a/tests/functional_tests/serve/qwen3/conf/envs/cuda.yaml b/tests/functional_tests/serve/qwen3/conf/envs/cuda.yaml new file mode 100644 index 0000000000..fcdaada3f3 --- /dev/null +++ b/tests/functional_tests/serve/qwen3/conf/envs/cuda.yaml @@ -0,0 +1,7 @@ +# CUDA platform environment variables for serve/qwen3 +experiment: + envs: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + no_proxy: "127.0.0.1,localhost" + cmds: + before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/serve/qwen3/conf/envs/metax.yaml b/tests/functional_tests/serve/qwen3/conf/envs/metax.yaml new file mode 100644 index 0000000000..07d07246f2 --- /dev/null +++ b/tests/functional_tests/serve/qwen3/conf/envs/metax.yaml @@ -0,0 +1,9 @@ +# MetaX platform environment variables for serve/qwen3 +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + no_proxy: "127.0.0.1,localhost" + # TODO: MetaX device connection env var + cmds: + # 
TODO: Update conda env activation for MetaX + before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference diff --git a/tests/functional_tests/serve/qwen3/conf/serve/4b.yaml b/tests/functional_tests/serve/qwen3/conf/serve/4b.yaml new file mode 100644 index 0000000000..0009f0f773 --- /dev/null +++ b/tests/functional_tests/serve/qwen3/conf/serve/4b.yaml @@ -0,0 +1,12 @@ +- serve_id: vllm_model + engine: vllm + engine_args: + model: /home/gitlab-runner/data/Qwen3-4B + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_len: 32768 + max_num_seqs: 256 + enforce_eager: true + trust_remote_code: true + enable_chunked_prefill: true diff --git a/tests/functional_tests/serve/qwen3/conf/serve/4b_multiple_instance.yaml b/tests/functional_tests/serve/qwen3/conf/serve/4b_multiple_instance.yaml new file mode 100644 index 0000000000..18bda1597a --- /dev/null +++ b/tests/functional_tests/serve/qwen3/conf/serve/4b_multiple_instance.yaml @@ -0,0 +1,16 @@ +- serve_id: vllm_model + engine: vllm + engine_args: + model: /home/gitlab-runner/data/Qwen3-4B + served_model_name: qwen + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_len: 32768 + max_num_seqs: 256 + enforce_eager: true + trust_remote_code: true + enable_chunked_prefill: true + resources: + num_replicas: 2 + num_gpus: 1 diff --git a/tests/functional_tests/serve/qwen3/gold_values/4b.json b/tests/functional_tests/serve/qwen3/gold_values/4b.json new file mode 100644 index 0000000000..0f8a4237c6 --- /dev/null +++ b/tests/functional_tests/serve/qwen3/gold_values/4b.json @@ -0,0 +1 @@ +{"response": ", including his background, philosophy, and legacy. 
Also, include his influence on martial arts and popular"} diff --git a/tests/functional_tests/serve/qwen3/gold_values/4b_multiple_instance.json b/tests/functional_tests/serve/qwen3/gold_values/4b_multiple_instance.json new file mode 100644 index 0000000000..c637dad45b --- /dev/null +++ b/tests/functional_tests/serve/qwen3/gold_values/4b_multiple_instance.json @@ -0,0 +1 @@ +{"response": ". Bruce Lee was a legendary martial artist, actor, and film director who lived from 19"} diff --git a/tests/functional_tests/train/aquila/conf/envs/cuda.yaml b/tests/functional_tests/train/aquila/conf/envs/cuda.yaml new file mode 100644 index 0000000000..04d6cbcc79 --- /dev/null +++ b/tests/functional_tests/train/aquila/conf/envs/cuda.yaml @@ -0,0 +1,17 @@ +# @package _global_ +# CUDA platform environment variables for train/aquila +experiment: + envs: + HYDRA_FULL_ERROR: 1 + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + NCCL_ALGO: "Ring" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + CUDNN_BENCHMARK: "false" + CUDNN_DETERMINISTIC: "true" + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-train diff --git a/tests/functional_tests/train/aquila/conf/envs/metax.yaml b/tests/functional_tests/train/aquila/conf/envs/metax.yaml new file mode 100644 index 0000000000..b147a75cd6 --- /dev/null +++ b/tests/functional_tests/train/aquila/conf/envs/metax.yaml @@ -0,0 +1,11 @@ +# @package _global_ +# MetaX platform environment variables for train/aquila +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + # TODO: MetaX visible devices env var + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-train diff --git a/tests/functional_tests/train/aquila/conf/tp2_pp2.yaml 
b/tests/functional_tests/train/aquila/conf/tp2_pp2.yaml index 76091741cb..b958bd6f06 100644 --- a/tests/functional_tests/train/aquila/conf/tp2_pp2.yaml +++ b/tests/functional_tests/train/aquila/conf/tp2_pp2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - train: tp2_pp2 @@ -12,20 +13,6 @@ experiment: runner: ssh_port: null shell_cmds: null - envs: - HYDRA_FULL_ERROR: 1 - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - NCCL_ALGO: "Ring" - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - cmds: - before_start: source /root/miniconda3/bin/activate flagscale-train action: run hydra: diff --git a/tests/functional_tests/train/aquila/conf/tp4_pp2.yaml b/tests/functional_tests/train/aquila/conf/tp4_pp2.yaml index 560aa9a06e..2b9f856e53 100644 --- a/tests/functional_tests/train/aquila/conf/tp4_pp2.yaml +++ b/tests/functional_tests/train/aquila/conf/tp4_pp2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - train: tp4_pp2 @@ -12,20 +13,6 @@ experiment: runner: ssh_port: null shell_cmds: null - envs: - HYDRA_FULL_ERROR: 1 - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - NCCL_ALGO: "Ring" - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - cmds: - before_start: source /root/miniconda3/bin/activate flagscale-train action: run hydra: diff --git a/tests/functional_tests/train/aquila/gold_values/tp2_pp2.json b/tests/functional_tests/train/aquila/gold_values/tp2_pp2.json index 8f66ae59d4..d0f4266397 100644 --- a/tests/functional_tests/train/aquila/gold_values/tp2_pp2.json +++ b/tests/functional_tests/train/aquila/gold_values/tp2_pp2.json @@ -1 +1,9 @@ -{"lm loss:": {"values": [11.63839, 11.63632, 11.49138, 
11.38161, 11.28878, 11.21583, 74297.01, 11.22827, 11.16404, 11.1232, 11.10253, 11.07966, 11.07366, 64451.08, 11.07513, 64546.05, 11.07297, 64406.66]}} +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [11.63839, 11.63632, 11.49138, 11.38161, 11.28878, 11.21583, 74297.01, 11.22827, 11.16404, 11.1232, 11.10253, 11.07966, 11.07366, 64451.08, 11.07513, 64546.05, 11.07297, 64406.66] + } + } + } +} diff --git a/tests/functional_tests/train/aquila/gold_values/tp4_pp2.json b/tests/functional_tests/train/aquila/gold_values/tp4_pp2.json index 50d94279d9..2a3fca2f11 100644 --- a/tests/functional_tests/train/aquila/gold_values/tp4_pp2.json +++ b/tests/functional_tests/train/aquila/gold_values/tp4_pp2.json @@ -1 +1,9 @@ -{"lm loss:": {"values": [11.61479, 11.61304, 11.46996, 11.36611, 11.26903, 11.20205, 11.15256, 11.11047, 11.09079, 11.07022]}} +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [11.61479, 11.61304, 11.46996, 11.36611, 11.26903, 11.20205, 11.15256, 11.11047, 11.09079, 11.07022] + } + } + } +} diff --git a/tests/functional_tests/train/deepseek/conf/envs/cuda.yaml b/tests/functional_tests/train/deepseek/conf/envs/cuda.yaml new file mode 100644 index 0000000000..64d1bfe3bc --- /dev/null +++ b/tests/functional_tests/train/deepseek/conf/envs/cuda.yaml @@ -0,0 +1,14 @@ +# CUDA platform environment variables for train/deepseek +experiment: + envs: + HYDRA_FULL_ERROR: 1 + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + NCCL_ALGO: "Ring" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + CUDNN_BENCHMARK: "false" + CUDNN_DETERMINISTIC: "true" + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-train diff --git a/tests/functional_tests/train/deepseek/conf/envs/metax.yaml b/tests/functional_tests/train/deepseek/conf/envs/metax.yaml new file mode 100644 index 0000000000..e089abd513 --- /dev/null +++ b/tests/functional_tests/train/deepseek/conf/envs/metax.yaml @@ 
-0,0 +1,10 @@ +# MetaX platform environment variables for train/deepseek +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + # TODO: MetaX visible devices env var + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-train diff --git a/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2.yaml b/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2.yaml index 686853877b..c4d9442a17 100644 --- a/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2.yaml +++ b/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - train: tp2_pp2_ep2 @@ -12,26 +13,6 @@ experiment: runner: ssh_port: null shell_cmds: null - envs: - HYDRA_FULL_ERROR: 1 - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - NCCL_ALGO: "Ring" - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - # Only for debug - # NVTE_DEBUG: 1 - # NVTE_DEBUG_LEVEL: 2 - # CUDNN_LOGERR_DBG: 1 - # The following parameters passed the local test - # CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - # CUDA_DEVICE_MAX_CONNECTIONS: 1 - # NVTE_TORCH_COMPILE: 0 - cmds: - before_start: source /root/miniconda3/bin/activate flagscale-train action: run hydra: diff --git a/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2_engram.yaml b/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2_engram.yaml index 14ed97c215..25ef469677 100644 --- a/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2_engram.yaml +++ b/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2_engram.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - train: tp2_pp2_ep2_engram @@ -12,26 +13,6 @@ experiment: runner: ssh_port: null shell_cmds: null - envs: - HYDRA_FULL_ERROR: 1 - CUDA_VISIBLE_DEVICES: 
"0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - NCCL_ALGO: "Ring" - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - # Only for debug - # NVTE_DEBUG: 1 - # NVTE_DEBUG_LEVEL: 2 - # CUDNN_LOGERR_DBG: 1 - # The following parameters passed the local test - # CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - # CUDA_DEVICE_MAX_CONNECTIONS: 1 - # NVTE_TORCH_COMPILE: 0 - cmds: - before_start: source /root/miniconda3/bin/activate flagscale-train action: run hydra: diff --git a/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2.json b/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2.json index fc56d2a014..8c5bc07ace 100644 --- a/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2.json +++ b/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2.json @@ -1 +1,9 @@ -{"lm loss:": {"values": [12.32605, 12.33183, 18.6708, 13.8059, 9.761076, 10.68691, 10.55642, 10.59995, 10.00202, 11.28867]}} +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [12.32605, 12.33183, 18.6708, 13.8059, 9.761076, 10.68691, 10.55642, 10.59995, 10.00202, 11.28867] + } + } + } +} diff --git a/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2_engram.json b/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2_engram.json index 2830ce8a2f..df995035ed 100644 --- a/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2_engram.json +++ b/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2_engram.json @@ -1 +1,9 @@ -{"lm loss:": {"values": [12.31449, 12.31611, 20.91898, 12.73568, 13.30583, 11.24752, 9.966835, 11.10967, 10.74055, 10.14268]}} +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [12.31449, 12.31611, 20.91898, 12.73568, 13.30583, 11.24752, 9.966835, 11.10967, 10.74055, 10.14268] + } + } + } +} diff --git a/tests/functional_tests/train/mixtral/conf/envs/cuda.yaml 
b/tests/functional_tests/train/mixtral/conf/envs/cuda.yaml new file mode 100644 index 0000000000..0cd4bcb196 --- /dev/null +++ b/tests/functional_tests/train/mixtral/conf/envs/cuda.yaml @@ -0,0 +1,16 @@ +# CUDA platform environment variables for train/mixtral +experiment: + envs: + HYDRA_FULL_ERROR: 1 + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + NCCL_ALGO: "Ring" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + CUDNN_BENCHMARK: "false" + CUDNN_DETERMINISTIC: "true" + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-train diff --git a/tests/functional_tests/train/mixtral/conf/envs/metax.yaml b/tests/functional_tests/train/mixtral/conf/envs/metax.yaml new file mode 100644 index 0000000000..c509107020 --- /dev/null +++ b/tests/functional_tests/train/mixtral/conf/envs/metax.yaml @@ -0,0 +1,10 @@ +# MetaX platform environment variables for train/mixtral +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + # TODO: MetaX visible devices env var + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-train diff --git a/tests/functional_tests/train/mixtral/conf/tp2_pp1_ep2.yaml b/tests/functional_tests/train/mixtral/conf/tp2_pp1_ep2.yaml index a711648de0..4bc6f7da7d 100644 --- a/tests/functional_tests/train/mixtral/conf/tp2_pp1_ep2.yaml +++ b/tests/functional_tests/train/mixtral/conf/tp2_pp1_ep2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - train: tp2_pp1_ep2 @@ -12,20 +13,6 @@ experiment: runner: ssh_port: null shell_cmds: null - envs: - HYDRA_FULL_ERROR: 1 - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - NCCL_ALGO: "Ring" - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 
- NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - cmds: - before_start: source /root/miniconda3/bin/activate flagscale-train action: run hydra: diff --git a/tests/functional_tests/train/mixtral/conf/tp4_pp1_ep2.yaml b/tests/functional_tests/train/mixtral/conf/tp4_pp1_ep2.yaml index a48e17e647..d81ce72d92 100644 --- a/tests/functional_tests/train/mixtral/conf/tp4_pp1_ep2.yaml +++ b/tests/functional_tests/train/mixtral/conf/tp4_pp1_ep2.yaml @@ -1,4 +1,5 @@ defaults: + - envs: cuda - _self_ - train: tp4_pp1_ep2 @@ -12,20 +13,6 @@ experiment: runner: ssh_port: null shell_cmds: null - envs: - HYDRA_FULL_ERROR: 1 - CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" - CUDA_DEVICE_MAX_CONNECTIONS: 1 - CUBLAS_WORKSPACE_CONFIG: ":4096:8" - NCCL_ALGO: "Ring" - NVTE_APPLY_QK_LAYER_SCALING: 0 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NVTE_FLASH_ATTN: 0 - NVTE_FUSED_ATTN: 0 - CUDNN_BENCHMARK: "false" - CUDNN_DETERMINISTIC: "true" - cmds: - before_start: source /root/miniconda3/bin/activate flagscale-train action: run hydra: diff --git a/tests/functional_tests/train/mixtral/gold_values/tp2_pp1_ep2.json b/tests/functional_tests/train/mixtral/gold_values/tp2_pp1_ep2.json index c35033d3d2..1d39a255b2 100644 --- a/tests/functional_tests/train/mixtral/gold_values/tp2_pp1_ep2.json +++ b/tests/functional_tests/train/mixtral/gold_values/tp2_pp1_ep2.json @@ -1 +1,9 @@ -{"lm loss:": {"values": [11.17587, 11.16908, 10.41927, 11.66834, 9.679541, 9.481043, 9.194503, 9.234812, 9.128164, 8.960205]}} +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [11.17587, 11.16908, 10.41927, 11.66834, 9.679541, 9.481043, 9.194503, 9.234812, 9.128164, 8.960205] + } + } + } +} diff --git a/tests/functional_tests/train/mixtral/gold_values/tp4_pp1_ep2.json b/tests/functional_tests/train/mixtral/gold_values/tp4_pp1_ep2.json index bea8991fda..72990e9a55 100644 --- a/tests/functional_tests/train/mixtral/gold_values/tp4_pp1_ep2.json +++ 
b/tests/functional_tests/train/mixtral/gold_values/tp4_pp1_ep2.json @@ -1 +1,9 @@ -{"lm loss:": {"values": [11.21206, 11.20481, 10.32151, 12.73904, 10.828, 10.09151, 9.412704, 9.257725, 9.139534, 9.054203]}} +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [11.21206, 11.20481, 10.32151, 12.73904, 10.828, 10.09151, 9.412704, 9.257725, 9.139534, 9.054203] + } + } + } +} diff --git a/tests/functional_tests/train/qwen3/conf/envs/cuda.yaml b/tests/functional_tests/train/qwen3/conf/envs/cuda.yaml new file mode 100644 index 0000000000..1fc01e091a --- /dev/null +++ b/tests/functional_tests/train/qwen3/conf/envs/cuda.yaml @@ -0,0 +1,17 @@ +# @package _global_ +# CUDA platform environment variables for train/qwen3 +experiment: + envs: + HYDRA_FULL_ERROR: 1 + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + CUBLAS_WORKSPACE_CONFIG: ":4096:8" + NCCL_ALGO: "Ring" + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + CUDNN_BENCHMARK: "false" + CUDNN_DETERMINISTIC: "true" + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-train diff --git a/tests/functional_tests/train/qwen3/conf/envs/metax.yaml b/tests/functional_tests/train/qwen3/conf/envs/metax.yaml new file mode 100644 index 0000000000..a33602967d --- /dev/null +++ b/tests/functional_tests/train/qwen3/conf/envs/metax.yaml @@ -0,0 +1,11 @@ +# @package _global_ +# MetaX platform environment variables for train/qwen3 +# TODO: Replace with actual MetaX environment variables +experiment: + envs: + HYDRA_FULL_ERROR: 1 + # TODO: MetaX visible devices env var + # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + cmds: + # TODO: Update conda env activation for MetaX + before_start: source /root/miniconda3/bin/activate flagscale-train diff --git a/tests/functional_tests/train/qwen3/conf/tp2_pp2.yaml b/tests/functional_tests/train/qwen3/conf/tp2_pp2.yaml new file mode 100644 index 0000000000..eb1d344d94 --- /dev/null +++ 
b/tests/functional_tests/train/qwen3/conf/tp2_pp2.yaml @@ -0,0 +1,20 @@ +defaults: + - envs: cuda + - _self_ + - train: tp2_pp2 + +experiment: + exp_name: tp2_pp2 + exp_dir: tests/functional_tests/train/qwen3/test_results/tp2_pp2 + task: + type: train + backend: megatron + entrypoint: flagscale/train/megatron/train_gpt.py + runner: + ssh_port: null + shell_cmds: null +action: run + +hydra: + run: + dir: ${experiment.exp_dir}/hydra diff --git a/tests/functional_tests/train/qwen3/conf/tp4_pp2.yaml b/tests/functional_tests/train/qwen3/conf/tp4_pp2.yaml new file mode 100644 index 0000000000..403ecf58c3 --- /dev/null +++ b/tests/functional_tests/train/qwen3/conf/tp4_pp2.yaml @@ -0,0 +1,20 @@ +defaults: + - envs: cuda + - _self_ + - train: tp4_pp2 + +experiment: + exp_name: tp4_pp2 + exp_dir: tests/functional_tests/train/qwen3/test_results/tp4_pp2 + task: + type: train + backend: megatron + entrypoint: flagscale/train/megatron/train_gpt.py + runner: + ssh_port: null + shell_cmds: null +action: run + +hydra: + run: + dir: ${experiment.exp_dir}/hydra diff --git a/tests/functional_tests/train/qwen3/conf/train/data.yaml b/tests/functional_tests/train/qwen3/conf/train/data.yaml new file mode 100644 index 0000000000..cd53f9eb6e --- /dev/null +++ b/tests/functional_tests/train/qwen3/conf/train/data.yaml @@ -0,0 +1,10 @@ +data: + data_path: /home/gitlab-runner/data/pile_wikipedia_demo/pile_wikipedia_demo + split: 1 + no_mmap_bin_files: true + tokenizer: + legacy_tokenizer: true + tokenizer_type: QwenTokenizerFS + tokenizer_path: /home/gitlab-runner/tokenizers/qwentokenizer + vocab_size: 151936 + make_vocab_size_divisible_by: 64 diff --git a/tests/functional_tests/train/qwen3/conf/train/tp2_pp2.yaml b/tests/functional_tests/train/qwen3/conf/train/tp2_pp2.yaml new file mode 100644 index 0000000000..316bce9820 --- /dev/null +++ b/tests/functional_tests/train/qwen3/conf/train/tp2_pp2.yaml @@ -0,0 +1,67 @@ +defaults: + - data + +system: + tensor_model_parallel_size: 2 + 
pipeline_model_parallel_size: 2 + disable_bias_linear: true + qk_layernorm: true + sequence_parallel: true + use_distributed_optimizer: true + precision: + bf16: true + attention_softmax_in_fp32: true + accumulate_allreduce_grads_in_fp32: true + logging: + log_interval: 1 + no_log_loss_scale_to_tensorboard: true + checkpoint: + no_save_optim: true + no_save_rng: true + save_interval: 100000 + tensorboard_log_interval: 999999 + +model: + attention_backend: unfused + deterministic_mode: true + use_mcore_models: true + transformer_impl: transformer_engine + num_layers: 4 + hidden_size: 512 + ffn_hidden_size: 1536 + kv_channels: 128 + num_attention_heads: 8 + group_query_attention: true + num_query_groups: 4 + seq_length: 1024 + max_position_embeddings: 1024 + norm_epsilon: 1e-6 + use_rotary_position_embeddings: true + rotary_base: 1000000 + no_position_embedding: true + no_rope_fusion: true + swiglu: true + normalization: RMSNorm + position_embedding_type: rope + untie_embeddings_and_output_weights: false + init_method_std: 0.02 + attention_dropout: 0.0 + hidden_dropout: 0.0 + weight_decay: 0.1 + clip_grad: 1.0 + train_iters: 10 + eval_iters: 0 + micro_batch_size: 4 + global_batch_size: 1024 + seed: 42 + + optimizer: + weight_decay: 0.1 + adam_beta1: 0.9 + adam_beta2: 0.95 + lr_scheduler: + lr: 2.0e-5 + min_lr: 2.0e-6 + lr_warmup_samples: 0 + lr_warmup_fraction: 0.01 + lr_decay_style: cosine diff --git a/tests/functional_tests/train/qwen3/conf/train/tp4_pp2.yaml b/tests/functional_tests/train/qwen3/conf/train/tp4_pp2.yaml new file mode 100644 index 0000000000..e18111ad3c --- /dev/null +++ b/tests/functional_tests/train/qwen3/conf/train/tp4_pp2.yaml @@ -0,0 +1,67 @@ +defaults: + - data + +system: + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 2 + disable_bias_linear: true + qk_layernorm: true + sequence_parallel: true + use_distributed_optimizer: true + precision: + bf16: true + attention_softmax_in_fp32: true + accumulate_allreduce_grads_in_fp32: true 
+ logging: + log_interval: 1 + no_log_loss_scale_to_tensorboard: true + checkpoint: + no_save_optim: true + no_save_rng: true + save_interval: 100000 + tensorboard_log_interval: 999999 + +model: + attention_backend: unfused + deterministic_mode: true + use_mcore_models: true + transformer_impl: transformer_engine + num_layers: 4 + hidden_size: 512 + ffn_hidden_size: 1536 + kv_channels: 128 + num_attention_heads: 8 + group_query_attention: true + num_query_groups: 4 + seq_length: 1024 + max_position_embeddings: 1024 + norm_epsilon: 1e-6 + use_rotary_position_embeddings: true + rotary_base: 1000000 + no_position_embedding: true + no_rope_fusion: true + swiglu: true + normalization: RMSNorm + position_embedding_type: rope + untie_embeddings_and_output_weights: false + init_method_std: 0.02 + attention_dropout: 0.0 + hidden_dropout: 0.0 + weight_decay: 0.1 + clip_grad: 1.0 + train_iters: 10 + eval_iters: 0 + micro_batch_size: 4 + global_batch_size: 1024 + seed: 42 + + optimizer: + weight_decay: 0.1 + adam_beta1: 0.9 + adam_beta2: 0.95 + lr_scheduler: + lr: 2.0e-5 + min_lr: 2.0e-6 + lr_warmup_samples: 0 + lr_warmup_fraction: 0.01 + lr_decay_style: cosine diff --git a/tests/functional_tests/train/qwen3/gold_values/tp2_pp2.json b/tests/functional_tests/train/qwen3/gold_values/tp2_pp2.json new file mode 100644 index 0000000000..f8146e57d4 --- /dev/null +++ b/tests/functional_tests/train/qwen3/gold_values/tp2_pp2.json @@ -0,0 +1,9 @@ +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [12.00312, 12.00331, 11.8856, 11.73663, 11.64624, 11.55245, 11.4999, 11.47063, 11.4367, 11.41239] + } + } + } +} diff --git a/tests/functional_tests/train/qwen3/gold_values/tp4_pp2.json b/tests/functional_tests/train/qwen3/gold_values/tp4_pp2.json new file mode 100644 index 0000000000..268054b396 --- /dev/null +++ b/tests/functional_tests/train/qwen3/gold_values/tp4_pp2.json @@ -0,0 +1,9 @@ +{ + "cuda": { + "a100": { + "lm loss:": { + "values": [11.98952, 11.99043, 11.8672, 11.71454, 
11.62524, 11.53021, 11.48006, 11.45335, 11.41747, 11.39388] + } + } + } +} diff --git a/tests/test_utils/config/platforms/cuda.yaml b/tests/test_utils/config/platforms/cuda.yaml index 4a4b70a448..a89a6f02df 100644 --- a/tests/test_utils/config/platforms/cuda.yaml +++ b/tests/test_utils/config/platforms/cuda.yaml @@ -23,6 +23,7 @@ a100: aquila: ["tp2_pp2", "tp4_pp2"] deepseek: ["tp2_pp2_ep2", "tp2_pp2_ep2_engram"] mixtral: ["tp2_pp1_ep2", "tp4_pp1_ep2"] + qwen3: ["tp2_pp2", "tp4_pp2"] hetero_train: aquila: ["tp2pp1_tp4pp1_tp2pp1", "tp2dp1pp1_tp2dp2pp1_tp1dp2pp1", "dp2dp4_shared_embedding"] inference: diff --git a/tests/test_utils/config/platforms/metax.yaml b/tests/test_utils/config/platforms/metax.yaml new file mode 100644 index 0000000000..23e9cad8ca --- /dev/null +++ b/tests/test_utils/config/platforms/metax.yaml @@ -0,0 +1,42 @@ +# MetaX Platform Configuration +# Test selection mechanism for MetaX GPU environments +# +# This configuration mirrors the cuda.yaml structure. +# To add/remove tests: modify the functional test case lists below. +# Platform-specific environment variables are in each model's conf/envs/metax.yaml. 
+# +# Users can modify the following: +# - functional: Add/remove test case names in the support lists +# Example: aquila: ["tp2_pp2", "tp4_pp2"] -> add or remove items +# - unit: Add/remove paths in include and exclude lists +# Example: include: "*" or ["test_basic.py", "runner/*"] +# exclude: [] or ["test_spiky_loss_detector.py"] + +# Define device types available for this platform +# TODO: Replace with actual MetaX device type name +device_types: + - metax_gpu + +# Device-specific test configurations +# TODO: Replace 'metax_gpu' with actual device type name +metax_gpu: + name: "metax_gpu" + tests: + functional: + # TODO: Uncomment and adjust test cases that MetaX supports + # Available test cases (see cuda.yaml for reference): + train: + aquila: ["tp2_pp2", "tp4_pp2"] + # deepseek: ["tp2_pp2_ep2", "tp2_pp2_ep2_engram"] + # mixtral: ["tp2_pp1_ep2", "tp4_pp1_ep2"] + # hetero_train: + # aquila: ["tp2pp1_tp4pp1_tp2pp1", "tp2dp1pp1_tp2dp2pp1_tp1dp2pp1", "dp2dp4_shared_embedding"] + # inference: + # qwen3: ["4b_tp2"] + # serve: + # qwen2_5: ["0.5b"] + unit: + # Include patterns: "*" for all, or list specific paths + include: "*" + # Exclude patterns: empty list or list paths to exclude + exclude: [] diff --git a/tests/test_utils/runners/check_results.py b/tests/test_utils/runners/check_results.py index e90fe2c436..0921390c9c 100644 --- a/tests/test_utils/runners/check_results.py +++ b/tests/test_utils/runners/check_results.py @@ -135,13 +135,15 @@ def find_latest_stdout_log(start_path): return None, latest_attempt -@pytest.mark.usefixtures("path", "task", "model", "case") -def test_train_equal(path, task, model, case): +@pytest.mark.usefixtures("path", "task", "model", "case", "platform", "device") +def test_train_equal(path, task, model, case, platform, device): """ Compare training metrics from test run against gold values. This test extracts loss metrics from stdout.log and compares them against pre-recorded gold values using numpy.allclose for tolerance. 
+ + Gold values are stored in nested format: {platform: {device: {metric: {values: [...]}}}} """ # Construct the test_result_path using the provided fixtures test_result_path = os.path.join(path, task, model, "test_results", case) @@ -165,6 +167,16 @@ def test_train_equal(path, task, model, case): with open(gold_value_path, "r") as f: gold_result_json = json.load(f) + # Navigate nested structure: {platform: {device: {metric: ...}}} + assert platform in gold_result_json, ( + f"Platform '{platform}' not found in gold values. Available: {list(gold_result_json.keys())}" + ) + platform_data = gold_result_json[platform] + assert device in platform_data, ( + f"Device '{device}' not found for platform '{platform}'. Available: {list(platform_data.keys())}" + ) + gold_result_json = platform_data[device] + # Extract the metric keys from gold values metric_keys = list(gold_result_json.keys()) diff --git a/tests/test_utils/runners/run_functional_tests.sh b/tests/test_utils/runners/run_functional_tests.sh index cc6eec7d94..0abaadb4e4 100755 --- a/tests/test_utils/runners/run_functional_tests.sh +++ b/tests/test_utils/runners/run_functional_tests.sh @@ -90,17 +90,17 @@ run_test() { rm -rf "$exp_dir"/* 2>/dev/null || true fi - # Map task name to flagscale CLI subcommand - # e.g. hetero_train -> train, train -> train, others unchanged - local cli_task="$task" - case "$task" in - *train*) cli_task="train" ;; - esac + # Run test via flagscale CLI using the 'run' command with Hydra overrides. + # This allows passing the platform-specific envs config group (envs=$PLATFORM, e.g. envs=metax).
+ local run_args=("run" "--config-path" "$conf_dir" "--config-name" "$config" "--action" "test") + + # If platform-specific envs config exists, pass it as a Hydra override + if [ -d "$conf_dir/envs" ] && [ -f "$conf_dir/envs/${PLATFORM}.yaml" ]; then + run_args+=("envs=${PLATFORM}") + fi - # Run test via flagscale CLI - # --config expects the full YAML path - log_info "Running: flagscale $cli_task $model --config $config_file --test" - flagscale "$cli_task" "$model" --config "$config_file" --test || return 1 + log_info "Running: flagscale ${run_args[*]}" + flagscale "${run_args[@]}" || return 1 # Match the corresponding comparison function according to task type # Matching rules: