diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml
new file mode 100644
index 0000000000..14f2b4327e
--- /dev/null
+++ b/.github/configs/metax.yml
@@ -0,0 +1,57 @@
+# MetaX Hardware Configuration
+# This file defines CI/CD settings for MetaX-based testing.
+# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml
+
+hardware_name: metax
+display_name: "MetaX Tests"
+
+# Docker images for this hardware
+# TODO: Replace the "metax-TODO" placeholder tags with the actual MetaX Docker images
+ci_image: localhost:5000/flagscale:metax-TODO
+ci_train_image: localhost:5000/flagscale-train:metax-TODO
+ci_inference_image: localhost:5000/flagscale-inference:metax-TODO
+
+# Runner labels for this hardware
+# TODO: Replace with actual MetaX runner labels
+runner_labels:
+  - self-hosted
+  - Linux
+  - X64
+  - metax-0        # TODO: Update to actual MetaX runner label
+  - gpus-8         # TODO: Update to actual GPU count
+
+# Container volumes (hardware-specific host:container path pairs)
+# TODO: Update paths if MetaX uses different mount points
+container_volumes:
+  - /home/flagscale_cicd/flask/static:/workspace/report
+  - /home/flagscale_cicd/flask/config:/workspace/config
+  - /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data
+  - /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers
+  - /home/flagscale_cicd/sccache:/github/home/.cache/sccache
+
+# Container options (hardware-specific settings)
+# TODO: Update GPU runtime options for MetaX (e.g., replace --gpus all if needed)
+container_options: "--gpus all --shm-size=500g --hostname flagscale_cicd --user root --ulimit nofile=65535:65535"
+
+# =============================================================================
+# Package Manager Configuration
+# =============================================================================
+# Supported package managers: pip, uv, conda
+#   - pip:   Use pip directly (standard Python)
+#   - uv:    Use uv pip (fast, modern package manager)
+#   - conda: Use conda environment with pip for PyPI packages
+#
+# TODO: Update package manager settings for MetaX environment
+pkg_mgr: "conda"
+
+# Environment path (venv path for uv, conda installation path for conda)
+# TODO: Update to actual MetaX environment path
+env_path: "/root/miniconda3"
+
+# Conda environment names, keyed by test type (for conda only)
+# TODO: Update environment names for MetaX. NOTE(review): rl test envs activate "flagscale-RL"; confirm casing.
+env_names:
+  train: "flagscale-train"
+  hetero_train: "flagscale-train"
+  inference: "flagscale-inference"
+  rl: "flagscale-rl"
diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml
new file mode 100644
index 0000000000..5d60d19c94
--- /dev/null
+++ b/.github/workflows/all_tests_metax.yml
@@ -0,0 +1,31 @@
+name: metax_tests
+
+on:
+  push:
+    branches: ["main"]
+  pull_request:
+    branches: ["main"]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  run_tests:
+    # Delegates to the shared reusable workflow; package manager and environment settings are read from .github/configs/metax.yml
+    uses: ./.github/workflows/all_tests_common.yml
+    with:
+      platform: metax
+
+  all_tests:
+    needs: run_tests
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Verify workflow status
+        run: |
+          if [ "${{ needs.run_tests.result }}" != "success" ]; then
+            echo "❌ Tests workflow failed"
+            exit 1
+          fi
+          echo "✅ All tests passed!"
diff --git a/tests/functional_tests/hetero_train/aquila/conf/dp2dp4_shared_embedding.yaml b/tests/functional_tests/hetero_train/aquila/conf/dp2dp4_shared_embedding.yaml
index e5e31421b6..3fdb1b46b9 100644
--- a/tests/functional_tests/hetero_train/aquila/conf/dp2dp4_shared_embedding.yaml
+++ b/tests/functional_tests/hetero_train/aquila/conf/dp2dp4_shared_embedding.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - train: dp2dp4_shared_embedding
 
@@ -13,19 +14,7 @@ experiment:
     ssh_port: null
   shell_cmds: null
   envs:
-    HYDRA_FULL_ERROR: 1
     CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    NCCL_ALGO: "Ring"
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale-train
 action: run
 
 hydra:
diff --git a/tests/functional_tests/hetero_train/aquila/conf/envs/cuda.yaml b/tests/functional_tests/hetero_train/aquila/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..6e2c0c5a1d
--- /dev/null
+++ b/tests/functional_tests/hetero_train/aquila/conf/envs/cuda.yaml
@@ -0,0 +1,17 @@
+# @package _global_
+# CUDA platform defaults (environment variables and pre-start command) for hetero_train/aquila tests
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
+    NCCL_ALGO: "Ring"
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_FLASH_ATTN: 0
+    NVTE_FUSED_ATTN: 0
+    CUDNN_BENCHMARK: "false"
+    CUDNN_DETERMINISTIC: "true"
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-train
diff --git a/tests/functional_tests/hetero_train/aquila/conf/envs/metax.yaml b/tests/functional_tests/hetero_train/aquila/conf/envs/metax.yaml
new file mode 100644
index 0000000000..6edf70969c
--- /dev/null
+++ b/tests/functional_tests/hetero_train/aquila/conf/envs/metax.yaml
@@ -0,0 +1,11 @@
+# @package _global_
+# MetaX platform defaults (environment variables and pre-start command) for hetero_train/aquila tests
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    # TODO: Set the MetaX visible-devices env var (candidate kept commented out below)
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: source /root/miniconda3/bin/activate flagscale-train
diff --git a/tests/functional_tests/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml b/tests/functional_tests/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml
index 579f6cf28c..90581953e1 100644
--- a/tests/functional_tests/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml
+++ b/tests/functional_tests/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - train: tp2dp1pp1_tp2dp2pp1_tp1dp2pp1
 
@@ -12,20 +13,6 @@ experiment:
   runner:
     ssh_port: null
   shell_cmds: null
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    NCCL_ALGO: "Ring"
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale-train
 action: run
 
 hydra:
diff --git a/tests/functional_tests/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml b/tests/functional_tests/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml
index 8dbab54730..df67e12808 100644
--- a/tests/functional_tests/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml
+++ b/tests/functional_tests/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - train: tp2pp1_tp4pp1_tp2pp1
 
@@ -12,20 +13,6 @@ experiment:
   runner:
     ssh_port: null
   shell_cmds: null
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    NCCL_ALGO: "Ring"
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale-train
 action: run
 
 hydra:
diff --git a/tests/functional_tests/hetero_train/aquila/gold_values/dp2dp4_shared_embedding.json b/tests/functional_tests/hetero_train/aquila/gold_values/dp2dp4_shared_embedding.json
index 36e2b99379..3e4d573a7a 100644
--- a/tests/functional_tests/hetero_train/aquila/gold_values/dp2dp4_shared_embedding.json
+++ b/tests/functional_tests/hetero_train/aquila/gold_values/dp2dp4_shared_embedding.json
@@ -1 +1,9 @@
-{"lm loss:": {"values": [11.55754, 11.56045, 11.3609, 11.22254, 11.10463, 11.01332, 10.95259, 10.9088, 10.88758, 10.86586]}}
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [11.55754, 11.56045, 11.3609, 11.22254, 11.10463, 11.01332, 10.95259, 10.9088, 10.88758, 10.86586]
+            }
+        }
+    }
+}
diff --git a/tests/functional_tests/hetero_train/aquila/gold_values/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.json b/tests/functional_tests/hetero_train/aquila/gold_values/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.json
index b30dd75dbc..d5c4cf4336 100644
--- a/tests/functional_tests/hetero_train/aquila/gold_values/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.json
+++ b/tests/functional_tests/hetero_train/aquila/gold_values/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.json
@@ -1 +1,9 @@
-{"lm loss:": {"values": [11.62049, 11.61899, 11.41389, 11.27374, 11.15958, 11.07645, 11.01809, 10.97522, 10.95196, 10.93447]}}
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [11.62049, 11.61899, 11.41389, 11.27374, 11.15958, 11.07645, 11.01809, 10.97522, 10.95196, 10.93447]
+            }
+        }
+    }
+}
diff --git a/tests/functional_tests/hetero_train/aquila/gold_values/tp2pp1_tp4pp1_tp2pp1.json b/tests/functional_tests/hetero_train/aquila/gold_values/tp2pp1_tp4pp1_tp2pp1.json
index 17df314aa0..89e8a2c98b 100644
--- a/tests/functional_tests/hetero_train/aquila/gold_values/tp2pp1_tp4pp1_tp2pp1.json
+++ b/tests/functional_tests/hetero_train/aquila/gold_values/tp2pp1_tp4pp1_tp2pp1.json
@@ -1 +1,9 @@
-{"lm loss:": {"values": [11.60803, 11.60942, 11.39587, 11.24672, 11.12878, 11.03954, 10.97887, 10.93456, 10.91292, 10.89361]}}
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [11.60803, 11.60942, 11.39587, 11.24672, 11.12878, 11.03954, 10.97887, 10.93456, 10.91292, 10.89361]
+            }
+        }
+    }
+}
diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/7b_tp2.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/7b_tp2.yaml
index c9ef51bd9b..c319571488 100644
--- a/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/7b_tp2.yaml
+++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/7b_tp2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - inference: 7b_tp2
 
@@ -11,36 +12,6 @@ experiment:
     entrypoint: flagscale/inference/inference_llm.py
   runner:
     hostfile: null
-  cmds:
-    before_start:
-      source /root/miniconda3/bin/activate flagscale-inference
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-    # Quantitative perception training related
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    # GPU parallel control
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    NCCL_ALGO: "Ring"
-    NCCL_PROTOCOL: LLC
-    # Basic randomness control
-    SEED: 1234
-    PYTHONHASHSEED: 0
-    MKL_NUM_THREADS: 1
-    OMP_NUM_THREADS: 1
-    NUMEXPR_NUM_THREADS: 1
-    SCIPY_RDRANDOM: 0
-    TF_DETERMINISTIC_OPS: 1
-    TORCH_CUDNN_DETERMINISM: true
-    CUDA_LAUNCH_BLOCKING: 1
-    NCCL_DEBUG: INFO
-    MAGIC_CACHE: disabled
 
 action: run
 
diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/cuda.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..cb88f22753
--- /dev/null
+++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/cuda.yaml
@@ -0,0 +1,30 @@
+# @package _global_
+# CUDA platform environment variables for inference/deepseek_r1_distill_qwen
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
+    CUDNN_BENCHMARK: "false"
+    CUDNN_DETERMINISTIC: "true"
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_FLASH_ATTN: 0
+    NVTE_FUSED_ATTN: 0
+    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    NCCL_ALGO: "Ring"
+    NCCL_PROTOCOL: LLC
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    TORCH_CUDNN_DETERMINISM: true
+    CUDA_LAUNCH_BLOCKING: 1
+    NCCL_DEBUG: INFO
+    MAGIC_CACHE: disabled
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/metax.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/metax.yaml
new file mode 100644
index 0000000000..2567a8e2e1
--- /dev/null
+++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen/conf/envs/metax.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+# MetaX platform environment variables for inference/deepseek_r1_distill_qwen
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    MAGIC_CACHE: disabled
+    # TODO: MetaX visible devices and platform-specific env vars
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/7b_tp2.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/7b_tp2.yaml
index e11a52e0c0..403d3fc8f5 100644
--- a/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/7b_tp2.yaml
+++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/7b_tp2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - inference: 7b_tp2
 
@@ -11,37 +12,6 @@ experiment:
     entrypoint: flagscale/inference/inference_llm.py
   runner:
     hostfile: null
-  cmds:
-    before_start:
-      source /root/miniconda3/bin/activate flagscale-inference
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-    USE_FLAGGEMS: "true"
-    # Quantitative perception training related
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    # GPU parallel control
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    NCCL_ALGO: "Ring"
-    NCCL_PROTOCOL: LLC
-    # Basic randomness control
-    SEED: 1234
-    PYTHONHASHSEED: 0
-    MKL_NUM_THREADS: 1
-    OMP_NUM_THREADS: 1
-    NUMEXPR_NUM_THREADS: 1
-    SCIPY_RDRANDOM: 0
-    TF_DETERMINISTIC_OPS: 1
-    TORCH_CUDNN_DETERMINISM: true
-    CUDA_LAUNCH_BLOCKING: 1
-    NCCL_DEBUG: INFO
-    MAGIC_CACHE: disabled
 
 action: run
 
diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/cuda.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..dee0dc5f59
--- /dev/null
+++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/cuda.yaml
@@ -0,0 +1,31 @@
+# @package _global_
+# CUDA platform environment variables for inference/deepseek_r1_distill_qwen_flaggems
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
+    CUDNN_BENCHMARK: "false"
+    CUDNN_DETERMINISTIC: "true"
+    USE_FLAGGEMS: "true"
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_FLASH_ATTN: 0
+    NVTE_FUSED_ATTN: 0
+    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    NCCL_ALGO: "Ring"
+    NCCL_PROTOCOL: LLC
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    TORCH_CUDNN_DETERMINISM: true
+    CUDA_LAUNCH_BLOCKING: 1
+    NCCL_DEBUG: INFO
+    MAGIC_CACHE: disabled
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/metax.yaml b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/metax.yaml
new file mode 100644
index 0000000000..1ed23b537c
--- /dev/null
+++ b/tests/functional_tests/inference/deepseek_r1_distill_qwen_flaggems/conf/envs/metax.yaml
@@ -0,0 +1,21 @@
+# @package _global_
+# MetaX platform environment variables for inference/deepseek_r1_distill_qwen_flaggems
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    USE_FLAGGEMS: "true"
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    MAGIC_CACHE: disabled
+    # TODO: MetaX visible devices and platform-specific env vars
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/qwen3/conf/4b_tp2.yaml b/tests/functional_tests/inference/qwen3/conf/4b_tp2.yaml
index ea940956de..eace560a8a 100644
--- a/tests/functional_tests/inference/qwen3/conf/4b_tp2.yaml
+++ b/tests/functional_tests/inference/qwen3/conf/4b_tp2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - inference: 4b_tp2
 
@@ -11,35 +12,6 @@ experiment:
     entrypoint: flagscale/inference/inference_llm.py
   runner:
     hostfile: null
-  cmds:
-    before_start:
-      source /root/miniconda3/bin/activate flagscale-inference
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-    # Quantitative perception training related
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    # GPU parallel control
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    NCCL_ALGO: "Ring"
-    NCCL_PROTOCOL: LLC
-    # Basic randomness control
-    SEED: 1234
-    PYTHONHASHSEED: 0
-    MKL_NUM_THREADS: 1
-    OMP_NUM_THREADS: 1
-    NUMEXPR_NUM_THREADS: 1
-    SCIPY_RDRANDOM: 0
-    TF_DETERMINISTIC_OPS: 1
-    TORCH_CUDNN_DETERMINISM: true
-    CUDA_LAUNCH_BLOCKING: 1
-    NCCL_DEBUG: INFO
 
 action: run
 
diff --git a/tests/functional_tests/inference/qwen3/conf/envs/cuda.yaml b/tests/functional_tests/inference/qwen3/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..1032fa8d16
--- /dev/null
+++ b/tests/functional_tests/inference/qwen3/conf/envs/cuda.yaml
@@ -0,0 +1,29 @@
+# @package _global_
+# CUDA platform environment variables for inference/qwen3
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
+    CUDNN_BENCHMARK: "false"
+    CUDNN_DETERMINISTIC: "true"
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_FLASH_ATTN: 0
+    NVTE_FUSED_ATTN: 0
+    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    NCCL_ALGO: "Ring"
+    NCCL_PROTOCOL: LLC
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    TORCH_CUDNN_DETERMINISM: true
+    CUDA_LAUNCH_BLOCKING: 1
+    NCCL_DEBUG: INFO
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/qwen3/conf/envs/metax.yaml b/tests/functional_tests/inference/qwen3/conf/envs/metax.yaml
new file mode 100644
index 0000000000..3d54a300ef
--- /dev/null
+++ b/tests/functional_tests/inference/qwen3/conf/envs/metax.yaml
@@ -0,0 +1,19 @@
+# @package _global_
+# MetaX platform environment variables for inference/qwen3
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    # TODO: MetaX visible devices and platform-specific env vars
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/qwen3_flaggems/conf/4b_tp2.yaml b/tests/functional_tests/inference/qwen3_flaggems/conf/4b_tp2.yaml
index 47fe827f84..289ab240ef 100644
--- a/tests/functional_tests/inference/qwen3_flaggems/conf/4b_tp2.yaml
+++ b/tests/functional_tests/inference/qwen3_flaggems/conf/4b_tp2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - inference: 4b_tp2
 
@@ -11,36 +12,6 @@ experiment:
     entrypoint: flagscale/inference/inference_llm.py
   runner:
     hostfile: null
-  cmds:
-    before_start:
-      source /root/miniconda3/bin/activate flagscale-inference
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-    USE_FLAGGEMS: "true"
-    # Quantitative perception training related
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    # GPU parallel control
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    NCCL_ALGO: "Ring"
-    NCCL_PROTOCOL: LLC
-    # Basic randomness control
-    SEED: 1234
-    PYTHONHASHSEED: 0
-    MKL_NUM_THREADS: 1
-    OMP_NUM_THREADS: 1
-    NUMEXPR_NUM_THREADS: 1
-    SCIPY_RDRANDOM: 0
-    TF_DETERMINISTIC_OPS: 1
-    TORCH_CUDNN_DETERMINISM: true
-    CUDA_LAUNCH_BLOCKING: 1
-    NCCL_DEBUG: INFO
 
 action: run
 
diff --git a/tests/functional_tests/inference/qwen3_flaggems/conf/envs/cuda.yaml b/tests/functional_tests/inference/qwen3_flaggems/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..61c066d60b
--- /dev/null
+++ b/tests/functional_tests/inference/qwen3_flaggems/conf/envs/cuda.yaml
@@ -0,0 +1,30 @@
+# @package _global_
+# CUDA platform environment variables for inference/qwen3_flaggems
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
+    CUDNN_BENCHMARK: "false"
+    CUDNN_DETERMINISTIC: "true"
+    USE_FLAGGEMS: "true"
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_FLASH_ATTN: 0
+    NVTE_FUSED_ATTN: 0
+    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    NCCL_ALGO: "Ring"
+    NCCL_PROTOCOL: LLC
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    TORCH_CUDNN_DETERMINISM: true
+    CUDA_LAUNCH_BLOCKING: 1
+    NCCL_DEBUG: INFO
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/qwen3_flaggems/conf/envs/metax.yaml b/tests/functional_tests/inference/qwen3_flaggems/conf/envs/metax.yaml
new file mode 100644
index 0000000000..8260add4a2
--- /dev/null
+++ b/tests/functional_tests/inference/qwen3_flaggems/conf/envs/metax.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+# MetaX platform environment variables for inference/qwen3_flaggems
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    USE_FLAGGEMS: "true"
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    # TODO: MetaX visible devices and platform-specific env vars
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/robobrain2/conf/7b_tp2.yaml b/tests/functional_tests/inference/robobrain2/conf/7b_tp2.yaml
index f281dcc56b..628545bfe9 100644
--- a/tests/functional_tests/inference/robobrain2/conf/7b_tp2.yaml
+++ b/tests/functional_tests/inference/robobrain2/conf/7b_tp2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - inference: 7b_tp2
 
@@ -11,39 +12,6 @@ experiment:
     entrypoint: flagscale/inference/inference_robobrain2.py
   runner:
     hostfile: null
-  cmds:
-    before_start:
-      source /root/miniconda3/bin/activate flagscale-inference
-  envs:
-    HYDRA_FULL_ERROR: 1
-    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-    # Quantitative perception training related
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    # GPU parallel control
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    NCCL_ALGO: "Ring"
-    NCCL_PROTOCOL: LLC
-    # Basic randomness control
-    SEED: 1234
-    PYTHONHASHSEED: 0
-    MKL_NUM_THREADS: 1
-    OMP_NUM_THREADS: 1
-    NUMEXPR_NUM_THREADS: 1
-    SCIPY_RDRANDOM: 0
-    TF_DETERMINISTIC_OPS: 1
-    TORCH_CUDNN_DETERMINISM: true
-    CUDA_LAUNCH_BLOCKING: 1
-    NCCL_DEBUG: INFO
-    MAGIC_CACHE: disabled
-    # Serve specific
-    USE_FS_SERVE: false
 
 action: run
 
diff --git a/tests/functional_tests/inference/robobrain2/conf/envs/cuda.yaml b/tests/functional_tests/inference/robobrain2/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..b07c831c3d
--- /dev/null
+++ b/tests/functional_tests/inference/robobrain2/conf/envs/cuda.yaml
@@ -0,0 +1,32 @@
+# @package _global_
+# CUDA platform environment variables for inference/robobrain2
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
+    CUDNN_BENCHMARK: "false"
+    CUDNN_DETERMINISTIC: "true"
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_FLASH_ATTN: 0
+    NVTE_FUSED_ATTN: 0
+    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    NCCL_ALGO: "Ring"
+    NCCL_PROTOCOL: LLC
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    TORCH_CUDNN_DETERMINISM: true
+    CUDA_LAUNCH_BLOCKING: 1
+    NCCL_DEBUG: INFO
+    MAGIC_CACHE: disabled
+    USE_FS_SERVE: false
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/robobrain2/conf/envs/metax.yaml b/tests/functional_tests/inference/robobrain2/conf/envs/metax.yaml
new file mode 100644
index 0000000000..d616a2654b
--- /dev/null
+++ b/tests/functional_tests/inference/robobrain2/conf/envs/metax.yaml
@@ -0,0 +1,22 @@
+# @package _global_
+# MetaX platform environment variables for inference/robobrain2
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    MAGIC_CACHE: disabled
+    USE_FS_SERVE: false
+    # TODO: MetaX visible devices and platform-specific env vars
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/robobrain2_flaggems/conf/7b_tp2.yaml b/tests/functional_tests/inference/robobrain2_flaggems/conf/7b_tp2.yaml
index 53bdc6a5c8..733f9a0a94 100644
--- a/tests/functional_tests/inference/robobrain2_flaggems/conf/7b_tp2.yaml
+++ b/tests/functional_tests/inference/robobrain2_flaggems/conf/7b_tp2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - inference: 7b_tp2
 
@@ -11,39 +12,6 @@ experiment:
     entrypoint: flagscale/inference/inference_robobrain2.py
   runner:
     hostfile: null
-  cmds:
-    before_start:
-      source /root/miniconda3/bin/activate flagscale-inference
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-    USE_FLAGGEMS: "true"
-    # Quantitative perception training related
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    # GPU parallel control
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    NCCL_ALGO: "Ring"
-    NCCL_PROTOCOL: LLC
-    # Basic randomness control
-    SEED: 1234
-    PYTHONHASHSEED: 0
-    MKL_NUM_THREADS: 1
-    OMP_NUM_THREADS: 1
-    NUMEXPR_NUM_THREADS: 1
-    SCIPY_RDRANDOM: 0
-    TF_DETERMINISTIC_OPS: 1
-    TORCH_CUDNN_DETERMINISM: true
-    CUDA_LAUNCH_BLOCKING: 1
-    NCCL_DEBUG: INFO
-    MAGIC_CACHE: disabled
-    # Serve specific
-    USE_FS_SERVE: false
 
 action: run
 
diff --git a/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/cuda.yaml b/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..ff36d3700a
--- /dev/null
+++ b/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/cuda.yaml
@@ -0,0 +1,32 @@
+# @package _global_
+# CUDA platform environment variables for inference/robobrain2_flaggems
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
+    CUDNN_BENCHMARK: "false"
+    CUDNN_DETERMINISTIC: "true"
+    USE_FLAGGEMS: "true"
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_FLASH_ATTN: 0
+    NVTE_FUSED_ATTN: 0
+    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    NCCL_ALGO: "Ring"
+    NCCL_PROTOCOL: LLC
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    TORCH_CUDNN_DETERMINISM: true
+    CUDA_LAUNCH_BLOCKING: 1
+    NCCL_DEBUG: INFO
+    MAGIC_CACHE: disabled
+    USE_FS_SERVE: false
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/metax.yaml b/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/metax.yaml
new file mode 100644
index 0000000000..b845adabae
--- /dev/null
+++ b/tests/functional_tests/inference/robobrain2_flaggems/conf/envs/metax.yaml
@@ -0,0 +1,22 @@
+# @package _global_
+# MetaX platform environment variables for inference/robobrain2_flaggems
+# Packaged at the config root so these keys merge into experiment.envs and experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    USE_FLAGGEMS: "true"
+    SEED: 1234
+    PYTHONHASHSEED: 0
+    MKL_NUM_THREADS: 1
+    OMP_NUM_THREADS: 1
+    NUMEXPR_NUM_THREADS: 1
+    SCIPY_RDRANDOM: 0
+    TF_DETERMINISTIC_OPS: 1
+    MAGIC_CACHE: disabled
+    USE_FS_SERVE: false
+    # TODO: MetaX visible devices and platform-specific env vars
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/rl/qwen2_5/conf/0_5b.yaml b/tests/functional_tests/rl/qwen2_5/conf/0_5b.yaml
index 694e8a3707..ce898a758f 100644
--- a/tests/functional_tests/rl/qwen2_5/conf/0_5b.yaml
+++ b/tests/functional_tests/rl/qwen2_5/conf/0_5b.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - rl: 0_5b
   - _self_
 
@@ -9,15 +10,10 @@ experiment:
     type: rl
     backend: verl
     entrypoint: verl.trainer.main_ppo
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale-RL
   runner:
     nnodes: 1
     nproc_per_node: 8
     hostfile: null
-  envs:
-    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
-    TORCH_DEVICE_BACKEND_AUTOLOAD: 0
 
 action: run
 
diff --git a/tests/functional_tests/rl/qwen2_5/conf/envs/cuda.yaml b/tests/functional_tests/rl/qwen2_5/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..7e61557b9d
--- /dev/null
+++ b/tests/functional_tests/rl/qwen2_5/conf/envs/cuda.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+# CUDA platform environment variables for rl/qwen2_5
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+experiment:
+  envs:
+    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+    TORCH_DEVICE_BACKEND_AUTOLOAD: 0
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-RL
diff --git a/tests/functional_tests/rl/qwen2_5/conf/envs/metax.yaml b/tests/functional_tests/rl/qwen2_5/conf/envs/metax.yaml
new file mode 100644
index 0000000000..b3001a8fc9
--- /dev/null
+++ b/tests/functional_tests/rl/qwen2_5/conf/envs/metax.yaml
@@ -0,0 +1,12 @@
+# @package _global_
+# MetaX platform environment variables for rl/qwen2_5
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    TORCH_DEVICE_BACKEND_AUTOLOAD: 0
+    # TODO: MetaX visible devices env var
+    # MACA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: source /root/miniconda3/bin/activate flagscale-RL
diff --git a/tests/functional_tests/serve/base/conf/envs/cuda.yaml b/tests/functional_tests/serve/base/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..4ba380c504
--- /dev/null
+++ b/tests/functional_tests/serve/base/conf/envs/cuda.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+# CUDA platform environment variables for serve/base
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+experiment:
+  envs:
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    no_proxy: "127.0.0.1,localhost"
+  cmds:
+    before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/serve/base/conf/envs/metax.yaml b/tests/functional_tests/serve/base/conf/envs/metax.yaml
new file mode 100644
index 0000000000..8b524d4c23
--- /dev/null
+++ b/tests/functional_tests/serve/base/conf/envs/metax.yaml
@@ -0,0 +1,11 @@
+# @package _global_
+# MetaX platform environment variables for serve/base
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    no_proxy: "127.0.0.1,localhost"
+    # TODO: MetaX device connection env var
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/serve/base/conf/multiple_model.yaml b/tests/functional_tests/serve/base/conf/multiple_model.yaml
index 727794f93a..019a4b6014 100644
--- a/tests/functional_tests/serve/base/conf/multiple_model.yaml
+++ b/tests/functional_tests/serve/base/conf/multiple_model.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - serve: multiple_model
 
@@ -22,11 +23,6 @@ experiment:
           type: Optional[str]
           required: false
           default: "You are a helpful assistant."
-  envs:
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    no_proxy: "127.0.0.1,localhost"
-  cmds:
-    before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference
 
 action: run
 
diff --git a/tests/functional_tests/serve/qwen2_5/conf/0.5b.yaml b/tests/functional_tests/serve/qwen2_5/conf/0.5b.yaml
index c23595fb65..5bcc6a1ed8 100644
--- a/tests/functional_tests/serve/qwen2_5/conf/0.5b.yaml
+++ b/tests/functional_tests/serve/qwen2_5/conf/0.5b.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - serve: 0.5b
 
@@ -15,10 +16,6 @@ experiment:
       use_fs_serve: false
   envs:
     CUDA_VISIBLE_DEVICES: 0
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    no_proxy: "127.0.0.1,localhost"
-  cmds:
-    before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference
 
 action: run
 
diff --git a/tests/functional_tests/serve/qwen2_5/conf/0.5b_multiple_instance.yaml b/tests/functional_tests/serve/qwen2_5/conf/0.5b_multiple_instance.yaml
index e4f03ac2f5..cbd26eac07 100644
--- a/tests/functional_tests/serve/qwen2_5/conf/0.5b_multiple_instance.yaml
+++ b/tests/functional_tests/serve/qwen2_5/conf/0.5b_multiple_instance.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - serve: 0.5b_multiple_instance
 
@@ -9,7 +10,7 @@ experiment:
     type: serve
     entrypoint: null
   runner:
-    hostfile: null # /path/to/hostfile.txt
+    hostfile: null
     docker: ds
     ssh_port: 22
     nnodes: 1
@@ -22,17 +23,14 @@ experiment:
       tensor_model_parallel_size: [1]
       pipeline_model_parallel_size: [1]
       instance: [2]
-      block_size: [16] # [8, 16, 32]
+      block_size: [16]
       max_num_batched_tokens: [512]
-      max_num_seqs: [128] # [128, 256]
+      max_num_seqs: [128]
     control:
       interval: 20
       run_best: false
-  cmds:
-    before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference
   envs:
     CUDA_VISIBLE_DEVICES: 0,1
-    no_proxy: "127.0.0.1,localhost"
     RAY_DEDUP_LOGS: 0
 
 action: auto_tune
diff --git a/tests/functional_tests/serve/qwen2_5/conf/envs/cuda.yaml b/tests/functional_tests/serve/qwen2_5/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..2de8bcf2d2
--- /dev/null
+++ b/tests/functional_tests/serve/qwen2_5/conf/envs/cuda.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+# CUDA platform environment variables for serve/qwen2_5
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+experiment:
+  envs:
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    no_proxy: "127.0.0.1,localhost"
+  cmds:
+    before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/serve/qwen2_5/conf/envs/metax.yaml b/tests/functional_tests/serve/qwen2_5/conf/envs/metax.yaml
new file mode 100644
index 0000000000..9915bb30b8
--- /dev/null
+++ b/tests/functional_tests/serve/qwen2_5/conf/envs/metax.yaml
@@ -0,0 +1,11 @@
+# @package _global_
+# MetaX platform environment variables for serve/qwen2_5
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    no_proxy: "127.0.0.1,localhost"
+    # TODO: MetaX device connection env var
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/serve/qwen3/conf/4b.yaml b/tests/functional_tests/serve/qwen3/conf/4b.yaml
new file mode 100644
index 0000000000..248135d744
--- /dev/null
+++ b/tests/functional_tests/serve/qwen3/conf/4b.yaml
@@ -0,0 +1,24 @@
+defaults:
+  - envs: cuda  # platform env vars and startup command
+  - _self_
+  - serve: '4b'
+
+experiment:
+  exp_name: 'qwen3'
+  exp_dir: 'tests/functional_tests/serve/qwen3/test_results/4b'
+  task:
+    type: 'serve'
+    entrypoint: null
+  runner:
+    hostfile: null
+    deploy:
+      port: 6704
+      use_fs_serve: false
+  envs:
+    CUDA_VISIBLE_DEVICES: 0  # merged on top of the selected envs group file
+
+action: 'run'
+
+hydra:
+  run:
+    dir: '${experiment.exp_dir}/hydra'  # Hydra run dir lives under exp_dir
diff --git a/tests/functional_tests/serve/qwen3/conf/4b_multiple_instance.yaml b/tests/functional_tests/serve/qwen3/conf/4b_multiple_instance.yaml
new file mode 100644
index 0000000000..5d7be1771f
--- /dev/null
+++ b/tests/functional_tests/serve/qwen3/conf/4b_multiple_instance.yaml
@@ -0,0 +1,40 @@
+defaults:
+  - envs: cuda  # platform env vars and startup command
+  - _self_
+  - serve: '4b_multiple_instance'
+
+experiment:
+  exp_name: 'qwen3'
+  exp_dir: 'tests/functional_tests/serve/qwen3/test_results/4b_multiple_instance'
+  task:
+    type: 'serve'
+    entrypoint: null
+  runner:
+    hostfile: null
+    docker: 'ds'
+    ssh_port: 22
+    nnodes: 1
+    nproc_per_node: 2
+    deploy:
+      port: 6705
+      use_fs_serve: true
+  auto_tuner:
+    space:  # every dimension lists exactly one candidate value
+      tensor_model_parallel_size: [1]
+      pipeline_model_parallel_size: [1]
+      instance: [2]
+      block_size: [16]
+      max_num_batched_tokens: [512]
+      max_num_seqs: [128]
+    control:
+      interval: 20
+      run_best: false
+  envs:
+    CUDA_VISIBLE_DEVICES: 0,1  # merged on top of the selected envs group file
+    RAY_DEDUP_LOGS: 0
+
+action: 'auto_tune'
+
+hydra:
+  run:
+    dir: '${experiment.exp_dir}/hydra'  # Hydra run dir lives under exp_dir
diff --git a/tests/functional_tests/serve/qwen3/conf/envs/cuda.yaml b/tests/functional_tests/serve/qwen3/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..fcdaada3f3
--- /dev/null
+++ b/tests/functional_tests/serve/qwen3/conf/envs/cuda.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+# CUDA platform environment variables for serve/qwen3
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+experiment:
+  envs:
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    no_proxy: "127.0.0.1,localhost"
+  cmds:
+    before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/serve/qwen3/conf/envs/metax.yaml b/tests/functional_tests/serve/qwen3/conf/envs/metax.yaml
new file mode 100644
index 0000000000..07d07246f2
--- /dev/null
+++ b/tests/functional_tests/serve/qwen3/conf/envs/metax.yaml
@@ -0,0 +1,11 @@
+# @package _global_
+# MetaX platform environment variables for serve/qwen3
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    no_proxy: "127.0.0.1,localhost"
+    # TODO: MetaX device connection env var
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: ulimit -n 65535 && source /root/miniconda3/bin/activate flagscale-inference
diff --git a/tests/functional_tests/serve/qwen3/conf/serve/4b.yaml b/tests/functional_tests/serve/qwen3/conf/serve/4b.yaml
new file mode 100644
index 0000000000..0009f0f773
--- /dev/null
+++ b/tests/functional_tests/serve/qwen3/conf/serve/4b.yaml
@@ -0,0 +1,12 @@
+- serve_id: 'vllm_model'  # the only entry in this serve list
+  engine: 'vllm'
+  engine_args:
+    model: '/home/gitlab-runner/data/Qwen3-4B'
+    tensor_parallel_size: 1
+    pipeline_parallel_size: 1
+    gpu_memory_utilization: 0.9
+    max_model_len: 32768
+    max_num_seqs: 256
+    enforce_eager: true
+    trust_remote_code: true
+    enable_chunked_prefill: true
diff --git a/tests/functional_tests/serve/qwen3/conf/serve/4b_multiple_instance.yaml b/tests/functional_tests/serve/qwen3/conf/serve/4b_multiple_instance.yaml
new file mode 100644
index 0000000000..18bda1597a
--- /dev/null
+++ b/tests/functional_tests/serve/qwen3/conf/serve/4b_multiple_instance.yaml
@@ -0,0 +1,16 @@
+- serve_id: 'vllm_model'  # the only entry in this serve list
+  engine: 'vllm'
+  engine_args:
+    model: '/home/gitlab-runner/data/Qwen3-4B'
+    served_model_name: 'qwen'
+    tensor_parallel_size: 1
+    pipeline_parallel_size: 1
+    gpu_memory_utilization: 0.9
+    max_model_len: 32768
+    max_num_seqs: 256
+    enforce_eager: true
+    trust_remote_code: true
+    enable_chunked_prefill: true
+  resources:
+    num_replicas: 2
+    num_gpus: 1
diff --git a/tests/functional_tests/serve/qwen3/gold_values/4b.json b/tests/functional_tests/serve/qwen3/gold_values/4b.json
new file mode 100644
index 0000000000..0f8a4237c6
--- /dev/null
+++ b/tests/functional_tests/serve/qwen3/gold_values/4b.json
@@ -0,0 +1 @@
+{"response": ", including his background, philosophy, and legacy. Also, include his influence on martial arts and popular"}
diff --git a/tests/functional_tests/serve/qwen3/gold_values/4b_multiple_instance.json b/tests/functional_tests/serve/qwen3/gold_values/4b_multiple_instance.json
new file mode 100644
index 0000000000..c637dad45b
--- /dev/null
+++ b/tests/functional_tests/serve/qwen3/gold_values/4b_multiple_instance.json
@@ -0,0 +1 @@
+{"response": ". Bruce Lee was a legendary martial artist, actor, and film director who lived from 19"}
diff --git a/tests/functional_tests/train/aquila/conf/envs/cuda.yaml b/tests/functional_tests/train/aquila/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..04d6cbcc79
--- /dev/null
+++ b/tests/functional_tests/train/aquila/conf/envs/cuda.yaml
@@ -0,0 +1,17 @@
+# @package _global_
+# Environment variables and startup command for train/aquila on CUDA
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    CUDA_VISIBLE_DEVICES: '0,1,2,3,4,5,6,7'
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    CUBLAS_WORKSPACE_CONFIG: ':4096:8'
+    NCCL_ALGO: 'Ring'
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_FLASH_ATTN: 0
+    NVTE_FUSED_ATTN: 0
+    CUDNN_BENCHMARK: 'false'  # kept as a string, not a YAML boolean
+    CUDNN_DETERMINISTIC: 'true'  # kept as a string, not a YAML boolean
+  cmds:
+    before_start: 'source /root/miniconda3/bin/activate flagscale-train'
diff --git a/tests/functional_tests/train/aquila/conf/envs/metax.yaml b/tests/functional_tests/train/aquila/conf/envs/metax.yaml
new file mode 100644
index 0000000000..b147a75cd6
--- /dev/null
+++ b/tests/functional_tests/train/aquila/conf/envs/metax.yaml
@@ -0,0 +1,11 @@
+# @package _global_
+# Environment variables and startup command for train/aquila on MetaX
+# TODO: Replace these placeholders with the actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    # TODO: Set the MetaX visible-devices env var, e.g.
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Point this at the MetaX conda environment
+    before_start: 'source /root/miniconda3/bin/activate flagscale-train'
diff --git a/tests/functional_tests/train/aquila/conf/tp2_pp2.yaml b/tests/functional_tests/train/aquila/conf/tp2_pp2.yaml
index 76091741cb..b958bd6f06 100644
--- a/tests/functional_tests/train/aquila/conf/tp2_pp2.yaml
+++ b/tests/functional_tests/train/aquila/conf/tp2_pp2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - train: tp2_pp2
 
@@ -12,20 +13,6 @@ experiment:
   runner:
     ssh_port: null
   shell_cmds: null
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    NCCL_ALGO: "Ring"
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale-train
 action: run
 
 hydra:
diff --git a/tests/functional_tests/train/aquila/conf/tp4_pp2.yaml b/tests/functional_tests/train/aquila/conf/tp4_pp2.yaml
index 560aa9a06e..2b9f856e53 100644
--- a/tests/functional_tests/train/aquila/conf/tp4_pp2.yaml
+++ b/tests/functional_tests/train/aquila/conf/tp4_pp2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - train: tp4_pp2
 
@@ -12,20 +13,6 @@ experiment:
   runner:
     ssh_port: null
   shell_cmds: null
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    NCCL_ALGO: "Ring"
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale-train
 action: run
 
 hydra:
diff --git a/tests/functional_tests/train/aquila/gold_values/tp2_pp2.json b/tests/functional_tests/train/aquila/gold_values/tp2_pp2.json
index 8f66ae59d4..d0f4266397 100644
--- a/tests/functional_tests/train/aquila/gold_values/tp2_pp2.json
+++ b/tests/functional_tests/train/aquila/gold_values/tp2_pp2.json
@@ -1 +1,9 @@
-{"lm loss:": {"values": [11.63839, 11.63632, 11.49138, 11.38161, 11.28878, 11.21583, 74297.01, 11.22827, 11.16404, 11.1232, 11.10253, 11.07966, 11.07366, 64451.08, 11.07513, 64546.05, 11.07297, 64406.66]}}
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [11.63839, 11.63632, 11.49138, 11.38161, 11.28878, 11.21583, 74297.01, 11.22827, 11.16404, 11.1232, 11.10253, 11.07966, 11.07366, 64451.08, 11.07513, 64546.05, 11.07297, 64406.66]
+            }
+        }
+    }
+}
diff --git a/tests/functional_tests/train/aquila/gold_values/tp4_pp2.json b/tests/functional_tests/train/aquila/gold_values/tp4_pp2.json
index 50d94279d9..2a3fca2f11 100644
--- a/tests/functional_tests/train/aquila/gold_values/tp4_pp2.json
+++ b/tests/functional_tests/train/aquila/gold_values/tp4_pp2.json
@@ -1 +1,9 @@
-{"lm loss:": {"values": [11.61479, 11.61304, 11.46996, 11.36611, 11.26903, 11.20205, 11.15256, 11.11047, 11.09079, 11.07022]}}
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [11.61479, 11.61304, 11.46996, 11.36611, 11.26903, 11.20205, 11.15256, 11.11047, 11.09079, 11.07022]
+            }
+        }
+    }
+}
diff --git a/tests/functional_tests/train/deepseek/conf/envs/cuda.yaml b/tests/functional_tests/train/deepseek/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..64d1bfe3bc
--- /dev/null
+++ b/tests/functional_tests/train/deepseek/conf/envs/cuda.yaml
@@ -0,0 +1,16 @@
+# @package _global_
+# CUDA platform environment variables for train/deepseek
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
+    NCCL_ALGO: "Ring"
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    CUDNN_BENCHMARK: "false"
+    CUDNN_DETERMINISTIC: "true"
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-train
diff --git a/tests/functional_tests/train/deepseek/conf/envs/metax.yaml b/tests/functional_tests/train/deepseek/conf/envs/metax.yaml
new file mode 100644
index 0000000000..e089abd513
--- /dev/null
+++ b/tests/functional_tests/train/deepseek/conf/envs/metax.yaml
@@ -0,0 +1,12 @@
+# @package _global_
+# MetaX platform environment variables for train/deepseek
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    # TODO: MetaX visible devices env var
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: source /root/miniconda3/bin/activate flagscale-train
diff --git a/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2.yaml b/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2.yaml
index 686853877b..c4d9442a17 100644
--- a/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2.yaml
+++ b/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - train: tp2_pp2_ep2
 
@@ -12,26 +13,6 @@ experiment:
   runner:
     ssh_port: null
   shell_cmds: null
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    NCCL_ALGO: "Ring"
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-    # Only for debug
-    # NVTE_DEBUG: 1
-    # NVTE_DEBUG_LEVEL: 2
-    # CUDNN_LOGERR_DBG: 1
-    # The following parameters passed the local test
-    # CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    # CUDA_DEVICE_MAX_CONNECTIONS: 1
-    # NVTE_TORCH_COMPILE: 0
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale-train
 action: run
 
 hydra:
diff --git a/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2_engram.yaml b/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2_engram.yaml
index 14ed97c215..25ef469677 100644
--- a/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2_engram.yaml
+++ b/tests/functional_tests/train/deepseek/conf/tp2_pp2_ep2_engram.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - train: tp2_pp2_ep2_engram
 
@@ -12,26 +13,6 @@ experiment:
   runner:
     ssh_port: null
   shell_cmds: null
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    NCCL_ALGO: "Ring"
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-    # Only for debug
-    # NVTE_DEBUG: 1
-    # NVTE_DEBUG_LEVEL: 2
-    # CUDNN_LOGERR_DBG: 1
-    # The following parameters passed the local test
-    # CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    # CUDA_DEVICE_MAX_CONNECTIONS: 1
-    # NVTE_TORCH_COMPILE: 0
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale-train
 action: run
 
 hydra:
diff --git a/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2.json b/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2.json
index fc56d2a014..8c5bc07ace 100644
--- a/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2.json
+++ b/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2.json
@@ -1 +1,9 @@
-{"lm loss:": {"values": [12.32605, 12.33183, 18.6708, 13.8059, 9.761076, 10.68691, 10.55642, 10.59995, 10.00202, 11.28867]}}
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [12.32605, 12.33183, 18.6708, 13.8059, 9.761076, 10.68691, 10.55642, 10.59995, 10.00202, 11.28867]
+            }
+        }
+    }
+}
diff --git a/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2_engram.json b/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2_engram.json
index 2830ce8a2f..df995035ed 100644
--- a/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2_engram.json
+++ b/tests/functional_tests/train/deepseek/gold_values/tp2_pp2_ep2_engram.json
@@ -1 +1,9 @@
-{"lm loss:": {"values": [12.31449, 12.31611, 20.91898, 12.73568, 13.30583, 11.24752, 9.966835, 11.10967, 10.74055, 10.14268]}}
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [12.31449, 12.31611, 20.91898, 12.73568, 13.30583, 11.24752, 9.966835, 11.10967, 10.74055, 10.14268]
+            }
+        }
+    }
+}
diff --git a/tests/functional_tests/train/mixtral/conf/envs/cuda.yaml b/tests/functional_tests/train/mixtral/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..0cd4bcb196
--- /dev/null
+++ b/tests/functional_tests/train/mixtral/conf/envs/cuda.yaml
@@ -0,0 +1,18 @@
+# @package _global_
+# CUDA platform environment variables for train/mixtral
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
+    NCCL_ALGO: "Ring"
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_FLASH_ATTN: 0
+    NVTE_FUSED_ATTN: 0
+    CUDNN_BENCHMARK: "false"
+    CUDNN_DETERMINISTIC: "true"
+  cmds:
+    before_start: source /root/miniconda3/bin/activate flagscale-train
diff --git a/tests/functional_tests/train/mixtral/conf/envs/metax.yaml b/tests/functional_tests/train/mixtral/conf/envs/metax.yaml
new file mode 100644
index 0000000000..c509107020
--- /dev/null
+++ b/tests/functional_tests/train/mixtral/conf/envs/metax.yaml
@@ -0,0 +1,12 @@
+# @package _global_
+# MetaX platform environment variables for train/mixtral
+# Placed at the config root so keys resolve to experiment.envs/experiment.cmds.
+# TODO: Replace with actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    # TODO: MetaX visible devices env var
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Update conda env activation for MetaX
+    before_start: source /root/miniconda3/bin/activate flagscale-train
diff --git a/tests/functional_tests/train/mixtral/conf/tp2_pp1_ep2.yaml b/tests/functional_tests/train/mixtral/conf/tp2_pp1_ep2.yaml
index a711648de0..4bc6f7da7d 100644
--- a/tests/functional_tests/train/mixtral/conf/tp2_pp1_ep2.yaml
+++ b/tests/functional_tests/train/mixtral/conf/tp2_pp1_ep2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - train: tp2_pp1_ep2
 
@@ -12,20 +13,6 @@ experiment:
   runner:
     ssh_port: null
   shell_cmds: null
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    NCCL_ALGO: "Ring"
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale-train
 action: run
 
 hydra:
diff --git a/tests/functional_tests/train/mixtral/conf/tp4_pp1_ep2.yaml b/tests/functional_tests/train/mixtral/conf/tp4_pp1_ep2.yaml
index a48e17e647..d81ce72d92 100644
--- a/tests/functional_tests/train/mixtral/conf/tp4_pp1_ep2.yaml
+++ b/tests/functional_tests/train/mixtral/conf/tp4_pp1_ep2.yaml
@@ -1,4 +1,5 @@
 defaults:
+  - envs: cuda
   - _self_
   - train: tp4_pp1_ep2
 
@@ -12,20 +13,6 @@ experiment:
   runner:
     ssh_port: null
   shell_cmds: null
-  envs:
-    HYDRA_FULL_ERROR: 1
-    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    CUBLAS_WORKSPACE_CONFIG: ":4096:8"
-    NCCL_ALGO: "Ring"
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-    NVTE_FLASH_ATTN: 0
-    NVTE_FUSED_ATTN: 0
-    CUDNN_BENCHMARK: "false"
-    CUDNN_DETERMINISTIC: "true"
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale-train
 action: run
 
 hydra:
diff --git a/tests/functional_tests/train/mixtral/gold_values/tp2_pp1_ep2.json b/tests/functional_tests/train/mixtral/gold_values/tp2_pp1_ep2.json
index c35033d3d2..1d39a255b2 100644
--- a/tests/functional_tests/train/mixtral/gold_values/tp2_pp1_ep2.json
+++ b/tests/functional_tests/train/mixtral/gold_values/tp2_pp1_ep2.json
@@ -1 +1,9 @@
-{"lm loss:": {"values": [11.17587, 11.16908, 10.41927, 11.66834, 9.679541, 9.481043, 9.194503, 9.234812, 9.128164, 8.960205]}}
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [11.17587, 11.16908, 10.41927, 11.66834, 9.679541, 9.481043, 9.194503, 9.234812, 9.128164, 8.960205]
+            }
+        }
+    }
+}
diff --git a/tests/functional_tests/train/mixtral/gold_values/tp4_pp1_ep2.json b/tests/functional_tests/train/mixtral/gold_values/tp4_pp1_ep2.json
index bea8991fda..72990e9a55 100644
--- a/tests/functional_tests/train/mixtral/gold_values/tp4_pp1_ep2.json
+++ b/tests/functional_tests/train/mixtral/gold_values/tp4_pp1_ep2.json
@@ -1 +1,9 @@
-{"lm loss:": {"values": [11.21206, 11.20481, 10.32151, 12.73904, 10.828, 10.09151, 9.412704, 9.257725, 9.139534, 9.054203]}}
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [11.21206, 11.20481, 10.32151, 12.73904, 10.828, 10.09151, 9.412704, 9.257725, 9.139534, 9.054203]
+            }
+        }
+    }
+}
diff --git a/tests/functional_tests/train/qwen3/conf/envs/cuda.yaml b/tests/functional_tests/train/qwen3/conf/envs/cuda.yaml
new file mode 100644
index 0000000000..1fc01e091a
--- /dev/null
+++ b/tests/functional_tests/train/qwen3/conf/envs/cuda.yaml
@@ -0,0 +1,17 @@
+# @package _global_
+# Environment variables and startup command for train/qwen3 on CUDA
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    CUDA_VISIBLE_DEVICES: '0,1,2,3,4,5,6,7'
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    CUBLAS_WORKSPACE_CONFIG: ':4096:8'
+    NCCL_ALGO: 'Ring'
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_FLASH_ATTN: 0
+    NVTE_FUSED_ATTN: 0
+    CUDNN_BENCHMARK: 'false'  # kept as a string, not a YAML boolean
+    CUDNN_DETERMINISTIC: 'true'  # kept as a string, not a YAML boolean
+  cmds:
+    before_start: 'source /root/miniconda3/bin/activate flagscale-train'
diff --git a/tests/functional_tests/train/qwen3/conf/envs/metax.yaml b/tests/functional_tests/train/qwen3/conf/envs/metax.yaml
new file mode 100644
index 0000000000..a33602967d
--- /dev/null
+++ b/tests/functional_tests/train/qwen3/conf/envs/metax.yaml
@@ -0,0 +1,11 @@
+# @package _global_
+# Environment variables and startup command for train/qwen3 on MetaX
+# TODO: Replace these placeholders with the actual MetaX environment variables
+experiment:
+  envs:
+    HYDRA_FULL_ERROR: 1
+    # TODO: Set the MetaX visible-devices env var, e.g.
+    # MACA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+  cmds:
+    # TODO: Point this at the MetaX conda environment
+    before_start: 'source /root/miniconda3/bin/activate flagscale-train'
diff --git a/tests/functional_tests/train/qwen3/conf/tp2_pp2.yaml b/tests/functional_tests/train/qwen3/conf/tp2_pp2.yaml
new file mode 100644
index 0000000000..eb1d344d94
--- /dev/null
+++ b/tests/functional_tests/train/qwen3/conf/tp2_pp2.yaml
@@ -0,0 +1,20 @@
+defaults:
+  - envs: cuda  # platform env vars and startup command
+  - _self_
+  - train: tp2_pp2
+
+experiment:
+  exp_name: 'tp2_pp2'
+  exp_dir: 'tests/functional_tests/train/qwen3/test_results/tp2_pp2'
+  task:
+    type: 'train'
+    backend: 'megatron'
+    entrypoint: 'flagscale/train/megatron/train_gpt.py'
+  runner:
+    ssh_port: null
+  shell_cmds: null
+action: 'run'
+
+hydra:
+  run:
+    dir: '${experiment.exp_dir}/hydra'  # Hydra run dir lives under exp_dir
diff --git a/tests/functional_tests/train/qwen3/conf/tp4_pp2.yaml b/tests/functional_tests/train/qwen3/conf/tp4_pp2.yaml
new file mode 100644
index 0000000000..403ecf58c3
--- /dev/null
+++ b/tests/functional_tests/train/qwen3/conf/tp4_pp2.yaml
@@ -0,0 +1,20 @@
+defaults:
+  - envs: cuda  # platform env vars and startup command
+  - _self_
+  - train: tp4_pp2
+
+experiment:
+  exp_name: 'tp4_pp2'
+  exp_dir: 'tests/functional_tests/train/qwen3/test_results/tp4_pp2'
+  task:
+    type: 'train'
+    backend: 'megatron'
+    entrypoint: 'flagscale/train/megatron/train_gpt.py'
+  runner:
+    ssh_port: null
+  shell_cmds: null
+action: 'run'
+
+hydra:
+  run:
+    dir: '${experiment.exp_dir}/hydra'  # Hydra run dir lives under exp_dir
diff --git a/tests/functional_tests/train/qwen3/conf/train/data.yaml b/tests/functional_tests/train/qwen3/conf/train/data.yaml
new file mode 100644
index 0000000000..cd53f9eb6e
--- /dev/null
+++ b/tests/functional_tests/train/qwen3/conf/train/data.yaml
@@ -0,0 +1,10 @@
+data:
+  data_path: '/home/gitlab-runner/data/pile_wikipedia_demo/pile_wikipedia_demo'
+  split: 1
+  no_mmap_bin_files: true
+  tokenizer:
+    legacy_tokenizer: true
+    tokenizer_type: 'QwenTokenizerFS'
+    tokenizer_path: '/home/gitlab-runner/tokenizers/qwentokenizer'
+    vocab_size: 151936
+    make_vocab_size_divisible_by: 64
diff --git a/tests/functional_tests/train/qwen3/conf/train/tp2_pp2.yaml b/tests/functional_tests/train/qwen3/conf/train/tp2_pp2.yaml
new file mode 100644
index 0000000000..316bce9820
--- /dev/null
+++ b/tests/functional_tests/train/qwen3/conf/train/tp2_pp2.yaml
@@ -0,0 +1,68 @@
+# train/qwen3 functional test settings: tensor parallel 2, pipeline parallel 2.
+defaults:
+  - data
+
+system:
+  tensor_model_parallel_size: 2
+  pipeline_model_parallel_size: 2
+  disable_bias_linear: true
+  qk_layernorm: true
+  sequence_parallel: true
+  use_distributed_optimizer: true
+  precision:
+    bf16: true
+    attention_softmax_in_fp32: true
+    accumulate_allreduce_grads_in_fp32: true
+  logging:
+    log_interval: 1
+    no_log_loss_scale_to_tensorboard: true
+  checkpoint:
+    no_save_optim: true
+    no_save_rng: true
+    save_interval: 100000
+    tensorboard_log_interval: 999999
+
+model:
+  attention_backend: unfused
+  deterministic_mode: true
+  use_mcore_models: true
+  transformer_impl: transformer_engine
+  num_layers: 4
+  hidden_size: 512
+  ffn_hidden_size: 1536
+  kv_channels: 128
+  num_attention_heads: 8
+  group_query_attention: true
+  num_query_groups: 4
+  seq_length: 1024
+  max_position_embeddings: 1024
+  # Mantissa dot is explicit so YAML 1.1 loaders also read this as a float.
+  norm_epsilon: 1.0e-6
+  use_rotary_position_embeddings: true
+  rotary_base: 1000000
+  no_position_embedding: true
+  no_rope_fusion: true
+  swiglu: true
+  normalization: RMSNorm
+  position_embedding_type: rope
+  untie_embeddings_and_output_weights: false
+  init_method_std: 0.02
+  attention_dropout: 0.0
+  hidden_dropout: 0.0
+  weight_decay: 0.1
+  clip_grad: 1.0
+  train_iters: 10
+  eval_iters: 0
+  micro_batch_size: 4
+  global_batch_size: 1024
+  seed: 42
+
+  optimizer:
+    weight_decay: 0.1
+    adam_beta1: 0.9
+    adam_beta2: 0.95
+    lr_scheduler:
+      lr: 2.0e-5
+      min_lr: 2.0e-6
+      lr_warmup_samples: 0
+      lr_warmup_fraction: 0.01
+      lr_decay_style: cosine
diff --git a/tests/functional_tests/train/qwen3/conf/train/tp4_pp2.yaml b/tests/functional_tests/train/qwen3/conf/train/tp4_pp2.yaml
new file mode 100644
index 0000000000..e18111ad3c
--- /dev/null
+++ b/tests/functional_tests/train/qwen3/conf/train/tp4_pp2.yaml
@@ -0,0 +1,73 @@
+# Qwen3 functional test: TP=4 x PP=2 training run on a small 4-layer model.
+# The run lasts 10 iterations; the expected per-iteration "lm loss:" values
+# are recorded in this model's gold_values/tp4_pp2.json.
+defaults:
+  - data
+
+system:
+  tensor_model_parallel_size: 4
+  pipeline_model_parallel_size: 2
+  disable_bias_linear: true
+  qk_layernorm: true
+  sequence_parallel: true
+  use_distributed_optimizer: true
+  precision:
+    bf16: true
+    attention_softmax_in_fp32: true
+    accumulate_allreduce_grads_in_fp32: true
+  logging:
+    log_interval: 1
+    no_log_loss_scale_to_tensorboard: true
+  checkpoint:
+    no_save_optim: true
+    no_save_rng: true
+    save_interval: 100000
+    # NOTE(review): this key sits under checkpoint rather than logging; confirm intent.
+    tensorboard_log_interval: 999999
+
+model:
+  attention_backend: unfused
+  deterministic_mode: true
+  use_mcore_models: true
+  transformer_impl: transformer_engine
+  num_layers: 4
+  hidden_size: 512
+  ffn_hidden_size: 1536
+  kv_channels: 128
+  num_attention_heads: 8
+  group_query_attention: true
+  num_query_groups: 4
+  seq_length: 1024
+  max_position_embeddings: 1024
+  # Written with a dotted mantissa: YAML 1.1 loaders read a bare 1e-6 as a string.
+  norm_epsilon: 1.0e-6
+  use_rotary_position_embeddings: true
+  rotary_base: 1000000
+  no_position_embedding: true
+  no_rope_fusion: true
+  swiglu: true
+  normalization: RMSNorm
+  position_embedding_type: rope
+  untie_embeddings_and_output_weights: false
+  init_method_std: 0.02
+  attention_dropout: 0.0
+  hidden_dropout: 0.0
+  # NOTE(review): weight_decay is repeated under model.optimizer; confirm both are needed.
+  weight_decay: 0.1
+  clip_grad: 1.0
+  train_iters: 10
+  eval_iters: 0
+  micro_batch_size: 4
+  global_batch_size: 1024
+  seed: 42
+
+  optimizer:
+    weight_decay: 0.1
+    adam_beta1: 0.9
+    adam_beta2: 0.95
+    lr_scheduler:
+      lr: 2.0e-5
+      min_lr: 2.0e-6
+      lr_warmup_samples: 0
+      lr_warmup_fraction: 0.01
+      lr_decay_style: cosine
diff --git a/tests/functional_tests/train/qwen3/gold_values/tp2_pp2.json b/tests/functional_tests/train/qwen3/gold_values/tp2_pp2.json
new file mode 100644
index 0000000000..f8146e57d4
--- /dev/null
+++ b/tests/functional_tests/train/qwen3/gold_values/tp2_pp2.json
@@ -0,0 +1,9 @@
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [12.00312, 12.00331, 11.8856, 11.73663, 11.64624, 11.55245, 11.4999, 11.47063, 11.4367, 11.41239]
+            }
+        }
+    }
+}
diff --git a/tests/functional_tests/train/qwen3/gold_values/tp4_pp2.json b/tests/functional_tests/train/qwen3/gold_values/tp4_pp2.json
new file mode 100644
index 0000000000..268054b396
--- /dev/null
+++ b/tests/functional_tests/train/qwen3/gold_values/tp4_pp2.json
@@ -0,0 +1,9 @@
+{
+    "cuda": {
+        "a100": {
+            "lm loss:": {
+                "values": [11.98952, 11.99043, 11.8672, 11.71454, 11.62524, 11.53021, 11.48006, 11.45335, 11.41747, 11.39388]
+            }
+        }
+    }
+}
diff --git a/tests/test_utils/config/platforms/cuda.yaml b/tests/test_utils/config/platforms/cuda.yaml
index 4a4b70a448..a89a6f02df 100644
--- a/tests/test_utils/config/platforms/cuda.yaml
+++ b/tests/test_utils/config/platforms/cuda.yaml
@@ -23,6 +23,7 @@ a100:
         aquila: ["tp2_pp2", "tp4_pp2"]
         deepseek: ["tp2_pp2_ep2", "tp2_pp2_ep2_engram"]
         mixtral: ["tp2_pp1_ep2", "tp4_pp1_ep2"]
+        qwen3: ["tp2_pp2", "tp4_pp2"]
       hetero_train:
         aquila: ["tp2pp1_tp4pp1_tp2pp1", "tp2dp1pp1_tp2dp2pp1_tp1dp2pp1", "dp2dp4_shared_embedding"]
       inference:
diff --git a/tests/test_utils/config/platforms/metax.yaml b/tests/test_utils/config/platforms/metax.yaml
new file mode 100644
index 0000000000..23e9cad8ca
--- /dev/null
+++ b/tests/test_utils/config/platforms/metax.yaml
@@ -0,0 +1,42 @@
+# MetaX Platform Configuration
+# Test selection mechanism for MetaX GPU environments
+#
+# This configuration mirrors the cuda.yaml structure.
+# To add/remove tests: modify the functional test case lists below.
+# Platform-specific environment variables are in each model's conf/envs/metax.yaml.
+#
+# Users can modify the following:
+# - functional: Add/remove test case names in the support lists
+#   Example: aquila: ["tp2_pp2", "tp4_pp2"] -> add or remove items
+# - unit: Add/remove paths in include and exclude lists
+#   Example: include: "*" or ["test_basic.py", "runner/*"]
+#            exclude: [] or ["test_spiky_loss_detector.py"]
+
+# Define device types available for this platform
+# TODO: Replace 'metax_gpu' with the actual MetaX device type name
+device_types:
+  - metax_gpu
+
+# Device-specific test configurations
+# TODO: Replace 'metax_gpu' (the key below and its name field) with the actual device type name
+metax_gpu:
+  name: "metax_gpu"
+  tests:
+    functional:
+      # Only train/aquila is enabled so far.
+      # TODO: Uncomment further entries below as MetaX support is confirmed (see cuda.yaml for reference):
+      train:
+        aquila: ["tp2_pp2", "tp4_pp2"]
+      #   deepseek: ["tp2_pp2_ep2", "tp2_pp2_ep2_engram"]
+      #   mixtral: ["tp2_pp1_ep2", "tp4_pp1_ep2"]
+      # hetero_train:
+      #   aquila: ["tp2pp1_tp4pp1_tp2pp1", "tp2dp1pp1_tp2dp2pp1_tp1dp2pp1", "dp2dp4_shared_embedding"]
+      # inference:
+      #   qwen3: ["4b_tp2"]
+      # serve:
+      #   qwen2_5: ["0.5b"]
+    unit:
+      # Include patterns: "*" for all, or list specific paths
+      include: "*"
+      # Exclude patterns: empty list or list paths to exclude
+      exclude: []
diff --git a/tests/test_utils/runners/check_results.py b/tests/test_utils/runners/check_results.py
index e90fe2c436..0921390c9c 100644
--- a/tests/test_utils/runners/check_results.py
+++ b/tests/test_utils/runners/check_results.py
@@ -135,13 +135,15 @@ def find_latest_stdout_log(start_path):
     return None, latest_attempt
 
 
-@pytest.mark.usefixtures("path", "task", "model", "case")
-def test_train_equal(path, task, model, case):
+@pytest.mark.usefixtures("path", "task", "model", "case", "platform", "device")
+def test_train_equal(path, task, model, case, platform, device):
     """
     Compare training metrics from test run against gold values.
 
     This test extracts loss metrics from stdout.log and compares them
     against pre-recorded gold values using numpy.allclose for tolerance.
+
+    Gold values are stored in nested format: {platform: {device: {metric: {values: [...]}}}}
     """
     # Construct the test_result_path using the provided fixtures
     test_result_path = os.path.join(path, task, model, "test_results", case)
@@ -165,6 +167,16 @@ def test_train_equal(path, task, model, case):
     with open(gold_value_path, "r") as f:
         gold_result_json = json.load(f)
 
+    # Navigate nested structure: {platform: {device: {metric: ...}}}
+    assert platform in gold_result_json, (
+        f"Platform '{platform}' not found in gold values. Available: {list(gold_result_json.keys())}"
+    )
+    platform_data = gold_result_json[platform]
+    assert device in platform_data, (
+        f"Device '{device}' not found for platform '{platform}'. Available: {list(platform_data.keys())}"
+    )
+    gold_result_json = platform_data[device]
+
     # Extract the metric keys from gold values
     metric_keys = list(gold_result_json.keys())
 
diff --git a/tests/test_utils/runners/run_functional_tests.sh b/tests/test_utils/runners/run_functional_tests.sh
index cc6eec7d94..0abaadb4e4 100755
--- a/tests/test_utils/runners/run_functional_tests.sh
+++ b/tests/test_utils/runners/run_functional_tests.sh
@@ -90,17 +90,17 @@ run_test() {
         rm -rf "$exp_dir"/* 2>/dev/null || true
     fi
 
-    # Map task name to flagscale CLI subcommand
-    # e.g. hetero_train -> train, train -> train, others unchanged
-    local cli_task="$task"
-    case "$task" in
-        *train*) cli_task="train" ;;
-    esac
+    # Run test via flagscale CLI using the 'run' command with Hydra overrides.
+    # This allows passing platform-specific envs config group (envs=<platform>).
+    local run_args=("run" "--config-path" "$conf_dir" "--config-name" "$config" "--action" "test")
+
+    # If platform-specific envs config exists, pass it as a Hydra override
+    if [ -d "$conf_dir/envs" ] && [ -f "$conf_dir/envs/${PLATFORM}.yaml" ]; then
+        run_args+=("envs=${PLATFORM}")
+    fi
 
-    # Run test via flagscale CLI
-    # --config expects the full YAML path
-    log_info "Running: flagscale $cli_task $model --config $config_file --test"
-    flagscale "$cli_task" "$model" --config "$config_file" --test || return 1
+    log_info "Running: flagscale ${run_args[*]}"
+    flagscale "${run_args[@]}" || return 1
 
     # Match the corresponding comparison function according to task type
     # Matching rules: