From 5c74ab1fc681a74f8efbed2e149e124d41cbd2b9 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Fri, 15 May 2026 16:39:56 +0800 Subject: [PATCH 01/17] [cicd] test dind --- .github/workflows/demo_inference_test.yml | 172 ++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 .github/workflows/demo_inference_test.yml diff --git a/.github/workflows/demo_inference_test.yml b/.github/workflows/demo_inference_test.yml new file mode 100644 index 000000000..1b4736374 --- /dev/null +++ b/.github/workflows/demo_inference_test.yml @@ -0,0 +1,172 @@ +name: Demo Inference Test - Qwen3 + +# Demo workflow to test the test-lyz-infer runner with real Qwen3 inference +# Uses FlagScale inference tests from tests/functional_tests/inference/qwen3 + +on: + workflow_dispatch: + inputs: + test_case: + description: 'Test case to run' + required: false + default: '4b_tp2' + type: choice + options: + - 4b_tp2 + - 4b_tp2_ascend + +jobs: + demo_inference: + name: Qwen3 Inference Demo + runs-on: test-lyz-infer + env: + PROJECT_ROOT: ${{ github.workspace }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Print test info + run: | + echo "==========================================" + echo "Qwen3 Inference Demo Test" + echo "==========================================" + echo "Runner: test-lyz-infer" + echo "Test Case: ${{ inputs.test_case }}" + echo "Workflow: ${{ github.workflow }}" + echo "Run ID: ${{ github.run_id }}" + echo "Project Root: $PROJECT_ROOT" + echo "==========================================" + + - name: Check system info + run: | + echo "=== System Information ===" + echo "Hostname: $(hostname)" + echo "OS: $(uname -s)" + echo "Kernel: $(uname -r)" + echo "Architecture: $(uname -m)" + echo "" + echo "=== CPU Information ===" + lscpu | grep -E "Model name|CPU\(s\)|Thread|Core" || echo "lscpu not available" + echo "" + echo "=== Memory Information ===" + free -h || echo "free command not available" + echo "" + echo "=== Disk Space ===" + df -h / || echo "df command not available" + + - name: Check GPU availability + run: | + echo "=== GPU Information ===" + if command -v nvidia-smi &> /dev/null; then + echo "NVIDIA GPU detected:" + nvidia-smi + else + echo "nvidia-smi not found - no NVIDIA GPU or driver not installed" + fi + + if command -v npu-smi &> /dev/null; then + echo "Ascend NPU detected:" + npu-smi info + else + echo "npu-smi not found - no Ascend NPU" + fi + + - name: Check Python environment + run: | + echo "=== Python Environment ===" + if command -v python &> /dev/null; then + echo "Python version: $(python --version)" + echo "Python location: $(which python)" + else + echo "Python not found in PATH" + fi + + - name: Install FlagScale + run: | + echo "=== Installing FlagScale ===" + cd $PROJECT_ROOT + pip install . --no-build-isolation || { echo "❌ FlagScale install failed"; exit 1; } + + # Verify installation + command -v flagscale || { echo "❌ FlagScale CLI not found in PATH"; exit 1; } + echo "✅ FlagScale CLI installed: $(flagscale --version 2>/dev/null || echo 'version unknown')" + + - name: Run Qwen3 inference test + id: inference_test + run: | + set -euo pipefail + cd $PROJECT_ROOT + + TEST_CASE="${{ inputs.test_case }}" + TEST_DIR="tests/functional_tests/inference/qwen3" + CONFIG_FILE="$TEST_DIR/conf/${TEST_CASE}.yaml" + RESULTS_DIR="$TEST_DIR/test_results/${TEST_CASE}" + GOLD_FILE="$TEST_DIR/results_gold/${TEST_CASE}" + + echo "=== Running Qwen3 Inference Test ===" + echo "Test case: $TEST_CASE" + echo "Config: $CONFIG_FILE" + echo "Results dir: $RESULTS_DIR" + echo "" + + # Check if config exists + if [ ! -f "$CONFIG_FILE" ]; then + echo "❌ Config file not found: $CONFIG_FILE" + exit 1 + fi + + # Create results directory + mkdir -p "$RESULTS_DIR" + + # Run inference using flagscale + echo "Starting inference..." + flagscale run --config-path "$TEST_DIR/conf" --config-name "${TEST_CASE}" || { + echo "❌ Inference failed" + exit 1 + } + + echo "✅ Inference completed" + + # Check results + if [ -f "$RESULTS_DIR/output.txt" ]; then + echo "" + echo "=== Inference Output ===" + cat "$RESULTS_DIR/output.txt" + echo "" + + # Compare with gold results if available + if [ -f "$GOLD_FILE" ]; then + echo "=== Comparing with gold results ===" + if diff -u "$GOLD_FILE" "$RESULTS_DIR/output.txt"; then + echo "✅ Results match gold standard" + else + echo "⚠️ Results differ from gold standard (this may be expected)" + fi + fi + else + echo "⚠️ Output file not found at $RESULTS_DIR/output.txt" + fi + + echo "" + echo "==========================================" + echo "✅ Qwen3 inference test completed!" + echo "==========================================" + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: qwen3-inference-results-${{ inputs.test_case }}-${{ github.run_id }} + path: tests/functional_tests/inference/qwen3/test_results/${{ inputs.test_case }} + retention-days: 7 + if-no-files-found: warn + + - name: Test summary + if: always() + run: | + echo "=== Test Summary ===" + echo "Status: ${{ job.status }}" + echo "Runner: test-lyz-infer" + echo "Test Case: ${{ inputs.test_case }}" + echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')" From d43e7eea919a427180ef85a759f40af26c39ba0f Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Fri, 15 May 2026 16:42:27 +0800 Subject: [PATCH 02/17] [cicd] test dind --- .github/workflows/demo_inference_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/demo_inference_test.yml b/.github/workflows/demo_inference_test.yml index 1b4736374..7427c124b 100644 --- a/.github/workflows/demo_inference_test.yml +++ b/.github/workflows/demo_inference_test.yml @@ -4,6 +4,8 @@ name: Demo Inference Test - Qwen3 # Uses FlagScale inference tests from tests/functional_tests/inference/qwen3 on: + pull_request: + branches: ["main"] workflow_dispatch: inputs: test_case: From c12fcf51133960f25046523916af8bc97fd311df Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Fri, 15 May 2026 17:35:39 +0800 Subject: [PATCH 03/17] [cicd] test dind --- .github/workflows/demo_inference_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/demo_inference_test.yml b/.github/workflows/demo_inference_test.yml index 7427c124b..e97c37329 100644 --- a/.github/workflows/demo_inference_test.yml +++ b/.github/workflows/demo_inference_test.yml @@ -11,7 +11,7 @@ on: test_case: description: 'Test case to run' required: false - default: '4b_tp2' + default: '4b_tp2_ascend' type: choice options: - 4b_tp2 From 0d23b4d905898d371934462ed7cc0d9fa885fb24 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Mon, 18 May 2026 10:48:59 +0800 Subject: [PATCH 04/17] test --- .github/workflows/demo_inference_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/demo_inference_test.yml b/.github/workflows/demo_inference_test.yml index e97c37329..1642f77b5 100644 --- a/.github/workflows/demo_inference_test.yml +++ b/.github/workflows/demo_inference_test.yml @@ -69,7 +69,7 @@ jobs: if command -v npu-smi &> /dev/null; then echo "Ascend NPU detected:" - npu-smi info + # npu-smi info else echo "npu-smi not found - no Ascend NPU" fi From ba372ecc5f2947160778fdec4a2350223962b807 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Mon, 18 May 2026 15:44:41 +0800 Subject: [PATCH 05/17] test --- .github/workflows/demo_inference_test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/demo_inference_test.yml b/.github/workflows/demo_inference_test.yml index 1642f77b5..e4684672b 100644 --- a/.github/workflows/demo_inference_test.yml +++ b/.github/workflows/demo_inference_test.yml @@ -34,7 +34,7 @@ jobs: echo "Qwen3 Inference Demo Test" echo "==========================================" echo "Runner: test-lyz-infer" - echo "Test Case: ${{ inputs.test_case }}" + echo "Test Case: ${{ inputs.test_case || '4b_tp2_ascend' }}" echo "Workflow: ${{ github.workflow }}" echo "Run ID: ${{ github.run_id }}" echo "Project Root: $PROJECT_ROOT" @@ -69,7 +69,7 @@ jobs: if command -v npu-smi &> /dev/null; then echo "Ascend NPU detected:" - # npu-smi info + npu-smi info else echo "npu-smi not found - no Ascend NPU" fi @@ -100,7 +100,7 @@ jobs: set -euo pipefail cd $PROJECT_ROOT - TEST_CASE="${{ inputs.test_case }}" + TEST_CASE="${{ inputs.test_case || '4b_tp2_ascend' }}" TEST_DIR="tests/functional_tests/inference/qwen3" CONFIG_FILE="$TEST_DIR/conf/${TEST_CASE}.yaml" RESULTS_DIR="$TEST_DIR/test_results/${TEST_CASE}" @@ -159,8 +159,8 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: qwen3-inference-results-${{ inputs.test_case }}-${{ github.run_id }} - path: tests/functional_tests/inference/qwen3/test_results/${{ inputs.test_case }} + name: qwen3-inference-results-${{ inputs.test_case || '4b_tp2_ascend' }}-${{ github.run_id }} + path: tests/functional_tests/inference/qwen3/test_results/${{ inputs.test_case || '4b_tp2_ascend' }} retention-days: 7 if-no-files-found: warn @@ -170,5 +170,5 @@ jobs: echo "=== Test Summary ===" echo "Status: ${{ job.status }}" echo "Runner: test-lyz-infer" - echo "Test Case: ${{ inputs.test_case }}" + echo "Test Case: ${{ inputs.test_case || '4b_tp2_ascend' }}" echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')" From dc15ff20912fee9a74eccc12beca6e7e86135dc0 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Mon, 18 May 2026 15:47:47 +0800 Subject: [PATCH 06/17] test --- .github/workflows/demo_inference_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/demo_inference_test.yml b/.github/workflows/demo_inference_test.yml index e4684672b..ee68018c9 100644 --- a/.github/workflows/demo_inference_test.yml +++ b/.github/workflows/demo_inference_test.yml @@ -123,7 +123,7 @@ jobs: # Run inference using flagscale echo "Starting inference..." - flagscale run --config-path "$TEST_DIR/conf" --config-name "${TEST_CASE}" || { + flagscale run --config-path "$TEST_DIR/conf" --config-name "${TEST_CASE}" --test || { echo "❌ Inference failed" exit 1 } From 9c049bf7ade1e648720c28d13d95940b5ec5f690 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Mon, 18 May 2026 15:50:26 +0800 Subject: [PATCH 07/17] test --- .github/workflows/demo_inference_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/demo_inference_test.yml b/.github/workflows/demo_inference_test.yml index ee68018c9..73519ada7 100644 --- a/.github/workflows/demo_inference_test.yml +++ b/.github/workflows/demo_inference_test.yml @@ -123,7 +123,7 @@ jobs: # Run inference using flagscale echo "Starting inference..." - flagscale run --config-path "$TEST_DIR/conf" --config-name "${TEST_CASE}" --test || { + flagscale inference qwen3 --config-path "$TEST_DIR/conf" --config-name "${TEST_CASE}" --test || { echo "❌ Inference failed" exit 1 } From c8934345df50355fe06b7c74b7fb991f3aedb12b Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Mon, 18 May 2026 15:52:01 +0800 Subject: [PATCH 08/17] test --- .github/workflows/demo_inference_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/demo_inference_test.yml b/.github/workflows/demo_inference_test.yml index 73519ada7..40bd63956 100644 --- a/.github/workflows/demo_inference_test.yml +++ b/.github/workflows/demo_inference_test.yml @@ -123,7 +123,7 @@ jobs: # Run inference using flagscale echo "Starting inference..." - flagscale inference qwen3 --config-path "$TEST_DIR/conf" --config-name "${TEST_CASE}" --test || { + flagscale inference qwen3 --config "$TEST_DIR/conf/${TEST_CASE}.yaml" --test || { echo "❌ Inference failed" exit 1 } From ac4a4835ea95fb362447d0a3bab42f4304fa1e1d Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Mon, 18 May 2026 15:56:09 +0800 Subject: [PATCH 09/17] test --- .../inference/qwen3/conf/inference/4b_tp2_ascend.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional_tests/inference/qwen3/conf/inference/4b_tp2_ascend.yaml b/tests/functional_tests/inference/qwen3/conf/inference/4b_tp2_ascend.yaml index 366e72081..31b6d36fc 100644 --- a/tests/functional_tests/inference/qwen3/conf/inference/4b_tp2_ascend.yaml +++ b/tests/functional_tests/inference/qwen3/conf/inference/4b_tp2_ascend.yaml @@ -1,6 +1,6 @@ llm: - model: /home/gitlab-runner/data/Qwen3-4B - tokenizer: /home/gitlab-runner/data/Qwen3-4B + model: /flagcicd/model/0b0ec7d3-1439-4e5d-9847-f5026942e397/latest + tokenizer: /flagcicd/model/0b0ec7d3-1439-4e5d-9847-f5026942e397/latest trust_remote_code: true tensor_parallel_size: 2 pipeline_parallel_size: 1 From d1cc929d0b0f544975a868d56e6f7e626bfcfd97 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Mon, 18 May 2026 16:00:05 +0800 Subject: [PATCH 10/17] test --- .github/workflows/demo_inference_test.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/demo_inference_test.yml b/.github/workflows/demo_inference_test.yml index 40bd63956..32775c436 100644 --- a/.github/workflows/demo_inference_test.yml +++ b/.github/workflows/demo_inference_test.yml @@ -90,6 +90,12 @@ jobs: cd $PROJECT_ROOT pip install . --no-build-isolation || { echo "❌ FlagScale install failed"; exit 1; } + # Install vllm-plugin-FL + pip install vllm-plugin-fl==0.1.0+vllm0.13.0 \ + --extra-index-url https://resource.flagos.net/repository/flagos-pypi-hosted/simple \ + || { echo "❌ vllm-plugin-FL install failed"; exit 1; } + echo "✅ vllm-plugin-FL installed successfully" + # Verify installation command -v flagscale || { echo "❌ FlagScale CLI not found in PATH"; exit 1; } echo "✅ FlagScale CLI installed: $(flagscale --version 2>/dev/null || echo 'version unknown')" From a75d26cbf223afc68f0a4cec51f6f89016cd1940 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Mon, 18 May 2026 16:43:06 +0800 Subject: [PATCH 11/17] test metax --- .github/workflows/demo_train_test.yml | 191 ++++++++++++++++++ .../train/qwen3/conf/train/data.yaml | 4 +- 2 files changed, 193 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/demo_train_test.yml diff --git a/.github/workflows/demo_train_test.yml b/.github/workflows/demo_train_test.yml new file mode 100644 index 000000000..9f11977d6 --- /dev/null +++ b/.github/workflows/demo_train_test.yml @@ -0,0 +1,191 @@ +name: Demo Train Test - Qwen3 + +# Demo workflow to test the flagscale-metax-c550 runner with real Qwen3 training +# Uses FlagScale training tests from tests/functional_tests/train/qwen3 + +on: + pull_request: + branches: ["main"] + workflow_dispatch: + inputs: + test_case: + description: 'Test case to run' + required: false + default: '0_6b_metax' + type: choice + options: + - 0_6b_metax + +jobs: + demo_train: + name: Qwen3 Train Demo + runs-on: test-lyz-train-metax + env: + PROJECT_ROOT: ${{ github.workspace }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Print test info + run: | + echo "==========================================" + echo "Qwen3 Train Demo Test" + echo "==========================================" + echo "Runner: ${{ runner.name }}" + echo "Test Case: ${{ inputs.test_case || '0_6b_metax' }}" + echo "Workflow: ${{ github.workflow }}" + echo "Run ID: ${{ github.run_id }}" + echo "Project Root: $PROJECT_ROOT" + echo "==========================================" + + - name: Check system info + run: | + echo "=== System Information ===" + echo "Hostname: $(hostname)" + echo "OS: $(uname -s)" + echo "Kernel: $(uname -r)" + echo "Architecture: $(uname -m)" + echo "" + echo "=== CPU Information ===" + lscpu | grep -E "Model name|CPU\(s\)|Thread|Core" || echo "lscpu not available" + echo "" + echo "=== Memory Information ===" + free -h || echo "free command not available" + echo "" + echo "=== Disk Space ===" + df -h / || echo "df command not available" + + - name: Check GPU availability + run: | + echo "=== GPU Information ===" + if command -v mx-smi &> /dev/null; then + echo "MetaX GPU detected:" + mx-smi + else + echo "mx-smi not found - no MetaX GPU or driver not installed" + fi + + if command -v nvidia-smi &> /dev/null; then + echo "NVIDIA GPU detected:" + nvidia-smi + else + echo "nvidia-smi not found" + fi + + - name: Check Python environment + run: | + echo "=== Python Environment ===" + if command -v python &> /dev/null; then + echo "Python version: $(python --version)" + echo "Python location: $(which python)" + else + echo "Python not found in PATH" + fi + + - name: Prepare data + run: | + mkdir -p /opt/data && cd /opt/data + wget https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/datasets/enron_emails_demo_text_document_qwen/enron_emails_demo_text_document_qwen.idx + wget https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/datasets/enron_emails_demo_text_document_qwen/enron_emails_demo_text_document_qwen.bin + mkdir -p /opt/qwentokenizer && cd /opt/qwentokenizer + wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/tokenizer_config.json" -O tokenizer_config.json + wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/qwen.tiktoken" -O qwen.tiktoken + wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/qwen_generation_utils.py" -O qwen_generation_utils.py + wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/tokenization_qwen.py" -O tokenization_qwen.py + + - name: Install FlagScale + run: | + echo "=== Installing FlagScale ===" + cd $PROJECT_ROOT + + # Install Megatron-LM-FL + pip install megatron_core==0.1.0+megatron0.15.0rc7 \ + --extra-index-url https://resource.flagos.net/repository/flagos-pypi-hosted/simple \ + || { echo "❌ Megatron-LM-FL install failed"; exit 1; } + echo "✅ Megatron-LM-FL installed successfully" + + # Install TransformerEngine-FL and dependencies + git clone --depth 1 https://github.com/flagos-ai/TransformerEngine-FL.git /workspace/TransformerEngine-FL \ + || { echo "❌ TransformerEngine-FL clone failed"; exit 1; } + TE_FL_SKIP_CUDA=1 pip install /workspace/TransformerEngine-FL --no-build-isolation \ + || { echo "❌ TransformerEngine-FL install failed"; exit 1; } + echo "✅ TransformerEngine-FL installed successfully" + + # Install FlagScale + pip install . --no-build-isolation || { echo "❌ FlagScale install failed"; exit 1; } + + # Verify installation + command -v flagscale || { echo "❌ FlagScale CLI not found in PATH"; exit 1; } + echo "✅ FlagScale CLI installed: $(flagscale --version 2>/dev/null || echo 'version unknown')" + + - name: Run Qwen3 train test + id: train_test + run: | + set -euo pipefail + cd $PROJECT_ROOT + + TEST_CASE="${{ inputs.test_case || '0_6b_metax' }}" + TEST_DIR="tests/functional_tests/train/qwen3" + CONFIG_FILE="$TEST_DIR/conf/${TEST_CASE}.yaml" + RESULTS_DIR="$TEST_DIR/test_results/${TEST_CASE}" + + echo "=== Running Qwen3 Train Test ===" + echo "Test case: $TEST_CASE" + echo "Config: $CONFIG_FILE" + echo "Results dir: $RESULTS_DIR" + echo "" + + # Check if config exists + if [ ! -f "$CONFIG_FILE" ]; then + echo "❌ Config file not found: $CONFIG_FILE" + exit 1 + fi + + echo "Starting training..." + flagscale train qwen3 --config "$CONFIG_FILE" --test || { + echo "❌ Training failed" + exit 1 + } + + echo "✅ Training completed" + + # Check results + if [ -d "$RESULTS_DIR" ]; then + echo "" + echo "=== Training Results ===" + ls -la "$RESULTS_DIR" + echo "" + + # Check for loss log if available + if [ -f "$RESULTS_DIR/hydra/logs/flagscale/flagscale.log" ]; then + echo "=== Training Log (last 50 lines) ===" + tail -50 "$RESULTS_DIR/hydra/logs/flagscale/flagscale.log" + echo "" + fi + else + echo "⚠️ Results directory not found at $RESULTS_DIR" + fi + + echo "" + echo "==========================================" + echo "✅ Qwen3 train test completed!" + echo "==========================================" + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: qwen3-train-results-${{ inputs.test_case || '0_6b_metax' }}-${{ github.run_id }} + path: tests/functional_tests/train/qwen3/test_results/${{ inputs.test_case || '0_6b_metax' }} + retention-days: 7 + if-no-files-found: warn + + - name: Test summary + if: always() + run: | + echo "=== Test Summary ===" + echo "Status: ${{ job.status }}" + echo "Runner: ${{ runner.name }}" + echo "Test Case: ${{ inputs.test_case || '0_6b_metax' }}" + echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')" diff --git a/tests/functional_tests/train/qwen3/conf/train/data.yaml b/tests/functional_tests/train/qwen3/conf/train/data.yaml index 8d9d4bd4c..bc9db18d8 100644 --- a/tests/functional_tests/train/qwen3/conf/train/data.yaml +++ b/tests/functional_tests/train/qwen3/conf/train/data.yaml @@ -1,9 +1,9 @@ data: - data_path: /home/gitlab-runner/data/pile_wikipedia_demo/pile_wikipedia_demo + data_path: /opt/data/pile_wikipedia_demo split: 1 no_mmap_bin_files: true tokenizer: tokenizer_type: QwenTokenizerFS - tokenizer_path: /home/gitlab-runner/tokenizers/qwentokenizer + tokenizer_path: /opt/qwentokenizer vocab_size: 151936 make_vocab_size_divisible_by: 64 From 3f0bb5597515d0bf665c5d53f5e3a0502de28a01 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Mon, 18 May 2026 18:02:48 +0800 Subject: [PATCH 12/17] print user --- .github/workflows/demo_train_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/demo_train_test.yml b/.github/workflows/demo_train_test.yml index 9f11977d6..7665c191f 100644 --- a/.github/workflows/demo_train_test.yml +++ b/.github/workflows/demo_train_test.yml @@ -37,6 +37,7 @@ jobs: echo "Workflow: ${{ github.workflow }}" echo "Run ID: ${{ github.run_id }}" echo "Project Root: $PROJECT_ROOT" + echo "Current User: $(whoami)" echo "==========================================" - name: Check system info From d08f73e3233028a9ae07e4c402317813896fced3 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Tue, 19 May 2026 10:03:29 +0800 Subject: [PATCH 13/17] env | sort --- .github/workflows/demo_train_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/demo_train_test.yml b/.github/workflows/demo_train_test.yml index 7665c191f..74f1ed39a 100644 --- a/.github/workflows/demo_train_test.yml +++ b/.github/workflows/demo_train_test.yml @@ -38,6 +38,8 @@ jobs: echo "Run ID: ${{ github.run_id }}" echo "Project Root: $PROJECT_ROOT" echo "Current User: $(whoami)" + echo "=== Environment Variables ===" + env | sort echo "==========================================" - name: Check system info From 82a68c8b59c901af122e56cb3db204eae3b033e3 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Tue, 19 May 2026 10:26:44 +0800 Subject: [PATCH 14/17] conda env --- .github/workflows/demo_train_test.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/demo_train_test.yml b/.github/workflows/demo_train_test.yml index 74f1ed39a..db0ae44fb 100644 --- a/.github/workflows/demo_train_test.yml +++ b/.github/workflows/demo_train_test.yml @@ -99,6 +99,9 @@ jobs: - name: Install FlagScale run: | + source /etc/profile.d/conda.sh + conda activate base + echo "=== Installing FlagScale ===" cd $PROJECT_ROOT @@ -125,6 +128,9 @@ jobs: - name: Run Qwen3 train test id: train_test run: | + source /etc/profile.d/conda.sh + conda activate base + set -euo pipefail cd $PROJECT_ROOT From bbcf237ac244b689ae4d073bd964dae863ac254b Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Tue, 19 May 2026 10:32:18 +0800 Subject: [PATCH 15/17] conda env --- .github/workflows/demo_train_test.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/demo_train_test.yml b/.github/workflows/demo_train_test.yml index db0ae44fb..f72da983c 100644 --- a/.github/workflows/demo_train_test.yml +++ b/.github/workflows/demo_train_test.yml @@ -106,9 +106,13 @@ jobs: cd $PROJECT_ROOT # Install Megatron-LM-FL - pip install megatron_core==0.1.0+megatron0.15.0rc7 \ - --extra-index-url https://resource.flagos.net/repository/flagos-pypi-hosted/simple \ - || { echo "❌ Megatron-LM-FL install failed"; exit 1; } + git clone \ + "https://github.com/flagos-ai/Megatron-LM-FL.git" /tmp/Megatron-LM-FL + git -C /tmp/Megatron-LM-FL checkout d092f8df49f7c0b5b4cae42d036b7e4a26b8fc81 + + echo "Installing Megatron-LM-FL via pip..." + pip install /tmp/Megatron-LM-FL --no-build-isolation --root-user-action=ignore \ + || { echo "Megatron-LM-FL install failed"; exit 1; } echo "✅ Megatron-LM-FL installed successfully" # Install TransformerEngine-FL and dependencies @@ -130,7 +134,7 @@ jobs: run: | source /etc/profile.d/conda.sh conda activate base - + set -euo pipefail cd $PROJECT_ROOT From 2f0c7b78e8ee5c647c63f2645912664a902d40ea Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Tue, 19 May 2026 11:15:51 +0800 Subject: [PATCH 16/17] test musa --- .github/workflows/test_demo_musa.yml | 188 +++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 .github/workflows/test_demo_musa.yml diff --git a/.github/workflows/test_demo_musa.yml b/.github/workflows/test_demo_musa.yml new file mode 100644 index 000000000..ccaf91691 --- /dev/null +++ b/.github/workflows/test_demo_musa.yml @@ -0,0 +1,188 @@ +name: Demo Train Test - Qwen3 + +# Demo workflow to test the flagscale-metax-c550 runner with real Qwen3 training +# Uses FlagScale training tests from tests/functional_tests/train/qwen3 + +on: + pull_request: + branches: ["main"] + workflow_dispatch: + inputs: + test_case: + description: 'Test case to run' + required: false + default: '0_6b_metax' + type: choice + options: + - 0_6b_metax + +jobs: + demo_train: + name: Qwen3 Train Demo + runs-on: test-lyz-musa + env: + PROJECT_ROOT: ${{ github.workspace }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Print test info + run: | + echo "==========================================" + echo "Qwen3 Train Demo Test" + echo "==========================================" + echo "Runner: ${{ runner.name }}" + echo "Test Case: ${{ inputs.test_case || '0_6b_metax' }}" + echo "Workflow: ${{ github.workflow }}" + echo "Run ID: ${{ github.run_id }}" + echo "Project Root: $PROJECT_ROOT" + echo "Current User: $(whoami)" + echo "=== Environment Variables ===" + env | sort + echo "==========================================" + + - name: Check system info + run: | + echo "=== System Information ===" + echo "Hostname: $(hostname)" + echo "OS: $(uname -s)" + echo "Kernel: $(uname -r)" + echo "Architecture: $(uname -m)" + echo "" + echo "=== CPU Information ===" + lscpu | grep -E "Model name|CPU\(s\)|Thread|Core" || echo "lscpu not available" + echo "" + echo "=== Memory Information ===" + free -h || echo "free command not available" + echo "" + echo "=== Disk Space ===" + df -h / || echo "df command not available" + + - name: Check GPU availability + run: | + echo "=== GPU Information ===" + mthreads-gmi + + - name: Check Python environment + run: | + echo "=== Python Environment ===" + if command -v python &> /dev/null; then + echo "Python version: $(python --version)" + echo "Python location: $(which python)" + else + echo "Python not found in PATH" + fi + + - name: Prepare data + run: | + mkdir -p /opt/data && cd /opt/data + wget https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/datasets/enron_emails_demo_text_document_qwen/enron_emails_demo_text_document_qwen.idx + wget https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/datasets/enron_emails_demo_text_document_qwen/enron_emails_demo_text_document_qwen.bin + mkdir -p /opt/qwentokenizer && cd /opt/qwentokenizer + wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/tokenizer_config.json" -O tokenizer_config.json + wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/qwen.tiktoken" -O qwen.tiktoken + wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/qwen_generation_utils.py" -O qwen_generation_utils.py + wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/tokenization_qwen.py" -O tokenization_qwen.py + + - name: Install FlagScale + run: | + + echo "=== Installing FlagScale ===" + cd $PROJECT_ROOT + + # Install Megatron-LM-FL + git clone \ + "https://github.com/flagos-ai/Megatron-LM-FL.git" /tmp/Megatron-LM-FL + git -C /tmp/Megatron-LM-FL checkout d092f8df49f7c0b5b4cae42d036b7e4a26b8fc81 + + echo "Installing Megatron-LM-FL via pip..." + pip install /tmp/Megatron-LM-FL --no-build-isolation --root-user-action=ignore \ + || { echo "Megatron-LM-FL install failed"; exit 1; } + echo "✅ Megatron-LM-FL installed successfully" + + # Install TransformerEngine-FL and dependencies + git clone --depth 1 https://github.com/flagos-ai/TransformerEngine-FL.git /workspace/TransformerEngine-FL \ + || { echo "❌ TransformerEngine-FL clone failed"; exit 1; } + TE_FL_SKIP_CUDA=1 pip install /workspace/TransformerEngine-FL --no-build-isolation \ + || { echo "❌ TransformerEngine-FL install failed"; exit 1; } + echo "✅ TransformerEngine-FL installed successfully" + + # Install FlagScale + pip install . --no-build-isolation || { echo "❌ FlagScale install failed"; exit 1; } + + # Verify installation + command -v flagscale || { echo "❌ FlagScale CLI not found in PATH"; exit 1; } + echo "✅ FlagScale CLI installed: $(flagscale --version 2>/dev/null || echo 'version unknown')" + + - name: Run Qwen3 train test + id: train_test + run: | + + set -euo pipefail + cd $PROJECT_ROOT + + TEST_CASE="${{ inputs.test_case || '0_6b_metax' }}" + TEST_DIR="tests/functional_tests/train/qwen3" + CONFIG_FILE="$TEST_DIR/conf/${TEST_CASE}.yaml" + RESULTS_DIR="$TEST_DIR/test_results/${TEST_CASE}" + + echo "=== Running Qwen3 Train Test ===" + echo "Test case: $TEST_CASE" + echo "Config: $CONFIG_FILE" + echo "Results dir: $RESULTS_DIR" + echo "" + + # Check if config exists + if [ ! -f "$CONFIG_FILE" ]; then + echo "❌ Config file not found: $CONFIG_FILE" + exit 1 + fi + + echo "Starting training..." + flagscale train qwen3 --config "$CONFIG_FILE" --test || { + echo "❌ Training failed" + exit 1 + } + + echo "✅ Training completed" + + # Check results + if [ -d "$RESULTS_DIR" ]; then + echo "" + echo "=== Training Results ===" + ls -la "$RESULTS_DIR" + echo "" + + # Check for loss log if available + if [ -f "$RESULTS_DIR/hydra/logs/flagscale/flagscale.log" ]; then + echo "=== Training Log (last 50 lines) ===" + tail -50 "$RESULTS_DIR/hydra/logs/flagscale/flagscale.log" + echo "" + fi + else + echo "⚠️ Results directory not found at $RESULTS_DIR" + fi + + echo "" + echo "==========================================" + echo "✅ Qwen3 train test completed!" + echo "==========================================" + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: qwen3-train-results-${{ inputs.test_case || '0_6b_metax' }}-${{ github.run_id }} + path: tests/functional_tests/train/qwen3/test_results/${{ inputs.test_case || '0_6b_metax' }} + retention-days: 7 + if-no-files-found: warn + + - name: Test summary + if: always() + run: | + echo "=== Test Summary ===" + echo "Status: ${{ job.status }}" + echo "Runner: ${{ runner.name }}" + echo "Test Case: ${{ inputs.test_case || '0_6b_metax' }}" + echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')" From ac4762b68a767768382077e232087a9e0ddc7c83 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Tue, 19 May 2026 14:29:27 +0800 Subject: [PATCH 17/17] test musa --- .github/workflows/test_demo_musa.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_demo_musa.yml b/.github/workflows/test_demo_musa.yml index ccaf91691..ec4830aea 100644 --- a/.github/workflows/test_demo_musa.yml +++ b/.github/workflows/test_demo_musa.yml @@ -1,4 +1,4 @@ -name: Demo Train Test - Qwen3 +name: Demo Train Test Musa - Qwen3 # Demo workflow to test the flagscale-metax-c550 runner with real Qwen3 training # Uses FlagScale training tests from tests/functional_tests/train/qwen3