Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions .github/workflows/demo_inference_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
name: Demo Inference Test - Qwen3

# Demo workflow that exercises the runner labelled `test-lyz-infer` with a
# real Qwen3 inference run. The test cases come from the FlagScale functional
# tests under tests/functional_tests/inference/qwen3.

on:
  # Triggered for pull requests that target main ...
  pull_request:
    branches:
      - main
  # ... and on demand, where the test case can be chosen from a fixed list.
  workflow_dispatch:
    inputs:
      test_case:
        description: "Test case to run"
        required: false
        default: "4b_tp2_ascend"
        type: choice
        options:
          - 4b_tp2
          - 4b_tp2_ascend

jobs:
  demo_inference:
    name: Qwen3 Inference Demo
    runs-on: test-lyz-infer
    env:
      # Checkout location; the shell steps below `cd` into it.
      PROJECT_ROOT: ${{ github.workspace }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

- name: Print test info
  # Log banner identifying the run. The runner is taken from the `runner`
  # context rather than a hard-coded string, so the log shows the machine
  # that actually picked up the job and cannot drift from `runs-on`.
  # The test case falls back to '4b_tp2_ascend' when no workflow_dispatch
  # input is present (e.g. on pull_request events).
  run: |
    echo "=========================================="
    echo "Qwen3 Inference Demo Test"
    echo "=========================================="
    echo "Runner: ${{ runner.name }}"
    echo "Test Case: ${{ inputs.test_case || '4b_tp2_ascend' }}"
    echo "Workflow: ${{ github.workflow }}"
    echo "Run ID: ${{ github.run_id }}"
    echo "Project Root: $PROJECT_ROOT"
    echo "=========================================="

- name: Check system info
  # Diagnostic only: records host, CPU, memory and disk details in the log.
  run: |
    # Host and kernel identity.
    echo "=== System Information ==="
    echo "Hostname: $(hostname)"
    echo "OS: $(uname -s)"
    echo "Kernel: $(uname -r)"
    echo "Architecture: $(uname -m)"
    echo ""

    # CPU summary; the fallback message is printed when the pipeline fails.
    echo "=== CPU Information ==="
    lscpu | grep -E "Model name|CPU\(s\)|Thread|Core" || echo "lscpu not available"
    echo ""

    # Memory usage in human-readable units.
    echo "=== Memory Information ==="
    free -h || echo "free command not available"
    echo ""

    # Free space on the root filesystem.
    echo "=== Disk Space ==="
    df -h / || echo "df command not available"

- name: Check GPU availability
  # Probes for the NVIDIA and Ascend management CLIs and prints their
  # report when present; a missing tool is only reported, not an error.
  run: |
    echo "=== GPU Information ==="

    # NVIDIA: handle the "tool missing" case first.
    if ! command -v nvidia-smi &> /dev/null; then
      echo "nvidia-smi not found - no NVIDIA GPU or driver not installed"
    else
      echo "NVIDIA GPU detected:"
      nvidia-smi
    fi

    # Ascend NPU: same probe using npu-smi.
    if ! command -v npu-smi &> /dev/null; then
      echo "npu-smi not found - no Ascend NPU"
    else
      echo "Ascend NPU detected:"
      npu-smi info
    fi

- name: Check Python environment
  # Reports which `python` (if any) is first on PATH for this job.
  run: |
    echo "=== Python Environment ==="

    if ! command -v python &> /dev/null; then
      echo "Python not found in PATH"
    else
      echo "Python version: $(python --version)"
      echo "Python location: $(which python)"
    fi

- name: Install FlagScale
  # Installs FlagScale from the checkout plus the pinned vllm-plugin-fl
  # wheel, then verifies the `flagscale` CLI is reachable. Every install
  # command fails the step explicitly with a message on error.
  run: |
    echo "=== Installing FlagScale ==="
    # Quoted so a workspace path containing spaces is not word-split.
    cd "$PROJECT_ROOT"
    pip install . --no-build-isolation || { echo "❌ FlagScale install failed"; exit 1; }

    # Install vllm-plugin-FL (exact version, resolved via the extra index)
    pip install vllm-plugin-fl==0.1.0+vllm0.13.0 \
      --extra-index-url https://resource.flagos.net/repository/flagos-pypi-hosted/simple \
      || { echo "❌ vllm-plugin-FL install failed"; exit 1; }
    echo "✅ vllm-plugin-FL installed successfully"

    # Verify installation
    command -v flagscale || { echo "❌ FlagScale CLI not found in PATH"; exit 1; }
    echo "✅ FlagScale CLI installed: $(flagscale --version 2>/dev/null || echo 'version unknown')"

- name: Run Qwen3 inference test
  id: inference_test
  # Runs the selected Qwen3 inference case through the FlagScale CLI.
  # Fails when the config file is missing or the CLI exits non-zero.
  # A missing output file or a difference from the gold file is only
  # reported as a warning and does not fail the step.
  run: |
    set -euo pipefail
    # Quoted so a workspace path containing spaces is not word-split.
    cd "$PROJECT_ROOT"

    # Falls back to '4b_tp2_ascend' when no workflow_dispatch input exists.
    TEST_CASE="${{ inputs.test_case || '4b_tp2_ascend' }}"
    TEST_DIR="tests/functional_tests/inference/qwen3"
    CONFIG_FILE="$TEST_DIR/conf/${TEST_CASE}.yaml"
    RESULTS_DIR="$TEST_DIR/test_results/${TEST_CASE}"
    GOLD_FILE="$TEST_DIR/results_gold/${TEST_CASE}"

    echo "=== Running Qwen3 Inference Test ==="
    echo "Test case: $TEST_CASE"
    echo "Config: $CONFIG_FILE"
    echo "Results dir: $RESULTS_DIR"
    echo ""

    # Check if config exists
    if [ ! -f "$CONFIG_FILE" ]; then
      echo "❌ Config file not found: $CONFIG_FILE"
      exit 1
    fi

    # Create results directory
    mkdir -p "$RESULTS_DIR"

    # Run inference using flagscale. The path that was just validated is
    # passed on, so the existence check and the run cannot diverge.
    echo "Starting inference..."
    flagscale inference qwen3 --config "$CONFIG_FILE" --test || {
      echo "❌ Inference failed"
      exit 1
    }

    echo "✅ Inference completed"

    # Check results
    if [ -f "$RESULTS_DIR/output.txt" ]; then
      echo ""
      echo "=== Inference Output ==="
      cat "$RESULTS_DIR/output.txt"
      echo ""

      # Compare with gold results if available. `diff` runs as an `if`
      # condition, so a non-zero exit does not trip `set -e`.
      if [ -f "$GOLD_FILE" ]; then
        echo "=== Comparing with gold results ==="
        if diff -u "$GOLD_FILE" "$RESULTS_DIR/output.txt"; then
          echo "✅ Results match gold standard"
        else
          echo "⚠️ Results differ from gold standard (this may be expected)"
        fi
      fi
    else
      echo "⚠️ Output file not found at $RESULTS_DIR/output.txt"
    fi

    echo ""
    echo "=========================================="
    echo "✅ Qwen3 inference test completed!"
    echo "=========================================="

- name: Upload test results
  # Runs even when earlier steps failed, so partial results are still saved.
  uses: actions/upload-artifact@v4
  if: always()
  with:
    # Artifact name carries the test case and run id.
    name: qwen3-inference-results-${{ inputs.test_case || '4b_tp2_ascend' }}-${{ github.run_id }}
    # Results directory of the selected test case.
    path: tests/functional_tests/inference/qwen3/test_results/${{ inputs.test_case || '4b_tp2_ascend' }}
    # An empty or missing directory produces a warning, not a failure.
    if-no-files-found: warn
    retention-days: 7

- name: Test summary
  # Always prints a closing summary. The runner is read from the `runner`
  # context instead of a hard-coded string, so it reflects the machine
  # that ran the job and cannot drift from `runs-on`.
  if: always()
  run: |
    echo "=== Test Summary ==="
    echo "Status: ${{ job.status }}"
    echo "Runner: ${{ runner.name }}"
    echo "Test Case: ${{ inputs.test_case || '4b_tp2_ascend' }}"
    echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
204 changes: 204 additions & 0 deletions .github/workflows/demo_train_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
name: Demo Train Test - Qwen3

# Demo workflow that exercises the runner labelled `test-lyz-train-metax`
# with a real Qwen3 training run. The test cases come from the FlagScale
# functional tests under tests/functional_tests/train/qwen3.

on:
  # Triggered for pull requests that target main ...
  pull_request:
    branches:
      - main
  # ... and on demand, where the test case can be chosen from a fixed list.
  workflow_dispatch:
    inputs:
      test_case:
        description: "Test case to run"
        required: false
        default: "0_6b_metax"
        type: choice
        options:
          - 0_6b_metax

jobs:
  demo_train:
    name: Qwen3 Train Demo
    runs-on: test-lyz-train-metax
    env:
      # Checkout location; the shell steps below `cd` into it.
      PROJECT_ROOT: ${{ github.workspace }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

- name: Print test info
  # Log banner identifying the run, followed by the job environment.
  # The test case falls back to '0_6b_metax' when no workflow_dispatch
  # input is present (e.g. on pull_request events).
  run: |
    echo "=========================================="
    echo "Qwen3 Train Demo Test"
    echo "=========================================="
    echo "Runner: ${{ runner.name }}"
    echo "Test Case: ${{ inputs.test_case || '0_6b_metax' }}"
    echo "Workflow: ${{ github.workflow }}"
    echo "Run ID: ${{ github.run_id }}"
    echo "Project Root: $PROJECT_ROOT"
    echo "Current User: $(whoami)"
    echo "=== Environment Variables ==="
    # The job log is readable by anyone with access to the run, so the
    # values of credential-looking variables are masked instead of being
    # dumped verbatim. Variable names are still listed for debugging.
    env | sort | awk -F= '
      toupper($1) ~ /TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIAL|KEY/ { print $1 "=***"; next }
      { print }
    '
    echo "=========================================="

- name: Check system info
  # Diagnostic only: logs host, CPU, memory and root-disk information.
  run: |
    # --- host / kernel ---
    echo "=== System Information ==="
    echo "Hostname: $(hostname)"
    echo "OS: $(uname -s)"
    echo "Kernel: $(uname -r)"
    echo "Architecture: $(uname -m)"
    echo ""

    # --- CPU (fallback message when the lscpu/grep pipeline fails) ---
    echo "=== CPU Information ==="
    lscpu | grep -E "Model name|CPU\(s\)|Thread|Core" || echo "lscpu not available"
    echo ""

    # --- memory ---
    echo "=== Memory Information ==="
    free -h || echo "free command not available"
    echo ""

    # --- disk (root filesystem only) ---
    echo "=== Disk Space ==="
    df -h / || echo "df command not available"

- name: Check GPU availability
  # Probes for the MetaX and NVIDIA management CLIs and prints their
  # report when present; a missing tool is only reported, not an error.
  run: |
    echo "=== GPU Information ==="

    # MetaX: handle the "tool missing" case first.
    if ! command -v mx-smi &> /dev/null; then
      echo "mx-smi not found - no MetaX GPU or driver not installed"
    else
      echo "MetaX GPU detected:"
      mx-smi
    fi

    # NVIDIA: same probe using nvidia-smi.
    if ! command -v nvidia-smi &> /dev/null; then
      echo "nvidia-smi not found"
    else
      echo "NVIDIA GPU detected:"
      nvidia-smi
    fi

- name: Check Python environment
  # Reports which `python` (if any) is first on PATH for this step.
  # Note: this step does not activate conda, unlike the install/run steps.
  run: |
    echo "=== Python Environment ==="

    if ! command -v python &> /dev/null; then
      echo "Python not found in PATH"
    else
      echo "Python version: $(python --version)"
      echo "Python location: $(which python)"
    fi

- name: Prepare data
  # Downloads the demo dataset (.idx/.bin) into /opt/data and the Qwen
  # tokenizer files into /opt/qwentokenizer.
  #
  # Every download names its target with -O. Without it, wget keeps an
  # existing file and saves the new copy as "<name>.1", so a re-run on a
  # runner whose /opt/data persists would leave the old dataset in place
  # and pile up numbered duplicates.
  run: |
    mkdir -p /opt/data && cd /opt/data
    wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/datasets/enron_emails_demo_text_document_qwen/enron_emails_demo_text_document_qwen.idx" -O enron_emails_demo_text_document_qwen.idx
    wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/datasets/enron_emails_demo_text_document_qwen/enron_emails_demo_text_document_qwen.bin" -O enron_emails_demo_text_document_qwen.bin
    mkdir -p /opt/qwentokenizer && cd /opt/qwentokenizer
    wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/tokenizer_config.json" -O tokenizer_config.json
    wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/qwen.tiktoken" -O qwen.tiktoken
    wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/qwen_generation_utils.py" -O qwen_generation_utils.py
    wget "https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/tokenizers/qwentokenizer/tokenization_qwen.py" -O tokenization_qwen.py

- name: Install FlagScale
  # Inside the conda `base` environment: installs Megatron-LM-FL (pinned
  # commit), TransformerEngine-FL and FlagScale itself, then verifies the
  # `flagscale` CLI is reachable.
  #
  # Clone targets are removed first because `git clone` refuses to write
  # into an existing non-empty directory; on a runner whose filesystem
  # persists between jobs a second run would otherwise fail at the clone.
  run: |
    source /etc/profile.d/conda.sh
    conda activate base

    echo "=== Installing FlagScale ==="
    # Quoted so a workspace path containing spaces is not word-split.
    cd "$PROJECT_ROOT"

    # Install Megatron-LM-FL
    rm -rf /tmp/Megatron-LM-FL
    git clone \
      "https://github.com/flagos-ai/Megatron-LM-FL.git" /tmp/Megatron-LM-FL \
      || { echo "❌ Megatron-LM-FL clone failed"; exit 1; }
    git -C /tmp/Megatron-LM-FL checkout d092f8df49f7c0b5b4cae42d036b7e4a26b8fc81 \
      || { echo "❌ Megatron-LM-FL checkout failed"; exit 1; }

    echo "Installing Megatron-LM-FL via pip..."
    pip install /tmp/Megatron-LM-FL --no-build-isolation --root-user-action=ignore \
      || { echo "Megatron-LM-FL install failed"; exit 1; }
    echo "✅ Megatron-LM-FL installed successfully"

    # Install TransformerEngine-FL and dependencies
    # NOTE(review): this clones the default branch head (--depth 1) and is
    # not pinned to a commit like Megatron-LM-FL above - confirm intended.
    rm -rf /workspace/TransformerEngine-FL
    git clone --depth 1 https://github.com/flagos-ai/TransformerEngine-FL.git /workspace/TransformerEngine-FL \
      || { echo "❌ TransformerEngine-FL clone failed"; exit 1; }
    TE_FL_SKIP_CUDA=1 pip install /workspace/TransformerEngine-FL --no-build-isolation \
      || { echo "❌ TransformerEngine-FL install failed"; exit 1; }
    echo "✅ TransformerEngine-FL installed successfully"

    # Install FlagScale
    pip install . --no-build-isolation || { echo "❌ FlagScale install failed"; exit 1; }

    # Verify installation
    command -v flagscale || { echo "❌ FlagScale CLI not found in PATH"; exit 1; }
    echo "✅ FlagScale CLI installed: $(flagscale --version 2>/dev/null || echo 'version unknown')"

- name: Run Qwen3 train test
  id: train_test
  # Runs the selected Qwen3 training case through the FlagScale CLI inside
  # the conda `base` environment. Fails when the config file is missing or
  # the CLI exits non-zero; a missing results directory is only reported
  # as a warning.
  run: |
    # conda is activated before strict mode is switched on below.
    source /etc/profile.d/conda.sh
    conda activate base

    set -euo pipefail
    # Quoted so a workspace path containing spaces is not word-split.
    cd "$PROJECT_ROOT"

    # Falls back to '0_6b_metax' when no workflow_dispatch input exists.
    TEST_CASE="${{ inputs.test_case || '0_6b_metax' }}"
    TEST_DIR="tests/functional_tests/train/qwen3"
    CONFIG_FILE="$TEST_DIR/conf/${TEST_CASE}.yaml"
    RESULTS_DIR="$TEST_DIR/test_results/${TEST_CASE}"

    echo "=== Running Qwen3 Train Test ==="
    echo "Test case: $TEST_CASE"
    echo "Config: $CONFIG_FILE"
    echo "Results dir: $RESULTS_DIR"
    echo ""

    # Check if config exists
    if [ ! -f "$CONFIG_FILE" ]; then
      echo "❌ Config file not found: $CONFIG_FILE"
      exit 1
    fi

    echo "Starting training..."
    flagscale train qwen3 --config "$CONFIG_FILE" --test || {
      echo "❌ Training failed"
      exit 1
    }

    echo "✅ Training completed"

    # Check results
    if [ -d "$RESULTS_DIR" ]; then
      echo ""
      echo "=== Training Results ==="
      ls -la "$RESULTS_DIR"
      echo ""

      # Check for loss log if available
      if [ -f "$RESULTS_DIR/hydra/logs/flagscale/flagscale.log" ]; then
        echo "=== Training Log (last 50 lines) ==="
        # `-n 50` is the POSIX spelling of the obsolete `tail -50`.
        tail -n 50 "$RESULTS_DIR/hydra/logs/flagscale/flagscale.log"
        echo ""
      fi
    else
      echo "⚠️ Results directory not found at $RESULTS_DIR"
    fi

    echo ""
    echo "=========================================="
    echo "✅ Qwen3 train test completed!"
    echo "=========================================="

- name: Upload test results
  # Runs even when earlier steps failed, so partial results are still saved.
  uses: actions/upload-artifact@v4
  if: always()
  with:
    # Artifact name carries the test case and run id.
    name: qwen3-train-results-${{ inputs.test_case || '0_6b_metax' }}-${{ github.run_id }}
    # Results directory of the selected test case.
    path: tests/functional_tests/train/qwen3/test_results/${{ inputs.test_case || '0_6b_metax' }}
    # An empty or missing directory produces a warning, not a failure.
    if-no-files-found: warn
    retention-days: 7

- name: Test summary
  # Closing summary; `always()` makes it run after failures as well.
  run: |
    echo "=== Test Summary ==="
    # Job status and runner name are expanded from the Actions contexts.
    echo "Status: ${{ job.status }}"
    echo "Runner: ${{ runner.name }}"
    # Test case defaults to '0_6b_metax' when no input was supplied.
    echo "Test Case: ${{ inputs.test_case || '0_6b_metax' }}"
    # UTC timestamp taken when this step runs.
    echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
  if: always()
Loading
Loading