From cd8b990d24b2a6e3fd903a633793dd968b1a5dd0 Mon Sep 17 00:00:00 2001
From: superxf <1208713646@qq.com>
Date: Wed, 1 Jul 2026 10:04:45 +0800
Subject: [PATCH] add qwen3 test ci

---
 .github/workflows/ci.yml     | 169 +++++++++++++++++++++++++++--------
 tests/test_qwen3_accuracy.py |  92 +++++++++++++++++++
 2 files changed, 226 insertions(+), 35 deletions(-)
 create mode 100644 tests/test_qwen3_accuracy.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f9be0f1..3d81d78 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -27,52 +27,151 @@ jobs:
           extra_args: --all-files
 
   unit-tests:
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, linux, arm64, npu]
     timeout-minutes: 30
+    defaults:
+      run:
+        working-directory: dist-checkout
+    env:
+      ASCEND_HOME_PATH: /usr/local/Ascend/cann-9.0.0
+      PTOAS_ROOT: ${{ github.workspace }}/dist-checkout/ptoas-bin
+      PTO_ISA_ROOT: ${{ github.workspace }}/dist-checkout/pto-isa
+      CMAKE_BUILD_PARALLEL_LEVEL: 16
+      CMAKE_C_COMPILER_LAUNCHER: ccache
+      CMAKE_CXX_COMPILER_LAUNCHER: ccache
+      CCACHE_DIR: /home/ci-runner/hw-native-sys-pypto-lib/ci-cache/ccache
+      CCACHE_MAXSIZE: 30G
+      # Shared with the container-side sim job (root) via the same ci-cache dir;
+      # 000 keeps cache files mutually writable across root and ci-runner.
+      CCACHE_UMASK: '000'
+      # Per-job pip dir: unlike ccache, pip has no shared-umask knob, so a shared
+      # pip cache would leave files owned by whichever job wrote first and block
+      # the other. The sim job uses ci-cache/pip-sim (see its bind mount).
+      PIP_CACHE_DIR: /home/ci-runner/hw-native-sys-pypto-lib/ci-cache/pip-a2a3
+
     steps:
       - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
         with:
-          python-version: "3.10"
+          path: dist-checkout
+          submodules: true
 
-      - name: Cache pip packages
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ runner.os }}-pip-unit-tests
-          restore-keys: |
-            ${{ runner.os }}-pip-
+      - name: Resolve toolchain + sync pypto source
+        # Sync the cached pypto checkout ONCE here; the Build step below reuses it,
+        # so the toolchain pins and the installed pypto are always the same
+        # revision. pypto owns ptoas (toolchain/versions.env) and pins pto-isa via
+        # its runtime submodule (runtime/pto_isa.pin); read both.
         run: |
+          PYPTO_SRC="/home/ci-runner/hw-native-sys-pypto-lib/ci-cache/pypto-src"
+          if [ -d "$PYPTO_SRC/.git" ]; then
+            echo "Cache hit — updating pypto"
+            git -C "$PYPTO_SRC" fetch --depth=1 origin HEAD  # shallow-fetch only the remote's current HEAD
+            git -C "$PYPTO_SRC" reset --hard FETCH_HEAD
+            git -C "$PYPTO_SRC" clean -ffdx  # -ff also removes nested repositories, -x also removes ignored files
+            git -C "$PYPTO_SRC" submodule foreach --recursive 'git reset --hard && git clean -ffdx' || true  # best-effort: a failure while tidying submodules is ignored
+          else
+            echo "Cache miss — cloning pypto"
+            git clone --recurse-submodules --depth=1 https://github.com/hw-native-sys/pypto.git "$PYPTO_SRC"
+          fi
+          git -C "$PYPTO_SRC" submodule update --init --recursive
+          rm -rf "$PYPTO_SRC/build" "$PYPTO_SRC/_skbuild" "$PYPTO_SRC/runtime/build"  # remove build trees left in the persistent checkout
+          {
+            grep -E '^PTOAS_VERSION=' "$PYPTO_SRC/toolchain/versions.env"  # the matching line is forwarded verbatim as NAME=value
+            echo "PTOAS_SHA256=$(sed -n 's/^PTOAS_SHA256_AARCH64=//p' "$PYPTO_SRC/toolchain/versions.env")"
+            echo "PTO_ISA_COMMIT=$(tr -d '[:space:]' < "$PYPTO_SRC/runtime/pto_isa.pin")"
+          } >> "$GITHUB_ENV"  # publishes PTOAS_VERSION, PTOAS_SHA256 and PTO_ISA_COMMIT to later steps
+
+      - name: Check NPU
+        working-directory: .
+        run: npu-smi info
 
-      - name: Install dependencies
+      # See the sim job: namespace the cache by pto-isa commit so an ISA bump
+      # forces a real recompile instead of reusing stale objects (issue #1139).
+      - name: Scope ccache to pto-isa version
+        run: echo "CCACHE_NAMESPACE=pto-isa-${PTO_ISA_COMMIT}" >> "$GITHUB_ENV"
+
+      - name: Clone pto-isa repository (pinned)
         run: |
-          python -m pip install --upgrade pip
-          pip install nanobind
-          pip install pytest
-          pip install torch --index-url https://download.pytorch.org/whl/cpu
+          rm -rf "$PTO_ISA_ROOT"
+          timeout 60 git clone https://github.com/hw-native-sys/pto-isa.git "$PTO_ISA_ROOT" \
+            || { rm -rf "$PTO_ISA_ROOT"; timeout 300 git clone https://gitcode.com/luohuan40/pto-isa.git "$PTO_ISA_ROOT"; }
+          cd "$PTO_ISA_ROOT"
+          git checkout "$PTO_ISA_COMMIT"  # quoted shell variable (exported via GITHUB_ENV), not inline expression expansion
 
-      - name: Get pypto HEAD commit
-        id: pypto-hash
-        run: echo "hash=$(git ls-remote https://github.com/hw-native-sys/pypto.git HEAD | cut -f1)" >> $GITHUB_OUTPUT
+      - name: Bootstrap conda + per-job venv + write activate.sh
+        # Per-job venv layered on conda py310-lib; activate.sh lets each later step
+        # enter the env with a single `source activate.sh`. set_env.sh wires up
+        # the Ascend/HCCL host environment; LD_LIBRARY_PATH is prepended with
+        # $CONDA_PREFIX/lib because ptoas needs a newer GLIBCXX than the system.
+        # PTO2_RING_* (set by the runner's systemd) are handled per-file in the
+        # run step — they break HCCL, so multi-card files unset them there.
+        run: |
+          source /home/ci-runner/miniconda3/etc/profile.d/conda.sh
+          conda activate py310-lib
+          python -m venv venv --system-site-packages
+          cat > activate.sh <<'EOF'
+          source /home/ci-runner/miniconda3/etc/profile.d/conda.sh
+          conda activate py310-lib
+          source "$GITHUB_WORKSPACE/dist-checkout/venv/bin/activate"
+          source /usr/local/Ascend/cann-9.0.0/set_env.sh
+          export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"
+          EOF
 
-      - name: Cache pypto wheels
-        id: cache-pypto
-        uses: actions/cache@v4
-        with:
-          path: /tmp/pypto-wheels
-          key: pypto-${{ runner.os }}-${{ runner.arch }}-py3.10-${{ steps.pypto-hash.outputs.hash }}
+      - name: Show ccache stats (before)
+        run: ccache -s || true
+
+      - name: Build and install pypto and simpler
+        # Reuses the pypto source synced by the resolve step above (same revision
+        # the toolchain pins were read from).
+        run: |
+          source activate.sh
+          PYPTO_SRC="/home/ci-runner/hw-native-sys-pypto-lib/ci-cache/pypto-src"
+          python -m pip install --upgrade pip
+          pip install scikit-build-core nanobind cmake ninja
+          pip install --no-build-isolation "$PYPTO_SRC"
+          pip install --no-build-isolation "$PYPTO_SRC/runtime"
 
-      - name: Build pypto wheels
-        if: steps.cache-pypto.outputs.cache-hit != 'true'
+      - name: Install runner dependencies
         run: |
-          git clone --recurse-submodules --depth=1 https://github.com/hw-native-sys/pypto.git /tmp/pypto
-          pip wheel /tmp/pypto -w /tmp/pypto-wheels --no-deps
-          pip wheel /tmp/pypto/runtime -w /tmp/pypto-wheels --no-deps
+          source activate.sh
+          pip install nanobind
+          pip install torch transformers safetensors numpy
 
-      - name: Install pypto and simpler
-        run: pip install /tmp/pypto-wheels/*.whl
+      - name: Show ccache stats (after)
+        run: ccache -s || true
 
-      - name: Run unit tests
+      - name: Install ptoas (local cache)
         env:
-          PYTHONPATH: ${{ github.workspace }}
-        run: python -m pytest tests/test_cli.py tests/test_batching.py
+          PTOAS_CACHE_DIR: /home/ci-runner/hw-native-sys-pypto-lib/ci-cache/ptoas
+        run: |
+          CACHE_ARCHIVE="$PTOAS_CACHE_DIR/ptoas-aarch64-${PTOAS_VERSION}-${PTOAS_SHA256}.tar.gz"  # name embeds version + checksum, so a new pin never reuses an old archive
+          download_ptoas() {
+            echo "Downloading ptoas ${PTOAS_VERSION}"
+            mkdir -p "$PTOAS_CACHE_DIR"
+            curl --fail --location --retry 3 --retry-all-errors \
+              https://github.com/hw-native-sys/PTOAS/releases/download/${PTOAS_VERSION}/ptoas-bin-aarch64.tar.gz \
+              -o "$CACHE_ARCHIVE.tmp"  # download under a temporary name first
+            echo "${PTOAS_SHA256}  $CACHE_ARCHIVE.tmp" | sha256sum -c -  # verify against the pinned checksum
+            mv "$CACHE_ARCHIVE.tmp" "$CACHE_ARCHIVE"  # publish under the final name last (assumes the runner's default errexit shell stops on a checksum mismatch; confirm)
+          }
+          if [ ! -f "$CACHE_ARCHIVE" ]; then
+            echo "Cache miss"
+            download_ptoas
+          elif ! echo "${PTOAS_SHA256}  $CACHE_ARCHIVE" | sha256sum -c -; then  # re-verify the cached archive on every run
+            echo "Cache corrupted — removing and re-downloading"
+            rm -f "$CACHE_ARCHIVE"
+            download_ptoas
+          else
+            echo "Cache hit — using $CACHE_ARCHIVE"
+          fi
+          mkdir -p "$PTOAS_ROOT"
+          tar -xzf "$CACHE_ARCHIVE" -C "$PTOAS_ROOT"
+          chmod +x "$PTOAS_ROOT/ptoas"  # NOTE(review): assumes the archive ships both ptoas and bin/ptoas; confirm
+          chmod +x "$PTOAS_ROOT/bin/ptoas"
+
+      - name: Run Qwen3 accuracy guard
+        env:
+          PYTHONPATH: ${{ github.workspace }}/dist-checkout
+          PYPTO_QWEN3_MODEL_DIR: /data/l00955553/model/Qwen3-14B
+        run: |
+          source activate.sh
+          python -m pytest tests/test_qwen3_accuracy.py -q -s  # -q: terse report, -s: no output capture
diff --git a/tests/test_qwen3_accuracy.py b/tests/test_qwen3_accuracy.py
new file mode 100644
index 0000000..5f3d162
--- /dev/null
+++ b/tests/test_qwen3_accuracy.py
@@ -0,0 +1,92 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+
+"""Qwen3 output accuracy guard for CI."""
+
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+
+MODEL_DIR_ENV = os.environ.get("PYPTO_QWEN3_MODEL_DIR")
+# Empty counts as unset: Path("") would silently resolve to the current directory.
+MODEL_DIR = Path(MODEL_DIR_ENV) if MODEL_DIR_ENV else None
+MODEL_ID = "qwen3-14b-accuracy"
+PLATFORM = os.environ.get("PYPTO_QWEN3_PLATFORM", "a2a3")
+DEVICE_ID_ENV = os.environ.get("DEVICE_ID")
+try:
+    DEVICE_ID = int(DEVICE_ID_ENV) if DEVICE_ID_ENV is not None else None
+except ValueError:
+    DEVICE_ID = None  # Reported by the test; importing this module must never raise.
+PROMPT = "The capital of France is"
+MAX_NEW_TOKENS = 8
+
+EXPECTED_TOKEN_IDS = [12095, 13, 576, 6722, 315, 9625, 374, 12095]
+
+
+def test_qwen3_output_matches_expected_tokens():
+    """Generate from PROMPT at temperature 0.0 and require exactly EXPECTED_TOKEN_IDS.
+
+    Settings are validated here, not at import time, so a bad value fails this test only.
+    """
+    if MODEL_DIR is None:
+        pytest.fail("PYPTO_QWEN3_MODEL_DIR is required")
+    if not MODEL_DIR.is_dir():
+        pytest.fail(f"Qwen3 model weights not found: {MODEL_DIR}")
+    if DEVICE_ID_ENV is None:
+        pytest.fail("DEVICE_ID is required")
+    if DEVICE_ID is None:
+        pytest.fail(f"DEVICE_ID must be an integer, got {DEVICE_ID_ENV!r}")
+
+    from examples.model.qwen3_14b.runner.npu_executor import Qwen314BPyptoExecutor
+    from python.core.engine import LLMEngine
+    from python.core.kv_cache import KvCacheManager
+    from python.core.types import GenerateConfig, RuntimeConfig
+
+    kv_cache_manager = KvCacheManager()
+    executor = Qwen314BPyptoExecutor(kv_cache_manager, platform=PLATFORM, device_id=DEVICE_ID)
+    try:
+        engine = LLMEngine(kv_cache_manager=kv_cache_manager, executor=executor)
+        engine.init_model(
+            model_id=MODEL_ID,
+            model_dir=str(MODEL_DIR),
+            model_format="huggingface",
+            runtime_config=RuntimeConfig(
+                page_size=128,
+                max_batch_size=16,
+                max_seq_len=512,
+                max_new_tokens=MAX_NEW_TOKENS,
+                device="cpu",
+                kv_dtype="bfloat16",
+                weight_dtype="float32",
+            ),
+        )
+        result = engine.generate_result(
+            MODEL_ID,
+            PROMPT,
+            GenerateConfig(max_new_tokens=MAX_NEW_TOKENS, temperature=0.0, top_p=1.0, top_k=None),
+        )
+    finally:
+        executor.close()
+    assert result.token_ids == EXPECTED_TOKEN_IDS, (
+        f"Qwen3 output changed for prompt {PROMPT!r}:\n"
+        f"expected token_ids: {EXPECTED_TOKEN_IDS}\n"
+        f"actual token_ids:   {result.token_ids}\n"
+        f"actual text:        {result.text!r}\n"
+        f"finish_reason:      {result.finish_reason}"
+    )