lightseekorg · key4ng · May 18, 2026 · May 18, 2026 · May 18, 2026 · coderabbitai
@@ -19,6 +19,11 @@ on:
         required: true
         type: number
         description: "Job timeout in minutes"
+      test_timeout:
+        required: false
+        type: number
+        default: 25
+        description: "pytest step timeout in minutes (must be < job timeout)"
       extra_deps:
         required: false
         type: string
@@ -113,7 +118,7 @@ jobs:
 
       # Run tests
       - name: Run E2E tests
-        timeout-minutes: 25
+        timeout-minutes: ${{ inputs.test_timeout }}
         env:
           SHOW_WORKER_LOGS: "0"
           SHOW_ROUTER_LOGS: "1"

@@ -466,22 +466,28 @@ jobs:
           || (needs.detect-changes.result == 'success'
               && (needs.detect-changes.outputs.common == 'true'
                   || needs.detect-changes.outputs.chat-completions == 'true')))
+    # Absorbs the previously-2-GPU chat tests (gpt-oss-20b and
+    # Qwen2.5-14B at tp=1). Timeouts and test_timeout bumped accordingly.
     strategy:
       fail-fast: false
       matrix:
         include:
           - engine: sglang
-            timeout: 30
+            timeout: 60
+            test_timeout: 45
           - engine: vllm
-            timeout: 20
+            timeout: 45
+            test_timeout: 35
           - engine: trtllm
-            timeout: 90
+            timeout: 120
+            test_timeout: 90
     uses: ./.github/workflows/e2e-gpu-job.yml
     with:
       engine: ${{ matrix.engine }}
       gpu_tier: "1"
       runner: 1-gpu-h100
       timeout: ${{ matrix.timeout }}
+      test_timeout: ${{ matrix.test_timeout }}
       test_dirs: e2e_test/chat_completions
     secrets: inherit
 
@@ -573,29 +579,12 @@ jobs:
 
   # === 2 GPU ===
 
-  e2e-2gpu-chat:
-    name: e2e-2gpu-chat (${{ matrix.engine }})
-    needs: [e2e-1gpu-chat]
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - engine: sglang
-            timeout: 20
-          - engine: vllm
-            timeout: 20
-          - engine: trtllm
-            timeout: 30
-    uses: ./.github/workflows/e2e-gpu-job.yml
-    with:
-      engine: ${{ matrix.engine }}
-      gpu_tier: "2"
-      runner: 2-gpu-h100
-      timeout: ${{ matrix.timeout }}
-      test_dirs: e2e_test/chat_completions
-    secrets: inherit
+  # Note: the previous e2e-2gpu-chat job was retired alongside the tp=1
+  # drop on Qwen2.5-14B-Instruct / gpt-oss-20b — every chat_completions
+  # class that used those models fits on a single H100 now, so all
+  # chat_completions tests run under e2e-1gpu-chat above.
 
-  e2e-2gpu-responses:
+  e2e-1gpu-responses:
     needs: [build-wheel, detect-changes]
     if: >-
       always()
@@ -608,8 +597,8 @@ jobs:
     uses: ./.github/workflows/e2e-gpu-job.yml
     with:
       engine: sglang
-      gpu_tier: "2"
-      runner: 2-gpu-h100
+      gpu_tier: "1"
+      runner: 1-gpu-h100
       timeout: 30
       test_dirs: e2e_test/responses
       setup_agentic_deps: true
@@ -971,7 +960,7 @@ jobs:
           path: benchmark_go_bindings/
 
   finish:
-    needs: [pre-commit, python-lint, grpc-proto-build-check, build-wheel, python-unit-tests, unit-tests, benchmarks, e2e-1gpu-chat, e2e-1gpu-completions, e2e-1gpu-embeddings, e2e-1gpu-gateway, e2e-2gpu-chat, e2e-2gpu-responses, e2e-2gpu-pd, e2e-4gpu-chat, e2e-4gpu-gateway, e2e-vendor, go-unit-tests, go-bindings-e2e]
+    needs: [pre-commit, python-lint, grpc-proto-build-check, build-wheel, python-unit-tests, unit-tests, benchmarks, e2e-1gpu-chat, e2e-1gpu-completions, e2e-1gpu-embeddings, e2e-1gpu-gateway, e2e-1gpu-responses, e2e-2gpu-pd, e2e-4gpu-chat, e2e-4gpu-gateway, e2e-vendor, go-unit-tests, go-bindings-e2e]
     if: always()
     runs-on: k8s-runner-cpu
     permissions: {}
@@ -989,8 +978,7 @@ jobs:
                 "${{ needs.e2e-1gpu-completions.result }}" == "failure" || \
                 "${{ needs.e2e-1gpu-embeddings.result }}" == "failure" || \
                 "${{ needs.e2e-1gpu-gateway.result }}" == "failure" || \
-                "${{ needs.e2e-2gpu-chat.result }}" == "failure" || \
-                "${{ needs.e2e-2gpu-responses.result }}" == "failure" || \
+                "${{ needs.e2e-1gpu-responses.result }}" == "failure" || \
                 "${{ needs.e2e-2gpu-pd.result }}" == "failure" || \
                 "${{ needs.e2e-4gpu-chat.result }}" == "failure" || \
                 "${{ needs.e2e-4gpu-gateway.result }}" == "failure" || \

@@ -1591,7 +1591,7 @@ def test_conflicting_defs_required_tool_choice(self, model, api_client):
 
 
 @pytest.mark.engine("sglang", "vllm", "trtllm")
-@pytest.mark.gpu(2)
+@pytest.mark.gpu(1)
 @pytest.mark.model("Qwen/Qwen2.5-14B-Instruct")
 @pytest.mark.gateway(extra_args=["--tool-call-parser", "qwen", "--history-backend", "memory"])
 @pytest.mark.parametrize("setup_backend", ["grpc"], indirect=True)

@@ -360,7 +360,7 @@ def _delta_text(delta):
 
 
 @pytest.mark.engine("sglang", "vllm", "trtllm")
-@pytest.mark.gpu(2)
+@pytest.mark.gpu(1)
 @pytest.mark.model("openai/gpt-oss-20b")
 @pytest.mark.gateway(extra_args=["--history-backend", "memory"])
 class TestChatCompletionGptOss(TestChatCompletion):

@@ -140,7 +140,7 @@ class TestStructuredOutputRegular(_TestStructuredOutputBase):
 
 
 @pytest.mark.engine("sglang", "vllm", "trtllm")
-@pytest.mark.gpu(2)
+@pytest.mark.gpu(1)
 @pytest.mark.model("openai/gpt-oss-20b")
 @pytest.mark.gateway(extra_args=["--history-backend", "memory"])
 @pytest.mark.parametrize("setup_backend", ["grpc"], indirect=True)

@@ -168,7 +168,7 @@ def run_chat_completion():
 
 
 @pytest.mark.engine("sglang", "vllm", "trtllm")
-@pytest.mark.gpu(2)
+@pytest.mark.gpu(1)
 @pytest.mark.model("openai/gpt-oss-20b")
 @pytest.mark.gateway(extra_args=["--history-backend", "memory"])
 @pytest.mark.parametrize("setup_backend", ["grpc"], indirect=True)

@@ -108,6 +108,7 @@ def pytest_runtest_logstart(nodeid: str, location: tuple) -> None:
     pytest_collection_modifyitems,
     pytest_configure,
     pytest_runtest_setup,
+    pytest_sessionfinish,
     setup_backend,
 )
 from smg_client import SmgClient
@@ -151,6 +152,7 @@ def api_client(request, setup_backend):
     "pytest_runtest_setup",
     "pytest_collection_modifyitems",
     "pytest_configure",
+    "pytest_sessionfinish",
     # Fixtures
     "setup_backend",
     "backend_router",

@@ -12,6 +12,7 @@
     pytest_collection_modifyitems,
     pytest_configure,
     pytest_runtest_setup,
+    pytest_sessionfinish,
 )
 
 # Marker helpers
@@ -25,6 +26,7 @@
     "pytest_collection_modifyitems",
     "pytest_configure",
     "pytest_runtest_setup",
+    "pytest_sessionfinish",
     # Backend fixtures
     "setup_backend",
     "backend_router",

@@ -3,14 +3,18 @@
 This module handles:
 - Marker registration: Defining custom pytest markers
 - Test filtering: Env-var-based filtering by engine, vendor, and GPU tier
+- Test ordering: Cluster items by ``(backend, model)`` so the
+  session-scoped worker pool (``infra.worker_pool``) doesn't have to
+  evict-and-restart between consecutive classes that share a backend.
+- Session teardown: Stop all pooled workers on session finish.
 """
 
 from __future__ import annotations
 
 import os
 
 import pytest
-from infra import get_runtime
+from infra import cleanup_pool, get_runtime
 
 from .markers import resolve_class_marker
 
@@ -110,32 +114,82 @@ def pytest_collection_modifyitems(
     config: pytest.Config,
     items: list[pytest.Item],
 ) -> None:
-    """Filter collected tests based on E2E_ENGINE, E2E_VENDOR, and E2E_GPU_TIER env vars."""
+    """Filter + order collected tests.
+
+    Filtering: env vars ``E2E_ENGINE``, ``E2E_VENDOR``, ``E2E_GPU_TIER``
+    select the matching slice when set.
+
+    Ordering: items are sorted by ``(backend, model)`` so consecutive
+    classes that share a backend cluster together. This is what lets
+    ``infra.worker_pool`` reuse a single worker across many test classes
+    instead of evicting on every boundary.
+    """
     engine = os.environ.get("E2E_ENGINE") or None
     vendor = os.environ.get("E2E_VENDOR") or None
     gpu_tier = os.environ.get("E2E_GPU_TIER") or None
 
-    if not any([engine, vendor, gpu_tier]):
-        return
-
-    selected: list[pytest.Item] = []
-    for item in items:
-        # Filter by engine
-        if engine:
-            engine_marker = _get_marker(item, "engine")
-            if not engine_marker or engine not in engine_marker.args:
-                continue
-        # Filter by vendor
-        if vendor:
-            vendor_marker = _get_marker(item, "vendor")
-            if not vendor_marker or vendor not in vendor_marker.args:
-                continue
-        # Filter by GPU tier
-        if gpu_tier is not None:
-            gpu_marker = _get_marker(item, "gpu")
-            gpu_count = gpu_marker.args[0] if gpu_marker else 1
-            if str(gpu_count) != gpu_tier:
-                continue
-        selected.append(item)
-
-    items[:] = selected
+    if any([engine, vendor, gpu_tier]):
+        selected: list[pytest.Item] = []
+        for item in items:
+            # Filter by engine
+            if engine:
+                engine_marker = _get_marker(item, "engine")
+                if not engine_marker or engine not in engine_marker.args:
+                    continue
+            # Filter by vendor
+            if vendor:
+                vendor_marker = _get_marker(item, "vendor")
+                if not vendor_marker or vendor not in vendor_marker.args:
+                    continue
+            # Filter by GPU tier
+            if gpu_tier is not None:
+                gpu_marker = _get_marker(item, "gpu")
+                gpu_count = gpu_marker.args[0] if gpu_marker else 1
+                if str(gpu_count) != gpu_tier:
+                    continue
+            selected.append(item)
+        items[:] = selected
+
+    items.sort(key=_pool_sort_key)
+
+
+def _pool_sort_key(item: pytest.Item) -> tuple:
+    """Sort key clustering items that would share a pool entry.
+
+    Primary: backend parametrize value — ``setup_backend`` for class-scope
+    fixtures, falling back to ``backend_router`` for function-scope ones.
+    Different backends mean different pool keys.
+    Secondary: ``@pytest.mark.model`` value if set, else empty — same
+    backend with the same model is the cache-hit case.
+    Tertiary: ``item.nodeid`` for stability across collections.
+    """
+    backend = ""
+    callspec = getattr(item, "callspec", None)
+    if callspec is not None:
+        params = getattr(callspec, "params", {}) or {}
+        backend = str(params.get("setup_backend", params.get("backend_router", "")))
+
+    model_marker = resolve_class_marker(item, "model")
+    model = ""
+    if model_marker is not None and model_marker.args:
+        model = str(model_marker.args[0])
+
+    return (backend, model, item.nodeid)
+
+
+# ---------------------------------------------------------------------------
+# Session teardown — stop pooled workers
+# ---------------------------------------------------------------------------
+
+
+def pytest_sessionfinish(
+    session: pytest.Session,  # noqa: ARG001
+    exitstatus: int,  # noqa: ARG001
+) -> None:
+    """Tear down any workers held by the session-scoped pool.
+
+    The pool also has an ``atexit`` handler for cases where pytest exits
+    abnormally before this hook runs (SIGINT, ``pytest.exit``), but the
+    explicit hook is cheaper and gives clean log output during normal runs.
+    """
+    cleanup_pool()