def _github_actions_escape(value: object) -> str:
    """Return ``str(value)`` encoded for a GitHub Actions workflow-command message.

    ``%`` is rewritten to ``%25``, carriage return to ``%0D`` and line feed to
    ``%0A``, so the payload stays on one line and literal percent signs are
    not confused with escape sequences.
    """
    escapes = str.maketrans({"%": "%25", "\r": "%0D", "\n": "%0A"})
    return str(value).translate(escapes)


def _emit_resource_failure_summary(
    results: list[_ps.JobResult],
    *,
    emit_annotations: bool = True,
    heading: str = "Resource phase failed",
) -> None:
    """Print a summary of every Resource child job that exited non-zero.

    Nothing is printed when all ``results`` have ``returncode == 0``.
    Otherwise a banner line with ``heading`` and the failure count is
    printed, followed by one entry per failed job: its nodeid, label,
    return code, device ids and duration, plus a pointer to the child's
    output group.

    Args:
        results: Finished child jobs; entries with ``returncode == 0`` are ignored.
        emit_annotations: When true, each failed job is preceded by a
            ``::error`` workflow command whose message is escaped with
            ``_github_actions_escape``.
        heading: Text used in the banner line.
    """

    def _say(text: str) -> None:
        # Every summary line is flushed as soon as it is written.
        print(text, flush=True)

    failures = [job for job in results if job.returncode != 0]
    if failures:
        _say(f"\n*** {heading}: {len(failures)} child job(s) ***")

    for job in failures:
        device_csv = ",".join(map(str, job.device_ids))
        node = job.nodeid or ""  # jobs created without a nodeid report an empty one
        if emit_annotations:
            payload = _github_actions_escape(f"{node} ({job.label}) rc={job.returncode} devices=[{device_csv}]")
            _say(f"::error title=Resource phase failed::{payload}")

        _say(f"- nodeid={node}")
        _say(f" label={job.label}")
        _say(f" rc={job.returncode} devices={job.device_ids} duration={job.duration_s:.1f}s")
        _say(" full output is in the Resource child group above")
@@ -791,6 +824,7 @@ def _dispatch_test_phases(session, resource_specs): # noqa: PLR0912 # ----- Phase 1: Resource (L3 classes + standalone resource functions) ----- resource_failed = False + resource_results: list[_ps.JobResult] = [] if resource_specs: jobs = [] for spec in resource_specs: @@ -828,18 +862,20 @@ def _build(ids, _nodeid=spec.nodeid, _rt=spec.runtime, _kind=spec.kind): device_count=spec.device_count, build_cmd=_build, cwd=str(cwd), + nodeid=spec.nodeid, ) ) def _on_done(res): tag = "PASS" if res.returncode == 0 else f"FAIL rc={res.returncode}" - header = f"{res.label} [{tag} {res.duration_s:.1f}s, devices={res.device_ids}]" + nodeid = res.nodeid or "" + header = f"{res.label} nodeid={nodeid} [{tag} {res.duration_s:.1f}s, devices={res.device_ids}]" _emit_group(header, res.output) if res.returncode != 0: # Out-of-group summary so a reviewer scanning the collapsed # log still sees the failure without having to expand. print( - f"*** FAIL: {res.label} (devices={res.device_ids}) — expand group above ***", + f"*** FAIL: {nodeid} ({res.label}, devices={res.device_ids}) — expand group above ***", flush=True, ) @@ -859,7 +895,10 @@ def _on_done(res): print(f"\n*** Resource phase ABORTED: {e} ***\n", flush=True) session.testsfailed = 1 return True + resource_results = results resource_failed = any(r.returncode != 0 for r in results) + if resource_failed: + _emit_resource_failure_summary(results) if any(r.returncode == TIMEOUT_EXIT_CODE for r in results): print("\n*** Resource phase: TIMED OUT ***\n", flush=True) os._exit(TIMEOUT_EXIT_CODE) @@ -940,6 +979,13 @@ def _on_done(res): if fail_fast: break + if resource_failed: + _emit_resource_failure_summary( + resource_results, + emit_annotations=False, + heading="Resource phase failed recap", + ) + session.testsfailed = 1 if (resource_failed or l2_failed) else 0 if not (resource_failed or l2_failed): session.testscollected = sum(1 for _ in session.items) diff --git a/simpler_setup/parallel_scheduler.py 
b/simpler_setup/parallel_scheduler.py index fcbdb8393..734d65815 100644 --- a/simpler_setup/parallel_scheduler.py +++ b/simpler_setup/parallel_scheduler.py @@ -48,6 +48,7 @@ class Job: build_cmd: Callable[[list[int]], list[str]] # Given allocated ids → argv cwd: str | None = None env: dict | None = None + nodeid: str | None = None # pytest nodeid, when the caller has one @dataclass @@ -57,6 +58,7 @@ class JobResult: device_ids: list[int] output: str = "" # Captured combined stdout+stderr duration_s: float = 0.0 + nodeid: str | None = None @dataclass @@ -270,6 +272,7 @@ def _reap_one() -> JobResult | None: device_ids=rj.device_ids, output="".join(rj.output_lines), duration_s=duration, + nodeid=rj.job.nodeid, ) state.results.append(res) if rc != 0: @@ -328,6 +331,7 @@ def _reap_one() -> JobResult | None: device_ids=rj.device_ids, output="".join(rj.output_lines), duration_s=duration, + nodeid=rj.job.nodeid, ) ) _active_state = None diff --git a/tests/ut/py/test_resource_failure_summary.py b/tests/ut/py/test_resource_failure_summary.py new file mode 100644 index 000000000..5a2ac7246 --- /dev/null +++ b/tests/ut/py/test_resource_failure_summary.py @@ -0,0 +1,101 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
# -----------------------------------------------------------------------------------------------------------
"""Tests for Resource phase failure summaries emitted by the root conftest."""

from __future__ import annotations

import importlib.util
from pathlib import Path

from simpler_setup.parallel_scheduler import JobResult

# This file sits three directory levels below the repository root.
_ROOT = Path(__file__).resolve().parents[3]


def _load_root_conftest():
    """Execute the repository-root ``conftest.py`` as a standalone module and return it."""
    conftest_path = _ROOT / "conftest.py"
    spec = importlib.util.spec_from_file_location("_root_conftest_resource_summary", conftest_path)
    assert spec is not None and spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def test_emit_resource_failure_summary_prints_nodeid_and_annotation(capsys):
    """The default summary names the failed job, annotates it and omits child output."""
    failing_nodeid = "tests/st/runtime_fatal_codes/test_probe.py::test_bad[param]"
    child_log = "".join(
        (
            "line1\n",
            "E RuntimeError: run_prepared failed with code 507018\n",
            "PTO2 runtime failed: orch_error_code=0 sched_error_code=100 runtime_status=-100\n",
            "PTO2 scheduler timeout sub_class=S1:running-stalled\n",
        )
    )
    passed = JobResult(
        label="standalone pass",
        returncode=0,
        device_ids=[0],
        output="pass output\n",
        duration_s=1.0,
    )
    crashed = JobResult(
        label="standalone bad%case\nname",
        returncode=-11,
        device_ids=[4, 5],
        output=child_log,
        duration_s=12.34,
        nodeid=failing_nodeid,
    )

    _load_root_conftest()._emit_resource_failure_summary([passed, crashed])
    out = capsys.readouterr().out

    must_appear = (
        "*** Resource phase failed: 1 child job(s) ***",
        # Only the annotation payload is escaped; the plain lines keep the raw label.
        f"::error title=Resource phase failed::{failing_nodeid} (standalone bad%25case%0Aname) rc=-11 devices=[4,5]",
        f"- nodeid={failing_nodeid}",
        "label=standalone bad%case\nname",
        "rc=-11 devices=[4, 5] duration=12.3s",
        "full output is in the Resource child group above",
    )
    must_not_appear = (
        "hint:",
        "line1",
        "RuntimeError: run_prepared failed with code 507018",
        "PTO2 runtime failed: orch_error_code=0 sched_error_code=100 runtime_status=-100",
        "PTO2 scheduler timeout sub_class=S1:running-stalled",
        "standalone pass",
    )
    for fragment in must_appear:
        assert fragment in out
    for fragment in must_not_appear:
        assert fragment not in out


def test_emit_resource_failure_summary_can_emit_compact_recap(capsys):
    """With annotations disabled and a custom heading, only the plain recap is printed."""
    failed_job = JobResult(
        label="standalone failed",
        returncode=2,
        device_ids=[7],
        output="hidden tail\n",
        duration_s=3.0,
        nodeid="tests/st/test_failed.py::test_failed",
    )

    _load_root_conftest()._emit_resource_failure_summary(
        [failed_job],
        emit_annotations=False,
        heading="Resource phase failed recap",
    )
    out = capsys.readouterr().out

    must_appear = (
        "*** Resource phase failed recap: 1 child job(s) ***",
        "nodeid=tests/st/test_failed.py::test_failed",
        "label=standalone failed",
        "rc=2 devices=[7] duration=3.0s",
        "full output is in the Resource child group above",
    )
    for fragment in must_appear:
        assert fragment in out
    for fragment in ("::error", "hidden tail"):
        assert fragment not in out