Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 48 additions & 2 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,39 @@ def _emit_group(header: str, body: str) -> None:
print("::endgroup::", flush=True)


def _github_actions_escape(value: object) -> str:
    """Return ``str(value)`` made safe for a GitHub Actions workflow-command message.

    Three characters are percent-encoded: ``%`` becomes ``%25``, carriage
    return becomes ``%0D`` and line feed becomes ``%0A``. Every other
    character is passed through unchanged.
    """
    # A translation table rewrites each character exactly once, so the "%"
    # introduced by the CR/LF replacements is never escaped a second time.
    escape_table = str.maketrans({"%": "%25", "\r": "%0D", "\n": "%0A"})
    return str(value).translate(escape_table)


def _emit_resource_failure_summary(
    results: list[_ps.JobResult],
    *,
    emit_annotations: bool = True,
    heading: str = "Resource phase failed",
) -> None:
    """Print a compact, un-grouped report of the Resource child jobs that failed.

    A job counts as failed when its ``returncode`` is non-zero. If nothing
    failed, the function prints nothing.

    Args:
        results: Finished job results to inspect.
        emit_annotations: When true, each failure additionally gets an
            ``::error`` workflow-command line whose message is escaped with
            ``_github_actions_escape``.
        heading: Text used in the ``*** ... ***`` banner line.

    The captured child output (``res.output``) is intentionally not printed;
    the last line of each record points the reader back to the child group.
    """
    failures = [job for job in results if job.returncode != 0]
    if len(failures) == 0:
        return

    print(f"\n*** {heading}: {len(failures)} child job(s) ***", flush=True)
    for job in failures:
        device_csv = ",".join(map(str, job.device_ids))
        nodeid = job.nodeid if job.nodeid else "<unknown>"

        if emit_annotations:
            raw_message = f"{nodeid} ({job.label}) rc={job.returncode} devices=[{device_csv}]"
            payload = _github_actions_escape(raw_message)
            print(f"::error title=Resource phase failed::{payload}", flush=True)

        # Plain-text record: the label is printed unescaped here, unlike in
        # the annotation line above.
        print(f"- nodeid={nodeid}", flush=True)
        print(f" label={job.label}", flush=True)
        print(f" rc={job.returncode} devices={job.device_ids} duration={job.duration_s:.1f}s", flush=True)
        print(" full output is in the Resource child group above", flush=True)


def _dispatch_test_phases(session, resource_specs): # noqa: PLR0912
"""Run Resource → L2 phases.

Expand All @@ -791,6 +824,7 @@ def _dispatch_test_phases(session, resource_specs): # noqa: PLR0912

# ----- Phase 1: Resource (L3 classes + standalone resource functions) -----
resource_failed = False
resource_results: list[_ps.JobResult] = []
if resource_specs:
jobs = []
for spec in resource_specs:
Expand Down Expand Up @@ -828,18 +862,20 @@ def _build(ids, _nodeid=spec.nodeid, _rt=spec.runtime, _kind=spec.kind):
device_count=spec.device_count,
build_cmd=_build,
cwd=str(cwd),
nodeid=spec.nodeid,
)
)

def _on_done(res):
tag = "PASS" if res.returncode == 0 else f"FAIL rc={res.returncode}"
header = f"{res.label} [{tag} {res.duration_s:.1f}s, devices={res.device_ids}]"
nodeid = res.nodeid or "<unknown>"
header = f"{res.label} nodeid={nodeid} [{tag} {res.duration_s:.1f}s, devices={res.device_ids}]"
_emit_group(header, res.output)
if res.returncode != 0:
# Out-of-group summary so a reviewer scanning the collapsed
# log still sees the failure without having to expand.
print(
f"*** FAIL: {res.label} (devices={res.device_ids}) — expand group above ***",
f"*** FAIL: {nodeid} ({res.label}, devices={res.device_ids}) — expand group above ***",
flush=True,
)

Expand All @@ -859,7 +895,10 @@ def _on_done(res):
print(f"\n*** Resource phase ABORTED: {e} ***\n", flush=True)
session.testsfailed = 1
return True
resource_results = results
resource_failed = any(r.returncode != 0 for r in results)
if resource_failed:
_emit_resource_failure_summary(results)
if any(r.returncode == TIMEOUT_EXIT_CODE for r in results):
print("\n*** Resource phase: TIMED OUT ***\n", flush=True)
os._exit(TIMEOUT_EXIT_CODE)
Expand Down Expand Up @@ -940,6 +979,13 @@ def _on_done(res):
if fail_fast:
break

if resource_failed:
_emit_resource_failure_summary(
resource_results,
emit_annotations=False,
heading="Resource phase failed recap",
)

session.testsfailed = 1 if (resource_failed or l2_failed) else 0
if not (resource_failed or l2_failed):
session.testscollected = sum(1 for _ in session.items)
Expand Down
4 changes: 4 additions & 0 deletions simpler_setup/parallel_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class Job:
build_cmd: Callable[[list[int]], list[str]] # Given allocated ids → argv
cwd: str | None = None
env: dict | None = None
nodeid: str | None = None # pytest nodeid, when the caller has one


@dataclass
Expand All @@ -57,6 +58,7 @@ class JobResult:
device_ids: list[int]
output: str = "" # Captured combined stdout+stderr
duration_s: float = 0.0
nodeid: str | None = None


@dataclass
Expand Down Expand Up @@ -270,6 +272,7 @@ def _reap_one() -> JobResult | None:
device_ids=rj.device_ids,
output="".join(rj.output_lines),
duration_s=duration,
nodeid=rj.job.nodeid,
)
state.results.append(res)
if rc != 0:
Expand Down Expand Up @@ -328,6 +331,7 @@ def _reap_one() -> JobResult | None:
device_ids=rj.device_ids,
output="".join(rj.output_lines),
duration_s=duration,
nodeid=rj.job.nodeid,
)
)
_active_state = None
Expand Down
101 changes: 101 additions & 0 deletions tests/ut/py/test_resource_failure_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""Tests for Resource phase failure summaries emitted by the root conftest."""

from __future__ import annotations

import importlib.util
from pathlib import Path

from simpler_setup.parallel_scheduler import JobResult

_ROOT = Path(__file__).resolve().parents[3]


def _load_root_conftest():
    """Import ``conftest.py`` from ``_ROOT`` as a standalone module and return it.

    The module is created under the private name
    ``_root_conftest_resource_summary`` and executed on every call; this
    function does not add it to ``sys.modules`` itself.
    """
    conftest_path = _ROOT / "conftest.py"
    module_spec = importlib.util.spec_from_file_location("_root_conftest_resource_summary", conftest_path)
    assert module_spec is not None and module_spec.loader is not None
    conftest_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(conftest_module)
    return conftest_module


def test_emit_resource_failure_summary_prints_nodeid_and_annotation(capsys):
    """Default summary: banner, escaped ``::error`` line and a compact record per failure.

    One passing and one failing job are supplied. Only the failing job may
    appear in the output, and none of its captured child output may leak
    into the summary.
    """
    conftest_mod = _load_root_conftest()
    passing_job = JobResult(
        label="standalone pass",
        returncode=0,
        device_ids=[0],
        output="pass output\n",
        duration_s=1.0,
    )
    failing_job = JobResult(
        # "%" and "\n" in the label exercise the workflow-command escaping.
        label="standalone bad%case\nname",
        returncode=-11,
        device_ids=[4, 5],
        output=(
            "line1\n"
            "E RuntimeError: run_prepared failed with code 507018\n"
            "PTO2 runtime failed: orch_error_code=0 sched_error_code=100 runtime_status=-100\n"
            "PTO2 scheduler timeout sub_class=S1:running-stalled\n"
        ),
        duration_s=12.34,
        nodeid="tests/st/runtime_fatal_codes/test_probe.py::test_bad[param]",
    )

    conftest_mod._emit_resource_failure_summary([passing_job, failing_job])

    captured = capsys.readouterr().out

    expected_fragments = (
        "*** Resource phase failed: 1 child job(s) ***",
        (
            "::error title=Resource phase failed::"
            "tests/st/runtime_fatal_codes/test_probe.py::test_bad[param] "
            "(standalone bad%25case%0Aname) rc=-11 devices=[4,5]"
        ),
        "- nodeid=tests/st/runtime_fatal_codes/test_probe.py::test_bad[param]",
        "label=standalone bad%case\nname",
        "rc=-11 devices=[4, 5] duration=12.3s",
        "full output is in the Resource child group above",
    )
    for fragment in expected_fragments:
        assert fragment in captured

    forbidden_fragments = (
        "hint:",
        "line1",
        "RuntimeError: run_prepared failed with code 507018",
        "PTO2 runtime failed: orch_error_code=0 sched_error_code=100 runtime_status=-100",
        "PTO2 scheduler timeout sub_class=S1:running-stalled",
        "standalone pass",
    )
    for fragment in forbidden_fragments:
        assert fragment not in captured


def test_emit_resource_failure_summary_can_emit_compact_recap(capsys):
    """Recap mode: custom heading, no ``::error`` annotation, same compact record."""
    conftest_mod = _load_root_conftest()
    failed_job = JobResult(
        label="standalone failed",
        returncode=2,
        device_ids=[7],
        output="hidden tail\n",
        duration_s=3.0,
        nodeid="tests/st/test_failed.py::test_failed",
    )

    conftest_mod._emit_resource_failure_summary(
        [failed_job],
        emit_annotations=False,
        heading="Resource phase failed recap",
    )

    captured = capsys.readouterr().out

    expected_fragments = (
        "*** Resource phase failed recap: 1 child job(s) ***",
        "nodeid=tests/st/test_failed.py::test_failed",
        "label=standalone failed",
        "rc=2 devices=[7] duration=3.0s",
        "full output is in the Resource child group above",
    )
    for fragment in expected_fragments:
        assert fragment in captured

    # Neither the annotation command nor the captured child output may appear.
    for fragment in ("::error", "hidden tail"):
        assert fragment not in captured
Loading