Skip to content

Commit ff048ff

Browse files
SS-JIA and claude
committed
CI: diagnose AOTI hang on macOS — isolated test with native stack sampling
Summary: The macOS unittest job has been timing out since the PyTorch pin was updated to 2.12. Three CI runs showed 38-42 minutes of complete silence after ~55% test completion, with faulthandler unable to fire (confirming the hang is in native C/C++ code, not Python). The last tests before the silence are in examples/models/llama/tests/; the next tests in collection order are the llama3_2_vision AOTI tests.

This commit isolates the diagnosis:
- Disables all CI jobs except the macOS unittest job
- Replaces the full test suite with a single AOTI test (test_llama3_2_text_decoder_aoti) run as an instrumented script
- Prints timestamps before/after each step (export, compile, load, run) to identify which AOTI step hangs
- Runs a background watchdog that uses macOS `sample` to capture native C/C++ call stacks every 60s, since faulthandler cannot see into native code that holds the GIL
- Sets PYTHONUNBUFFERED=1 to prevent pipe buffering

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 88faab2 commit ff048ff

2 files changed

Lines changed: 138 additions & 51 deletions

File tree

.ci/scripts/unittest-macos-cmake.sh

Lines changed: 133 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -6,14 +6,139 @@
66
# LICENSE file in the root directory of this source tree.
77
set -eux
88

9-
# Keep AOTInductor precompiled headers scoped to this job. The default cache
10-
# location can persist across macOS self-hosted runner jobs and produce stale
11-
# PCH failures after PyTorch is reinstalled.
9+
# =============================================================================
10+
# AOTI HANG DIAGNOSIS
11+
#
12+
# Run a single AOTI test that is known to hang on macOS CI. A background
13+
# watchdog samples the native call stack every 60 s so we can see exactly
14+
# which C/C++ function the thread is blocked in (faulthandler only shows
15+
# Python frames and cannot fire when the GIL is held by native code).
16+
# =============================================================================
17+
1218
export TORCHINDUCTOR_CACHE_DIR="$(mktemp -d "${RUNNER_TEMP:-/tmp}/torchinductor_cache_XXXXXX")"
1319
trap 'rm -rf "${TORCHINDUCTOR_CACHE_DIR}"' EXIT
1420

15-
# Run pytest with coverage
16-
${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml
17-
# Run gtest
18-
LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \
19-
${CONDA_RUN} test/run_oss_cpp_tests.sh
21+
# Force unbuffered output so every print appears immediately in the CI log.
22+
export PYTHONUNBUFFERED=1
23+
24+
# ---------- instrumented test wrapper ----------
25+
cat > /tmp/aoti_diag.py << 'PYEOF'
26+
"""Run a single AOTI test with step-by-step timing instrumentation."""
27+
import json, os, sys, tempfile, time
28+
29+
def log(msg):
30+
elapsed = time.time() - _t0
31+
print(f"[AOTI-DIAG +{elapsed:7.1f}s] {msg}", flush=True)
32+
33+
_t0 = time.time()
34+
log("start")
35+
36+
import torch
37+
log(f"torch {torch.__version__} loaded")
38+
39+
from executorch.examples.models.llama3_2_vision.text_decoder.model import Llama3_2Decoder
40+
log("Llama3_2Decoder imported")
41+
42+
params = {
43+
"dim": 2048,
44+
"ffn_dim_multiplier": 1.3,
45+
"fusion_interval": 2,
46+
"intermediate_dim": 14336,
47+
"multiple_of": 1024,
48+
"n_heads": 32,
49+
"n_kv_heads": 8,
50+
"n_layers": 2,
51+
"n_special_tokens": 8,
52+
"norm_eps": 1e-05,
53+
"rope_theta": 500000.0,
54+
"use_scaled_rope": True,
55+
"vision_chunk_size": 560,
56+
"vision_max_num_chunks": 4,
57+
"vocab_size": 21008,
58+
"vision_num_cross_attention_layers": 1,
59+
}
60+
61+
with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as f:
62+
json.dump(params, f, indent=2); f.flush()
63+
model = Llama3_2Decoder(
64+
encoder_max_seq_len=6404,
65+
generate_full_logits=True,
66+
enable_dynamic_shape=True,
67+
use_kv_cache=True,
68+
params=f.name,
69+
dtype=torch.float32,
70+
)
71+
log("model constructed")
72+
73+
encoder = model.get_eager_model().eval()
74+
for p in encoder.parameters():
75+
p.requires_grad_(False)
76+
log("model eval + no_grad")
77+
78+
example_inputs = model.get_example_inputs()
79+
example_kwargs = model.get_example_kwarg_inputs()
80+
81+
# Step 1: torch.export
82+
log("step 1/4: torch.export.export ...")
83+
t = time.time()
84+
with torch.no_grad(), torch.inference_mode():
85+
ep = torch.export.export(encoder, example_inputs, kwargs=example_kwargs, strict=True)
86+
log(f"step 1/4: torch.export.export done ({time.time()-t:.1f}s)")
87+
88+
# Step 2: aoti_compile_and_package
89+
tmpdir = tempfile.mkdtemp()
90+
pkg_path = os.path.join(tmpdir, "text_decoder.pt2")
91+
log(f"step 2/4: aoti_compile_and_package -> {pkg_path} ...")
92+
t = time.time()
93+
path = torch._inductor.aoti_compile_and_package(ep, package_path=pkg_path)
94+
log(f"step 2/4: aoti_compile_and_package done ({time.time()-t:.1f}s)")
95+
96+
# Step 3: aoti_load_package
97+
log("step 3/4: aoti_load_package ...")
98+
t = time.time()
99+
encoder_aoti = torch._inductor.aoti_load_package(path)
100+
log(f"step 3/4: aoti_load_package done ({time.time()-t:.1f}s)")
101+
102+
# Step 4: inference
103+
log("step 4/4: inference ...")
104+
t = time.time()
105+
y = encoder_aoti(*example_inputs, **example_kwargs)
106+
log(f"step 4/4: inference done ({time.time()-t:.1f}s)")
107+
108+
# Verify
109+
eager_res = encoder.forward(*example_inputs, **example_kwargs)
110+
torch.testing.assert_close(y, eager_res, rtol=1e-4, atol=1e-4)
111+
log("PASS — results match")
112+
PYEOF
113+
114+
# ---------- run with background watchdog ----------
115+
# Start the test
116+
${CONDA_RUN} --no-capture-output python /tmp/aoti_diag.py &
117+
TEST_PID=$!
118+
119+
# Watchdog: every 60s, if the test is still running, sample the native stack.
120+
(
121+
while kill -0 "$TEST_PID" 2>/dev/null; do
122+
sleep 60
123+
if kill -0 "$TEST_PID" 2>/dev/null; then
124+
echo ""
125+
echo "===== WATCHDOG: native stack sample ($(date)) ====="
126+
# sample captures C/C++ call stacks on macOS
127+
sample "$TEST_PID" 1 2>&1 | head -200 || true
128+
echo "===== END WATCHDOG ====="
129+
echo ""
130+
fi
131+
done
132+
) &
133+
WATCHDOG_PID=$!
134+
135+
# Wait for test, propagate exit code
136+
wait "$TEST_PID"
137+
EXIT_CODE=$?
138+
139+
# Clean up watchdog
140+
kill "$WATCHDOG_PID" 2>/dev/null || true
141+
wait "$WATCHDOG_PID" 2>/dev/null || true
142+
143+
echo "Test exited with code $EXIT_CODE"
144+
exit $EXIT_CODE

.github/workflows/_unittest.yml

Lines changed: 5 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -26,20 +26,11 @@ on:
2626
default: '3.10'
2727

2828
jobs:
29-
linux:
30-
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
31-
permissions:
32-
id-token: write
33-
contents: read
34-
with:
35-
runner: linux.2xlarge.memory
36-
docker-image: ${{ inputs.docker-image }}
37-
submodules: 'recursive'
38-
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
39-
timeout: 90
40-
script: |
41-
set -eux
42-
.ci/scripts/unittest-linux.sh --build-tool "${{ inputs.build-tool }}" --build-mode "${{ inputs.build-mode }}" --editable "${{ inputs.editable }}"
29+
# linux and windows disabled for AOTI hang diagnosis
30+
# linux:
31+
# ...
32+
# windows:
33+
# ...
4334

4435
macos:
4536
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
@@ -54,32 +45,3 @@ jobs:
5445
# This is needed to get the prebuilt PyTorch wheel from S3
5546
${CONDA_RUN} --no-capture-output pip install awscli==1.37.21
5647
.ci/scripts/unittest-macos.sh --build-tool "${{ inputs.build-tool }}" --build-mode "${{ inputs.build-mode }}" --editable "${{ inputs.editable }}"
57-
58-
windows:
59-
if: ${{ inputs.build-tool == 'cmake' }}
60-
uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
61-
with:
62-
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
63-
timeout: 120
64-
script: |
65-
git config --global http.sslBackend openssl
66-
git submodule update --init --recursive
67-
conda init powershell
68-
69-
powershell -Command "& {
70-
Set-PSDebug -Trace 1
71-
\$ErrorActionPreference = 'Stop'
72-
\$PSNativeCommandUseErrorActionPreference = \$true
73-
74-
.ci/scripts/setup-windows.ps1 -editable "${{ inputs.editable }}"
75-
if (\$LASTEXITCODE -ne 0) {
76-
Write-Host "Setup failed. Exit code: \$LASTEXITCODE."
77-
exit \$LASTEXITCODE
78-
}
79-
80-
.ci/scripts/unittest-windows.ps1 -buildMode "${{ inputs.build-mode }}"
81-
if (\$LASTEXITCODE -ne 0) {
82-
Write-Host "Unit tests failed. Exit code: \$LASTEXITCODE."
83-
exit \$LASTEXITCODE
84-
}
85-
}"

0 commit comments

Comments
 (0)