|
6 | 6 | # LICENSE file in the root directory of this source tree. |
7 | 7 | set -eux |
8 | 8 |
|
9 | | -# Keep AOTInductor precompiled headers scoped to this job. The default cache |
10 | | -# location can persist across macOS self-hosted runner jobs and produce stale |
11 | | -# PCH failures after PyTorch is reinstalled. |
| 9 | +# ============================================================================= |
| 10 | +# AOTI HANG DIAGNOSIS |
| 11 | +# |
| 12 | +# Run a single AOTI test that is known to hang on macOS CI. A background |
| 13 | +# watchdog samples the native call stack every 60 s so we can see exactly |
| 14 | +# which C/C++ function the thread is blocked in (faulthandler only shows |
| 15 | +# Python frames and cannot fire when the GIL is held by native code). |
| 16 | +# ============================================================================= |
| 17 | + |
12 | 18 | export TORCHINDUCTOR_CACHE_DIR="$(mktemp -d "${RUNNER_TEMP:-/tmp}/torchinductor_cache_XXXXXX")" |
13 | 19 | trap 'rm -rf "${TORCHINDUCTOR_CACHE_DIR}"' EXIT |
14 | 20 |
|
15 | | -# Run pytest with coverage |
16 | | -${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml |
17 | | -# Run gtest |
18 | | -LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \ |
19 | | -${CONDA_RUN} test/run_oss_cpp_tests.sh |
| 21 | +# Force unbuffered output so every print appears immediately in the CI log. |
| 22 | +export PYTHONUNBUFFERED=1 |
| 23 | + |
| 24 | +# ---------- instrumented test wrapper ---------- |
| 25 | +cat > /tmp/aoti_diag.py << 'PYEOF' |
| 26 | +"""Run a single AOTI test with step-by-step timing instrumentation.""" |
| 27 | +import json, os, sys, tempfile, time |
| 28 | +
|
| 29 | +def log(msg): |
| 30 | + elapsed = time.time() - _t0 |
| 31 | + print(f"[AOTI-DIAG +{elapsed:7.1f}s] {msg}", flush=True) |
| 32 | +
|
| 33 | +_t0 = time.time() |
| 34 | +log("start") |
| 35 | +
|
| 36 | +import torch |
| 37 | +log(f"torch {torch.__version__} loaded") |
| 38 | +
|
| 39 | +from executorch.examples.models.llama3_2_vision.text_decoder.model import Llama3_2Decoder |
| 40 | +log("Llama3_2Decoder imported") |
| 41 | +
|
| 42 | +params = { |
| 43 | + "dim": 2048, |
| 44 | + "ffn_dim_multiplier": 1.3, |
| 45 | + "fusion_interval": 2, |
| 46 | + "intermediate_dim": 14336, |
| 47 | + "multiple_of": 1024, |
| 48 | + "n_heads": 32, |
| 49 | + "n_kv_heads": 8, |
| 50 | + "n_layers": 2, |
| 51 | + "n_special_tokens": 8, |
| 52 | + "norm_eps": 1e-05, |
| 53 | + "rope_theta": 500000.0, |
| 54 | + "use_scaled_rope": True, |
| 55 | + "vision_chunk_size": 560, |
| 56 | + "vision_max_num_chunks": 4, |
| 57 | + "vocab_size": 21008, |
| 58 | + "vision_num_cross_attention_layers": 1, |
| 59 | +} |
| 60 | +
|
| 61 | +with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as f: |
| 62 | + json.dump(params, f, indent=2); f.flush() |
| 63 | + model = Llama3_2Decoder( |
| 64 | + encoder_max_seq_len=6404, |
| 65 | + generate_full_logits=True, |
| 66 | + enable_dynamic_shape=True, |
| 67 | + use_kv_cache=True, |
| 68 | + params=f.name, |
| 69 | + dtype=torch.float32, |
| 70 | + ) |
| 71 | +log("model constructed") |
| 72 | +
|
| 73 | +encoder = model.get_eager_model().eval() |
| 74 | +for p in encoder.parameters(): |
| 75 | + p.requires_grad_(False) |
| 76 | +log("model eval + no_grad") |
| 77 | +
|
| 78 | +example_inputs = model.get_example_inputs() |
| 79 | +example_kwargs = model.get_example_kwarg_inputs() |
| 80 | +
|
| 81 | +# Step 1: torch.export |
| 82 | +log("step 1/4: torch.export.export ...") |
| 83 | +t = time.time() |
| 84 | +with torch.no_grad(), torch.inference_mode(): |
| 85 | + ep = torch.export.export(encoder, example_inputs, kwargs=example_kwargs, strict=True) |
| 86 | +log(f"step 1/4: torch.export.export done ({time.time()-t:.1f}s)") |
| 87 | +
|
| 88 | +# Step 2: aoti_compile_and_package |
| 89 | +tmpdir = tempfile.mkdtemp() |
| 90 | +pkg_path = os.path.join(tmpdir, "text_decoder.pt2") |
| 91 | +log(f"step 2/4: aoti_compile_and_package -> {pkg_path} ...") |
| 92 | +t = time.time() |
| 93 | +path = torch._inductor.aoti_compile_and_package(ep, package_path=pkg_path) |
| 94 | +log(f"step 2/4: aoti_compile_and_package done ({time.time()-t:.1f}s)") |
| 95 | +
|
| 96 | +# Step 3: aoti_load_package |
| 97 | +log("step 3/4: aoti_load_package ...") |
| 98 | +t = time.time() |
| 99 | +encoder_aoti = torch._inductor.aoti_load_package(path) |
| 100 | +log(f"step 3/4: aoti_load_package done ({time.time()-t:.1f}s)") |
| 101 | +
|
| 102 | +# Step 4: inference |
| 103 | +log("step 4/4: inference ...") |
| 104 | +t = time.time() |
| 105 | +y = encoder_aoti(*example_inputs, **example_kwargs) |
| 106 | +log(f"step 4/4: inference done ({time.time()-t:.1f}s)") |
| 107 | +
|
| 108 | +# Verify |
| 109 | +eager_res = encoder.forward(*example_inputs, **example_kwargs) |
| 110 | +torch.testing.assert_close(y, eager_res, rtol=1e-4, atol=1e-4) |
| 111 | +log("PASS — results match") |
| 112 | +PYEOF |
| 113 | + |
| 114 | +# ---------- run with background watchdog ---------- |
| 115 | +# Start the test |
| 116 | +${CONDA_RUN} --no-capture-output python /tmp/aoti_diag.py & |
| 117 | +TEST_PID=$! |
| 118 | + |
| 119 | +# Watchdog: every 60s, if the test is still running, sample the native stack. |
| 120 | +( |
| 121 | + while kill -0 "$TEST_PID" 2>/dev/null; do |
| 122 | + sleep 60 |
| 123 | + if kill -0 "$TEST_PID" 2>/dev/null; then |
| 124 | + echo "" |
| 125 | + echo "===== WATCHDOG: native stack sample ($(date)) =====" |
| 126 | + # sample captures C/C++ call stacks on macOS |
| 127 | + sample "$TEST_PID" 1 2>&1 | head -200 || true |
| 128 | + echo "===== END WATCHDOG =====" |
| 129 | + echo "" |
| 130 | + fi |
| 131 | + done |
| 132 | +) & |
| 133 | +WATCHDOG_PID=$! |
| 134 | + |
| 135 | +# Wait for test, propagate exit code |
| 136 | +wait "$TEST_PID" |
| 137 | +EXIT_CODE=$? |
| 138 | + |
| 139 | +# Clean up watchdog |
| 140 | +kill "$WATCHDOG_PID" 2>/dev/null || true |
| 141 | +wait "$WATCHDOG_PID" 2>/dev/null || true |
| 142 | + |
| 143 | +echo "Test exited with code $EXIT_CODE" |
| 144 | +exit $EXIT_CODE |
0 commit comments