microsoft · danielmeppiel · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026
@@ -0,0 +1,116 @@
+#!/usr/bin/env bash
+# merge_gate_wait.sh -- poll the GitHub Checks API for an expected required
+# check on a given SHA and emit a single pass/fail verdict. Used by
+# .github/workflows/merge-gate.yml as the orchestrator's core logic.
+#
+# Why this script exists:
+#   GitHub's required-status-checks model is name-based, not workflow-based.
+#   When the underlying workflow fails to dispatch (transient webhook
+#   delivery failure on `pull_request`), the required check stays in
+#   "Expected -- Waiting" forever and the PR is silently stuck. This script
+#   turns that ambiguous yellow into an unambiguous red after a bounded
+#   liveness window, so reviewers see a real failure with a real message.
+#
+# Inputs (environment variables):
+#   GH_TOKEN          required. Token with `checks:read` for the repo.
+#   REPO              required. owner/repo (e.g. microsoft/apm).
+#   SHA               required. Head SHA of the PR.
+#   EXPECTED_CHECK    optional. Check-run name to wait for.
+#                     Default: "Build & Test (Linux)".
+#   TIMEOUT_MIN       optional. Total wall-clock budget in minutes.
+#                     Default: 30.
+#   POLL_SEC          optional. Poll interval in seconds. Default: 30.
+#
+# Exit codes:
+#   0  expected check completed with conclusion success | skipped | neutral
+#   1  expected check completed with a failing conclusion
+#   2  expected check never appeared within TIMEOUT_MIN (THE BUG we catch)
+#   3  expected check appeared but did not complete within TIMEOUT_MIN
+#   4  invalid arguments / environment
+
+set -euo pipefail
+
+EXPECTED_CHECK="${EXPECTED_CHECK:-Build & Test (Linux)}"
+TIMEOUT_MIN="${TIMEOUT_MIN:-30}"
+POLL_SEC="${POLL_SEC:-30}"
+
+if [ -z "${GH_TOKEN:-}" ] || [ -z "${REPO:-}" ] || [ -z "${SHA:-}" ]; then
+  echo "ERROR: GH_TOKEN, REPO, and SHA are required." >&2
+  exit 4
+fi
+
+if ! command -v gh >/dev/null 2>&1; then
+  echo "ERROR: gh CLI is required." >&2
+  exit 4
+fi
+
+if ! command -v jq >/dev/null 2>&1; then
+  echo "ERROR: jq is required." >&2
+  exit 4
+fi
+
+deadline=$(( $(date +%s) + TIMEOUT_MIN * 60 ))
+poll_count=0
+ever_seen="false"
+
+echo "[merge-gate] waiting for check '${EXPECTED_CHECK}' on ${REPO}@${SHA}"
+echo "[merge-gate] timeout=${TIMEOUT_MIN}m poll=${POLL_SEC}s"
+
+while [ "$(date +%s)" -lt "$deadline" ]; do
+  poll_count=$((poll_count + 1))
+
+  # Filter by check-run name server-side. Most-recent check-run is first.
+  payload=$(gh api \
+    -H "Accept: application/vnd.github+json" \
+    "repos/${REPO}/commits/${SHA}/check-runs?check_name=$(jq -rn --arg n "$EXPECTED_CHECK" '$n|@uri')&per_page=10" \
+    2>/dev/null) || payload='{"check_runs":[]}'
+
+  total=$(echo "$payload" | jq '.check_runs | length' 2>/dev/null || echo 0)
+  case "$total" in
+    ''|*[!0-9]*) total=0 ;;
+  esac
+
+  if [ "$total" -gt 0 ]; then
+    ever_seen="true"
+    # Take the most recently started run for this name.
+    status=$(echo "$payload" | jq -r '.check_runs | sort_by(.started_at) | reverse | .[0].status')
+    conclusion=$(echo "$payload" | jq -r '.check_runs | sort_by(.started_at) | reverse | .[0].conclusion')
+    url=$(echo "$payload" | jq -r '.check_runs | sort_by(.started_at) | reverse | .[0].html_url')
+
+    echo "[merge-gate] poll #${poll_count}: status=${status} conclusion=${conclusion}"
+
+    if [ "$status" = "completed" ]; then
+      echo "[merge-gate] tier 1 finished: ${conclusion}"
+      echo "[merge-gate] details: ${url}"
+      case "$conclusion" in
+        success|skipped|neutral)
+          exit 0
+          ;;
+        *)
+          echo "::error title=Tier 1 failed::'${EXPECTED_CHECK}' reported '${conclusion}'. See ${url}"
+          exit 1
+          ;;
+      esac
+    fi
+  else
+    echo "[merge-gate] poll #${poll_count}: '${EXPECTED_CHECK}' not yet present"
+  fi
+
+  sleep "$POLL_SEC"
+done
+
+if [ "$ever_seen" = "false" ]; then
+  cat <<EOF >&2
+::error title=Tier 1 never started::The required check '${EXPECTED_CHECK}' did not appear for SHA ${SHA} within ${TIMEOUT_MIN} minutes.
+
+This usually indicates a transient GitHub Actions webhook delivery failure for the 'pull_request' event. Recovery:
+  1. Push an empty commit to retrigger:  git commit --allow-empty -m 'ci: retrigger' && git push
+  2. If that fails, close and reopen the PR.
+
+This gate (Merge Gate) catches the failure mode so it surfaces as a clear red check instead of a stuck 'Expected -- Waiting'. See .github/workflows/merge-gate.yml.
+EOF
+  exit 2
+fi
+
+echo "::error title=Tier 1 timeout::Build & Test (Linux) appeared but did not complete within ${TIMEOUT_MIN} minutes." >&2
+exit 3
@@ -0,0 +1,119 @@
+# Merge Gate -- shadow-mode orchestrator that aggregates required PR checks
+# into a single verdict and turns "stuck pull_request webhook" failures into
+# loud red checks instead of silent yellow "Expected -- Waiting" forever.
+#
+# Why this file exists:
+#   GitHub's required-status-checks model is name-based, not workflow-based.
+#   We tier our CI: ci.yml runs Tier 1 on `pull_request` and emits
+#   `Build & Test (Linux)`, while ci-integration-pr-stub.yml stubs the four
+#   Tier 2 checks via `pull_request_target`. That asymmetry means required
+#   checks depend on TWO independent webhook deliveries succeeding. If the
+#   `pull_request` event is dropped (transient, observed on PR #856), 4/5
+#   stubs go green and the 5th hangs in "Expected -- Waiting" indefinitely.
+#
+#   This workflow eventually replaces the per-test required checks with a
+#   single `Merge Gate / gate` check that:
+#     - dispatches via two redundant triggers (pull_request +
+#       pull_request_target) so a single dropped delivery is recoverable;
+#     - polls the Checks API for the real Tier 1 check and aggregates;
+#     - times out cleanly with a clear error message if Tier 1 never fires
+#       (the bug we are catching);
+#     - is the SOLE required check after rollout, decoupling branch
+#       protection from workflow topology.
+#
+# Rollout plan:
+#   Phase 1 (this PR):    workflow runs in shadow mode -- not required.
+#                         Observe behaviour on real PRs for >=1 week.
+#   Phase 2 (post-merge): flip branch protection to require only
+#                         `Merge Gate / gate` and drop the four stub names.
+#                         Stub workflow can then be deleted.
+#
+# Security:
+#   `pull_request_target` is used here for redundancy ONLY. This workflow
+#   never checks out PR code, never interpolates PR data into `run:`, and
+#   has read-only token permissions. The classic
+#   pull_request_target+checkout(head) exploit is impossible by construction.
+#   See ci-integration-pr-stub.yml for the same security model.
+
+name: Merge Gate
+
+on:
+  pull_request:
+    branches: [ main ]
+    paths-ignore:
+      - 'docs/**'
+      - '.gitignore'
+      - 'LICENSE'
+  pull_request_target:
+    branches: [ main ]
+    types: [ opened, synchronize, reopened, ready_for_review ]
+    paths-ignore:
+      - 'docs/**'
+      - '.gitignore'
+      - 'LICENSE'
+
+# Single in-flight gate per PR. If both pull_request and pull_request_target
+# fire (the happy redundant case), the second one cancels the first.
+concurrency:
+  group: merge-gate-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  checks: read
+  pull-requests: read
+
+jobs:
+  gate:
+    name: gate
+    runs-on: ubuntu-24.04
+    timeout-minutes: 35
+    steps:
+      # On pull_request we can safely checkout PR head: the runner has no
+      # secrets and a read-only token. Under pull_request_target we MUST NOT
+      # checkout PR head -- we fetch from the base branch via API instead.
+      - name: Checkout PR head (pull_request only)
+        if: github.event_name == 'pull_request'
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          sparse-checkout: |
+            .github/scripts/ci
+
+      - name: Fetch script from base (pull_request_target only)
+        if: github.event_name == 'pull_request_target'
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          mkdir -p .github/scripts/ci
+          # Self-bootstrap: if the script does not yet exist on base (i.e. this
+          # is the PR adding the script), degrade to a no-op that passes. Once
+          # the script lands on main, this branch becomes a real gate.
+          status=$(curl -fsSL -o .github/scripts/ci/merge_gate_wait.sh \
+            --retry 5 --retry-delay 3 --retry-connrefused --max-time 30 \
+            -w '%{http_code}' \
+            -H "Authorization: Bearer ${GH_TOKEN}" \
+            -H "Accept: application/vnd.github.raw" \
+            "https://api.github.com/repos/${{ github.repository }}/contents/.github/scripts/ci/merge_gate_wait.sh?ref=${GITHUB_BASE_REF}" \
+            || echo "404")
+          if [ "$status" = "404" ] || [ ! -s .github/scripts/ci/merge_gate_wait.sh ]; then
+            echo "::warning::merge_gate_wait.sh not found on base ref '${GITHUB_BASE_REF}' yet -- self-bootstrap pass."
+            cat > .github/scripts/ci/merge_gate_wait.sh <<'BOOTSTRAP'
+          #!/usr/bin/env bash
+          echo "[merge-gate] self-bootstrap pass: script not yet on base"
+          exit 0
+          BOOTSTRAP
+          fi
+          chmod +x .github/scripts/ci/merge_gate_wait.sh
+
+      - name: Wait for Tier 1 (Build & Test Linux)
+        env:
+          GH_TOKEN: ${{ github.token }}
+          REPO: ${{ github.repository }}
+          SHA: ${{ github.event.pull_request.head.sha }}
+          EXPECTED_CHECK: 'Build & Test (Linux)'
+          TIMEOUT_MIN: '30'
+          POLL_SEC: '30'
+        run: |
+          chmod +x .github/scripts/ci/merge_gate_wait.sh
+          .github/scripts/ci/merge_gate_wait.sh
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - New `enterprise/governance-guide.md` documentation page: flagship governance reference for CISO / VPE / Platform Tech Lead audiences, covering enforcement points, bypass contract, failure semantics, air-gapped operation, rollout playbook, and known gaps. Trims duplicated content in `governance.md`, `apm-policy.md`, and `integrations/github-rulesets.md`. Adds `templates/apm-policy-starter.yml`. (#851)
 - `apm install` now supports Azure DevOps AAD bearer-token auth via `az account get-access-token`, with PAT-first fallback for orgs that disable PAT creation. Closes #852 (#856)
+- New CI safety net: `merge-gate.yml` orchestrator turns dropped `pull_request` webhook deliveries into clear red checks instead of stuck `Expected -- Waiting for status to be reported`. Triggers on both `pull_request` and `pull_request_target` for redundancy. (#865) (PR follow-up to #856 CI flake)
 
 ## [0.9.1] - 2026-04-22