diff --git a/.github/scripts/ci/merge_gate_wait.sh b/.github/scripts/ci/merge_gate_wait.sh new file mode 100755 index 00000000..6a56ddf1 --- /dev/null +++ b/.github/scripts/ci/merge_gate_wait.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# merge_gate_wait.sh -- poll the GitHub Checks API for an expected required +# check on a given SHA and emit a single pass/fail verdict. Used by +# .github/workflows/merge-gate.yml as the orchestrator's core logic. +# +# Why this script exists: +# GitHub's required-status-checks model is name-based, not workflow-based. +# When the underlying workflow fails to dispatch (transient webhook +# delivery failure on `pull_request`), the required check stays in +# "Expected -- Waiting" forever and the PR is silently stuck. This script +# turns that ambiguous yellow into an unambiguous red after a bounded +# liveness window, so reviewers see a real failure with a real message. +# +# Inputs (environment variables): +# GH_TOKEN required. Token with `checks:read` for the repo. +# REPO required. owner/repo (e.g. microsoft/apm). +# SHA required. Head SHA of the PR. +# EXPECTED_CHECK optional. Check-run name to wait for. +# Default: "Build & Test (Linux)". +# TIMEOUT_MIN optional. Total wall-clock budget in minutes. +# Default: 30. +# POLL_SEC optional. Poll interval in seconds. Default: 30. +# +# Exit codes: +# 0 expected check completed with conclusion success | skipped | neutral +# 1 expected check completed with a failing conclusion +# 2 expected check never appeared within TIMEOUT_MIN (THE BUG we catch) +# 3 expected check appeared but did not complete within TIMEOUT_MIN +# 4 invalid arguments / environment + +set -euo pipefail + +EXPECTED_CHECK="${EXPECTED_CHECK:-Build & Test (Linux)}" +TIMEOUT_MIN="${TIMEOUT_MIN:-30}" +POLL_SEC="${POLL_SEC:-30}" + +if [ -z "${GH_TOKEN:-}" ] || [ -z "${REPO:-}" ] || [ -z "${SHA:-}" ]; then + echo "ERROR: GH_TOKEN, REPO, and SHA are required." >&2 + exit 4 +fi + +if ! command -v gh >/dev/null 2>&1; then + echo "ERROR: gh CLI is required." >&2 + exit 4 +fi + +if ! command -v jq >/dev/null 2>&1; then + echo "ERROR: jq is required." >&2 + exit 4 +fi + +deadline=$(( $(date +%s) + TIMEOUT_MIN * 60 )) +poll_count=0 +ever_seen="false" + +echo "[merge-gate] waiting for check '${EXPECTED_CHECK}' on ${REPO}@${SHA}" +echo "[merge-gate] timeout=${TIMEOUT_MIN}m poll=${POLL_SEC}s" + +while [ "$(date +%s)" -lt "$deadline" ]; do + poll_count=$((poll_count + 1)) + + # Filter by check-run name server-side. Most-recent check-run is first. + payload=$(gh api \ + -H "Accept: application/vnd.github+json" \ + "repos/${REPO}/commits/${SHA}/check-runs?check_name=$(jq -rn --arg n "$EXPECTED_CHECK" '$n|@uri')&per_page=10" \ + 2>/dev/null) || payload='{"check_runs":[]}' + + total=$(echo "$payload" | jq '.check_runs | length' 2>/dev/null || echo 0) + case "$total" in + ''|*[!0-9]*) total=0 ;; + esac + + if [ "$total" -gt 0 ]; then + ever_seen="true" + # Take the most recently started run for this name. + status=$(echo "$payload" | jq -r '.check_runs | sort_by(.started_at) | reverse | .[0].status') + conclusion=$(echo "$payload" | jq -r '.check_runs | sort_by(.started_at) | reverse | .[0].conclusion') + url=$(echo "$payload" | jq -r '.check_runs | sort_by(.started_at) | reverse | .[0].html_url') + + echo "[merge-gate] poll #${poll_count}: status=${status} conclusion=${conclusion}" + + if [ "$status" = "completed" ]; then + echo "[merge-gate] tier 1 finished: ${conclusion}" + echo "[merge-gate] details: ${url}" + case "$conclusion" in + success|skipped|neutral) + exit 0 + ;; + *) + echo "::error title=Tier 1 failed::'${EXPECTED_CHECK}' reported '${conclusion}'. See ${url}" + exit 1 + ;; + esac + fi + else + echo "[merge-gate] poll #${poll_count}: '${EXPECTED_CHECK}' not yet present" + fi + + sleep "$POLL_SEC" +done + +if [ "$ever_seen" = "false" ]; then + cat <&2 +::error title=Tier 1 never started::The required check '${EXPECTED_CHECK}' did not appear for SHA ${SHA} within ${TIMEOUT_MIN} minutes. + +This usually indicates a transient GitHub Actions webhook delivery failure for the 'pull_request' event. Recovery: + 1. Push an empty commit to retrigger: git commit --allow-empty -m 'ci: retrigger' && git push + 2. If that fails, close and reopen the PR. + +This gate (Merge Gate) catches the failure mode so it surfaces as a clear red check instead of a stuck 'Expected -- Waiting'. See .github/workflows/merge-gate.yml. +EOF + exit 2 +fi + +echo "::error title=Tier 1 timeout::Build & Test (Linux) appeared but did not complete within ${TIMEOUT_MIN} minutes." >&2 +exit 3 diff --git a/.github/workflows/merge-gate.yml b/.github/workflows/merge-gate.yml new file mode 100644 index 00000000..1605f8d4 --- /dev/null +++ b/.github/workflows/merge-gate.yml @@ -0,0 +1,119 @@ +# Merge Gate -- shadow-mode orchestrator that aggregates required PR checks +# into a single verdict and turns "stuck pull_request webhook" failures into +# loud red checks instead of silent yellow "Expected -- Waiting" forever. +# +# Why this file exists: +# GitHub's required-status-checks model is name-based, not workflow-based. +# We tier our CI: ci.yml runs Tier 1 on `pull_request` and emits +# `Build & Test (Linux)`, while ci-integration-pr-stub.yml stubs the four +# Tier 2 checks via `pull_request_target`. That asymmetry means required +# checks depend on TWO independent webhook deliveries succeeding. If the +# `pull_request` event is dropped (transient, observed on PR #856), 4/5 +# stubs go green and the 5th hangs in "Expected -- Waiting" indefinitely. +# +# This workflow eventually replaces the per-test required checks with a +# single `Merge Gate / gate` check that: +# - dispatches via two redundant triggers (pull_request + +# pull_request_target) so a single dropped delivery is recoverable; +# - polls the Checks API for the real Tier 1 check and aggregates; +# - times out cleanly with a clear error message if Tier 1 never fires +# (the bug we are catching); +# - is the SOLE required check after rollout, decoupling branch +# protection from workflow topology. +# +# Rollout plan: +# Phase 1 (this PR): workflow runs in shadow mode -- not required. +# Observe behaviour on real PRs for >=1 week. +# Phase 2 (post-merge): flip branch protection to require only +# `Merge Gate / gate` and drop the four stub names. +# Stub workflow can then be deleted. +# +# Security: +# `pull_request_target` is used here for redundancy ONLY. This workflow +# never checks out PR code, never interpolates PR data into `run:`, and +# has read-only token permissions. The classic +# pull_request_target+checkout(head) exploit is impossible by construction. +# See ci-integration-pr-stub.yml for the same security model. + +name: Merge Gate + +on: + pull_request: + branches: [ main ] + paths-ignore: + - 'docs/**' + - '.gitignore' + - 'LICENSE' + pull_request_target: + branches: [ main ] + types: [ opened, synchronize, reopened, ready_for_review ] + paths-ignore: + - 'docs/**' + - '.gitignore' + - 'LICENSE' + +# Single in-flight gate per PR. If both pull_request and pull_request_target +# fire (the happy redundant case), the second one cancels the first. +concurrency: + group: merge-gate-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + checks: read + pull-requests: read + +jobs: + gate: + name: gate + runs-on: ubuntu-24.04 + timeout-minutes: 35 + steps: + # On pull_request we can safely checkout PR head: the runner has no + # secrets and a read-only token. Under pull_request_target we MUST NOT + # checkout PR head -- we fetch from the base branch via API instead. + - name: Checkout PR head (pull_request only) + if: github.event_name == 'pull_request' + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + sparse-checkout: | + .github/scripts/ci + + - name: Fetch script from base (pull_request_target only) + if: github.event_name == 'pull_request_target' + env: + GH_TOKEN: ${{ github.token }} + run: | + mkdir -p .github/scripts/ci + # Self-bootstrap: if the script does not yet exist on base (i.e. this + # is the PR adding the script), degrade to a no-op that passes. Once + # the script lands on main, this branch becomes a real gate. + status=$(curl -fsSL -o .github/scripts/ci/merge_gate_wait.sh \ + --retry 5 --retry-delay 3 --retry-connrefused --max-time 30 \ + -w '%{http_code}' \ + -H "Authorization: Bearer ${GH_TOKEN}" \ + -H "Accept: application/vnd.github.raw" \ + "https://api.github.com/repos/${{ github.repository }}/contents/.github/scripts/ci/merge_gate_wait.sh?ref=${GITHUB_BASE_REF}" \ + || echo "404") + if [ "$status" = "404" ] || [ ! -s .github/scripts/ci/merge_gate_wait.sh ]; then + echo "::warning::merge_gate_wait.sh not found on base ref '${GITHUB_BASE_REF}' yet -- self-bootstrap pass." + cat > .github/scripts/ci/merge_gate_wait.sh <<'BOOTSTRAP' + #!/usr/bin/env bash + echo "[merge-gate] self-bootstrap pass: script not yet on base" + exit 0 + BOOTSTRAP + fi + chmod +x .github/scripts/ci/merge_gate_wait.sh + + - name: Wait for Tier 1 (Build & Test Linux) + env: + GH_TOKEN: ${{ github.token }} + REPO: ${{ github.repository }} + SHA: ${{ github.event.pull_request.head.sha }} + EXPECTED_CHECK: 'Build & Test (Linux)' + TIMEOUT_MIN: '30' + POLL_SEC: '30' + run: | + chmod +x .github/scripts/ci/merge_gate_wait.sh + .github/scripts/ci/merge_gate_wait.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4900e1eb..d33ac4d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `enterprise/governance-guide.md` documentation page: flagship governance reference for CISO / VPE / Platform Tech Lead audiences, covering enforcement points, bypass contract, failure semantics, air-gapped operation, rollout playbook, and known gaps. Trims duplicated content in `governance.md`, `apm-policy.md`, and `integrations/github-rulesets.md`. Adds `templates/apm-policy-starter.yml`. (#851) - `apm install` now supports Azure DevOps AAD bearer-token auth via `az account get-access-token`, with PAT-first fallback for orgs that disable PAT creation. Closes #852 (#856) +- New CI safety net: `merge-gate.yml` orchestrator turns dropped `pull_request` webhook deliveries into clear red checks instead of stuck `Expected -- Waiting for status to be reported`. Triggers on both `pull_request` and `pull_request_target` for redundancy. (#865) (PR follow-up to #856 CI flake) ## [0.9.1] - 2026-04-22