From ea7df733de27a00d6c0b7696c4e6020cd8ec7195 Mon Sep 17 00:00:00 2001 From: danielmeppiel Date: Thu, 23 Apr 2026 09:48:51 +0200 Subject: [PATCH 1/4] ci: add merge-gate orchestrator (shadow) + stuck-PR watchdog PR #856 surfaced a structural CI fragility: required status checks are satisfied by two independent webhook channels (pull_request emits 'Build & Test (Linux)', pull_request_target emits the four Tier 2 stubs). When the pull_request delivery is dropped, 4/5 stubs go green and the 5th hangs in 'Expected -- Waiting' forever -- ambiguous yellow indistinguishable from a slow build. Recovery is folklore. This PR ships two safety nets in shadow mode: * .github/workflows/merge-gate.yml + scripts/ci/merge_gate_wait.sh Single orchestrator that polls the Checks API for 'Build & Test (Linux)' on the PR head SHA and aggregates into one verdict. Triggers on BOTH pull_request and pull_request_target so a single dropped delivery is recoverable; concurrency dedupes. Times out cleanly with a clear error message if Tier 1 never dispatched -- turning the invisible failure into a loud red check. NOT YET REQUIRED -- shadow observation first, ruleset flip after merge. * .github/workflows/watchdog-stuck-prs.yml + scripts/ci/watchdog_scan.sh Cron every 15 min. For open non-draft PRs with no ci.yml run on the head SHA AND non-paths-ignored files, posts one recovery comment. Backstop for any required check that stops dispatching. Tested live (dry-run) against microsoft/apm: watchdog correctly distinguishes stuck PRs (#853, #409) from docs-only PRs (#864, #461, #825). Both scripts shellcheck-clean. merge_gate_wait.sh tested end-to-end against PR #856 head SHA (success path, exit 0) and a non-existent SHA (timeout path, exit 2 with clear error annotation). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/scripts/ci/merge_gate_wait.sh | 116 ++++++++++++++++ .github/scripts/ci/watchdog_scan.sh | 164 +++++++++++++++++++++++ .github/workflows/merge-gate.yml | 90 +++++++++++++ .github/workflows/watchdog-stuck-prs.yml | 55 ++++++++ CHANGELOG.md | 1 + 5 files changed, 426 insertions(+) create mode 100755 .github/scripts/ci/merge_gate_wait.sh create mode 100755 .github/scripts/ci/watchdog_scan.sh create mode 100644 .github/workflows/merge-gate.yml create mode 100644 .github/workflows/watchdog-stuck-prs.yml diff --git a/.github/scripts/ci/merge_gate_wait.sh b/.github/scripts/ci/merge_gate_wait.sh new file mode 100755 index 00000000..99098dc6 --- /dev/null +++ b/.github/scripts/ci/merge_gate_wait.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# merge_gate_wait.sh -- poll the GitHub Checks API for an expected required +# check on a given SHA and emit a single pass/fail verdict. Used by +# .github/workflows/merge-gate.yml as the orchestrator's core logic. +# +# Why this script exists: +# GitHub's required-status-checks model is name-based, not workflow-based. +# When the underlying workflow fails to dispatch (transient webhook +# delivery failure on `pull_request`), the required check stays in +# "Expected -- Waiting" forever and the PR is silently stuck. This script +# turns that ambiguous yellow into an unambiguous red after a bounded +# liveness window, so reviewers see a real failure with a real message. +# +# Inputs (environment variables): +# GH_TOKEN required. Token with `checks:read` for the repo. +# REPO required. owner/repo (e.g. microsoft/apm). +# SHA required. Head SHA of the PR. +# EXPECTED_CHECK optional. Check-run name to wait for. +# Default: "Build & Test (Linux)". +# TIMEOUT_MIN optional. Total wall-clock budget in minutes. +# Default: 30. +# POLL_SEC optional. Poll interval in seconds. Default: 30. +# +# Exit codes: +# 0 expected check completed with conclusion success | skipped | neutral +# 1 expected check completed with a failing conclusion +# 2 expected check never appeared within TIMEOUT_MIN (THE BUG we catch) +# 3 expected check appeared but did not complete within TIMEOUT_MIN +# 4 invalid arguments / environment + +set -euo pipefail + +EXPECTED_CHECK="${EXPECTED_CHECK:-Build & Test (Linux)}" +TIMEOUT_MIN="${TIMEOUT_MIN:-30}" +POLL_SEC="${POLL_SEC:-30}" + +if [ -z "${GH_TOKEN:-}" ] || [ -z "${REPO:-}" ] || [ -z "${SHA:-}" ]; then + echo "ERROR: GH_TOKEN, REPO, and SHA are required." >&2 + exit 4 +fi + +if ! command -v gh >/dev/null 2>&1; then + echo "ERROR: gh CLI is required." >&2 + exit 4 +fi + +if ! command -v jq >/dev/null 2>&1; then + echo "ERROR: jq is required." >&2 + exit 4 +fi + +deadline=$(( $(date +%s) + TIMEOUT_MIN * 60 )) +poll_count=0 +ever_seen="false" + +echo "[merge-gate] waiting for check '${EXPECTED_CHECK}' on ${REPO}@${SHA}" +echo "[merge-gate] timeout=${TIMEOUT_MIN}m poll=${POLL_SEC}s" + +while [ "$(date +%s)" -lt "$deadline" ]; do + poll_count=$((poll_count + 1)) + + # Filter by check-run name server-side. Most-recent check-run is first. + payload=$(gh api \ + -H "Accept: application/vnd.github+json" \ + "repos/${REPO}/commits/${SHA}/check-runs?check_name=$(jq -rn --arg n "$EXPECTED_CHECK" '$n|@uri')&per_page=10" \ + 2>/dev/null) || payload='{"check_runs":[]}' + + total=$(echo "$payload" | jq '.check_runs | length' 2>/dev/null || echo 0) + case "$total" in + ''|*[!0-9]*) total=0 ;; + esac + + if [ "$total" -gt 0 ]; then + ever_seen="true" + # Take the most recently started run for this name. + status=$(echo "$payload" | jq -r '.check_runs | sort_by(.started_at) | reverse | .[0].status') + conclusion=$(echo "$payload" | jq -r '.check_runs | sort_by(.started_at) | reverse | .[0].conclusion') + url=$(echo "$payload" | jq -r '.check_runs | sort_by(.started_at) | reverse | .[0].html_url') + + echo "[merge-gate] poll #${poll_count}: status=${status} conclusion=${conclusion}" + + if [ "$status" = "completed" ]; then + echo "[merge-gate] tier 1 finished: ${conclusion}" + echo "[merge-gate] details: ${url}" + case "$conclusion" in + success|skipped|neutral) + exit 0 + ;; + *) + echo "::error title=Tier 1 failed::Build & Test (Linux) reported '${conclusion}'. See ${url}" + exit 1 + ;; + esac + fi + else + echo "[merge-gate] poll #${poll_count}: '${EXPECTED_CHECK}' not yet present" + fi + + sleep "$POLL_SEC" +done + +if [ "$ever_seen" = "false" ]; then + cat <&2 +::error title=Tier 1 never started::The required check '${EXPECTED_CHECK}' did not appear for SHA ${SHA} within ${TIMEOUT_MIN} minutes. + +This usually indicates a transient GitHub Actions webhook delivery failure for the 'pull_request' event. Recovery: + 1. Push an empty commit to retrigger: git commit --allow-empty -m 'ci: retrigger' && git push + 2. If that fails, close and reopen the PR. + +This gate (Merge Gate) catches the failure mode so it surfaces as a clear red check instead of a stuck 'Expected -- Waiting'. See .github/workflows/merge-gate.yml. +EOF + exit 2 +fi + +echo "::error title=Tier 1 timeout::Build & Test (Linux) appeared but did not complete within ${TIMEOUT_MIN} minutes." >&2 +exit 3 diff --git a/.github/scripts/ci/watchdog_scan.sh b/.github/scripts/ci/watchdog_scan.sh new file mode 100755 index 00000000..71e28dcc --- /dev/null +++ b/.github/scripts/ci/watchdog_scan.sh @@ -0,0 +1,164 @@ +#!/usr/bin/env bash +# watchdog_scan.sh -- scan open PRs for the "stuck pull_request webhook" +# failure mode and post a single recovery comment per stuck PR. +# +# Used by .github/workflows/watchdog-stuck-prs.yml on a 15-minute cron. +# Acts as a safety net while merge-gate.yml rolls out (and remains useful +# afterwards as a backstop for any required check that stops dispatching). +# +# Detection rule: +# For each open PR not in draft: +# - take last-update age in minutes (proxy for "time since last push") +# - if age is in the [WATCHDOG_MIN_AGE_MIN, WATCHDOG_MAX_AGE_MIN] window +# - AND the head SHA has zero check-runs named EXPECTED_CHECK +# then post the recovery comment exactly once (idempotency via a marker +# string `WATCHDOG_STUCK_WEBHOOK_MARKER` embedded in the comment body). +# +# Inputs (environment variables): +# GH_TOKEN required. Token with pull-requests:write, checks:read. +# REPO required. owner/repo. +# EXPECTED_CHECK optional. Default: "Build & Test (Linux)". +# WATCHDOG_MIN_AGE_MIN optional. Default: 15. +# WATCHDOG_MAX_AGE_MIN optional. Default: 1440 (1 day). +# WATCHDOG_DRY_RUN optional. If "1", log decisions but post no comment. +# +# Exit code is always 0 unless the script itself errors -- one stuck PR +# should not fail the whole scan. + +set -euo pipefail + +EXPECTED_CHECK="${EXPECTED_CHECK:-Build & Test (Linux)}" +MIN_AGE_MIN="${WATCHDOG_MIN_AGE_MIN:-15}" +MAX_AGE_MIN="${WATCHDOG_MAX_AGE_MIN:-1440}" +DRY_RUN="${WATCHDOG_DRY_RUN:-0}" +MARKER="WATCHDOG_STUCK_WEBHOOK_MARKER" + +if [ -z "${GH_TOKEN:-}" ] || [ -z "${REPO:-}" ]; then + echo "ERROR: GH_TOKEN and REPO are required." >&2 + exit 1 +fi + +now_ts=$(date -u +%s) + +# Use updated_at as the "last activity" proxy. headRefOid changes on every +# push; updated_at changes on every push, comment, label etc. The combination +# is fine for this safety-net heuristic. +prs=$(gh pr list --repo "$REPO" --state open --limit 200 \ + --json number,headRefOid,updatedAt,isDraft,title 2>/dev/null || echo '[]') + +count_total=$(echo "$prs" | jq 'length') +count_stuck=0 +count_commented=0 + +echo "[watchdog] scanning ${count_total} open PRs in ${REPO}" +echo "[watchdog] expected_check='${EXPECTED_CHECK}' window=[${MIN_AGE_MIN}m..${MAX_AGE_MIN}m] dry_run=${DRY_RUN}" + +while read -r pr; do + number=$(echo "$pr" | jq -r '.number') + sha=$(echo "$pr" | jq -r '.headRefOid') + updated=$(echo "$pr" | jq -r '.updatedAt') + is_draft=$(echo "$pr" | jq -r '.isDraft') + title=$(echo "$pr" | jq -r '.title') + + if [ "$is_draft" = "true" ]; then + continue + fi + + # Portable date parse (works on macOS BSD date and GNU date). + if updated_ts=$(date -u -d "$updated" +%s 2>/dev/null); then + : + elif updated_ts=$(date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$updated" +%s 2>/dev/null); then + : + else + echo "[watchdog] PR #${number}: cannot parse updated_at='${updated}', skipping" >&2 + continue + fi + + age_min=$(( (now_ts - updated_ts) / 60 )) + + if [ "$age_min" -lt "$MIN_AGE_MIN" ] || [ "$age_min" -gt "$MAX_AGE_MIN" ]; then + continue + fi + + present=$(gh api \ + -H "Accept: application/vnd.github+json" \ + "repos/${REPO}/commits/${sha}/check-runs?check_name=$(jq -rn --arg n "$EXPECTED_CHECK" '$n|@uri')&per_page=1" \ + --jq '.check_runs | length' 2>/dev/null || echo "0") + + if [ "$present" != "0" ]; then + continue + fi + + # Distinguish "stuck webhook" from "legitimately skipped": + # 1. If ci.yml has ANY workflow run for this SHA, it dispatched (good). + workflow_runs=$(gh api \ + -H "Accept: application/vnd.github+json" \ + "repos/${REPO}/actions/workflows/ci.yml/runs?head_sha=${sha}&per_page=1" \ + --jq '.total_count' 2>/dev/null || echo "0") + + if [ "$workflow_runs" != "0" ]; then + continue + fi + + # 2. If all changed files match ci.yml's paths-ignore, the workflow was + # correctly suppressed and no run is expected. + # Keep this list in sync with .github/workflows/ci.yml `paths-ignore`. + non_ignored=$(gh api \ + -H "Accept: application/vnd.github+json" \ + "repos/${REPO}/pulls/${number}/files?per_page=300" \ + --jq '[.[].filename] | map(select( + (startswith("docs/") | not) + and . != ".gitignore" + and . != "LICENSE" + )) | length' 2>/dev/null || echo "0") + + if [ "$non_ignored" = "0" ]; then + continue + fi + + count_stuck=$((count_stuck + 1)) + echo "[watchdog] PR #${number} ('${title}') looks stuck: age=${age_min}m sha=${sha} non_ignored_files=${non_ignored}" + + # Idempotency: do not double-comment. + has_comment=$(gh api \ + -H "Accept: application/vnd.github+json" \ + "repos/${REPO}/issues/${number}/comments?per_page=100" \ + --jq "[.[] | select(.body | contains(\"${MARKER}\"))] | length" 2>/dev/null || echo "0") + + if [ "$has_comment" != "0" ]; then + echo "[watchdog] PR #${number}: already commented, skipping" + continue + fi + + if [ "$DRY_RUN" = "1" ]; then + echo "[watchdog] PR #${number}: DRY_RUN, would have commented" + continue + fi + + sha_short="${sha:0:8}" + body_file=$(mktemp) + # shellcheck disable=SC2016 # backticks here are markdown, not command subst + { + printf '\n' "$MARKER" + printf ':warning: **Stuck CI webhook detected**\n\n' + printf 'This PR'\''s head commit (`%s`) has no `%s` check-run after %s minutes. ' "$sha_short" "$EXPECTED_CHECK" "$age_min" + printf 'This usually means a GitHub Actions webhook delivery for the `pull_request` event was dropped and never recovered.\n\n' + printf '**Recovery:** push an empty commit to retrigger:\n\n' + printf '```bash\n' + printf 'git commit --allow-empty -m '\''ci: retrigger'\''\n' + printf 'git push\n' + printf '```\n\n' + printf 'If that does not help, close and reopen the PR.\n\n' + printf 'This comment is posted by `.github/workflows/watchdog-stuck-prs.yml`. It will not be repeated for this PR. See `.github/workflows/merge-gate.yml` for the orchestrator that aims to make this failure mode self-healing.\n' + } > "$body_file" + + if gh pr comment "$number" --repo "$REPO" --body-file "$body_file" >/dev/null 2>&1; then + count_commented=$((count_commented + 1)) + echo "[watchdog] PR #${number}: comment posted" + else + echo "[watchdog] PR #${number}: failed to post comment" >&2 + fi + rm -f "$body_file" +done < <(echo "$prs" | jq -c '.[]') + +echo "[watchdog] done. stuck=${count_stuck} commented=${count_commented}" diff --git a/.github/workflows/merge-gate.yml b/.github/workflows/merge-gate.yml new file mode 100644 index 00000000..23d03420 --- /dev/null +++ b/.github/workflows/merge-gate.yml @@ -0,0 +1,90 @@ +# Merge Gate -- shadow-mode orchestrator that aggregates required PR checks +# into a single verdict and turns "stuck pull_request webhook" failures into +# loud red checks instead of silent yellow "Expected -- Waiting" forever. +# +# Why this file exists: +# GitHub's required-status-checks model is name-based, not workflow-based. +# We tier our CI: ci.yml runs Tier 1 on `pull_request` and emits +# `Build & Test (Linux)`, while ci-integration-pr-stub.yml stubs the four +# Tier 2 checks via `pull_request_target`. That asymmetry means required +# checks depend on TWO independent webhook deliveries succeeding. If the +# `pull_request` event is dropped (transient, observed on PR #856), 4/5 +# stubs go green and the 5th hangs in "Expected -- Waiting" indefinitely. +# +# This workflow eventually replaces the per-test required checks with a +# single `Merge Gate / gate` check that: +# - dispatches via two redundant triggers (pull_request + +# pull_request_target) so a single dropped delivery is recoverable; +# - polls the Checks API for the real Tier 1 check and aggregates; +# - times out cleanly with a clear error message if Tier 1 never fires +# (the bug we are catching); +# - is the SOLE required check after rollout, decoupling branch +# protection from workflow topology. +# +# Rollout plan: +# Phase 1 (this PR): workflow runs in shadow mode -- not required. +# Observe behaviour on real PRs for >=1 week. +# Phase 2 (post-merge): flip branch protection to require only +# `Merge Gate / gate` and drop the four stub names. +# Stub workflow can then be deleted. +# +# Security: +# `pull_request_target` is used here for redundancy ONLY. This workflow +# never checks out PR code, never interpolates PR data into `run:`, and +# has read-only token permissions. The classic +# pull_request_target+checkout(head) exploit is impossible by construction. +# See ci-integration-pr-stub.yml for the same security model. + +name: Merge Gate + +on: + pull_request: + branches: [ main ] + paths-ignore: + - 'docs/**' + - '.gitignore' + - 'LICENSE' + pull_request_target: + branches: [ main ] + types: [ opened, synchronize, reopened, ready_for_review ] + paths-ignore: + - 'docs/**' + - '.gitignore' + - 'LICENSE' + +# Single in-flight gate per PR. If both pull_request and pull_request_target +# fire (the happy redundant case), the second one cancels the first. +concurrency: + group: merge-gate-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + checks: read + pull-requests: read + +jobs: + gate: + name: gate + runs-on: ubuntu-24.04 + timeout-minutes: 35 + steps: + - name: Wait for Tier 1 (Build & Test Linux) + env: + GH_TOKEN: ${{ github.token }} + REPO: ${{ github.repository }} + SHA: ${{ github.event.pull_request.head.sha }} + EXPECTED_CHECK: 'Build & Test (Linux)' + TIMEOUT_MIN: '30' + POLL_SEC: '30' + run: | + # The script lives in the repo. We intentionally do NOT actions/checkout + # under pull_request_target -- instead we fetch the script from the base + # branch via the API. This preserves the no-checkout security guarantee. + curl -fsSL \ + -H "Authorization: Bearer ${GH_TOKEN}" \ + -H "Accept: application/vnd.github.raw" \ + "https://api.github.com/repos/${REPO}/contents/.github/scripts/ci/merge_gate_wait.sh?ref=${GITHUB_BASE_REF}" \ + -o /tmp/merge_gate_wait.sh + chmod +x /tmp/merge_gate_wait.sh + /tmp/merge_gate_wait.sh diff --git a/.github/workflows/watchdog-stuck-prs.yml b/.github/workflows/watchdog-stuck-prs.yml new file mode 100644 index 00000000..22d46645 --- /dev/null +++ b/.github/workflows/watchdog-stuck-prs.yml @@ -0,0 +1,55 @@ +# Watchdog -- scheduled scan for "stuck pull_request webhook" PRs. +# +# Why this file exists: +# GitHub Actions occasionally drops a `pull_request` webhook delivery and +# never recovers, leaving the required check `Build & Test (Linux)` in +# "Expected -- Waiting" forever for that PR. Recovery is folklore (push an +# empty commit, or close+reopen). This workflow detects the condition on +# any open PR and posts a single recovery comment so contributors are not +# silently blocked. +# +# It complements `merge-gate.yml`, which makes new PRs self-healing. +# The watchdog is a safety net for: (a) PRs opened before merge-gate is +# required, and (b) any future required check that stops dispatching. + +name: Watchdog Stuck PRs + +on: + schedule: + # Every 15 minutes. GitHub may delay schedule triggers under load; that + # is fine -- the watchdog is a backstop, not a primary control loop. + - cron: '*/15 * * * *' + workflow_dispatch: + inputs: + dry_run: + description: 'Log decisions but do not post comments' + required: false + default: 'false' + +permissions: + contents: read + pull-requests: write + checks: read + +concurrency: + group: watchdog-stuck-prs + cancel-in-progress: false + +jobs: + scan: + runs-on: ubuntu-24.04 + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github/scripts/ci + + - name: Scan open PRs + env: + GH_TOKEN: ${{ github.token }} + REPO: ${{ github.repository }} + WATCHDOG_DRY_RUN: ${{ github.event.inputs.dry_run == 'true' && '1' || '0' }} + run: | + chmod +x .github/scripts/ci/watchdog_scan.sh + .github/scripts/ci/watchdog_scan.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4900e1eb..57588548 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `enterprise/governance-guide.md` documentation page: flagship governance reference for CISO / VPE / Platform Tech Lead audiences, covering enforcement points, bypass contract, failure semantics, air-gapped operation, rollout playbook, and known gaps. Trims duplicated content in `governance.md`, `apm-policy.md`, and `integrations/github-rulesets.md`. Adds `templates/apm-policy-starter.yml`. (#851) - `apm install` now supports Azure DevOps AAD bearer-token auth via `az account get-access-token`, with PAT-first fallback for orgs that disable PAT creation. Closes #852 (#856) +- New CI safety nets: `merge-gate.yml` orchestrator turns dropped `pull_request` webhook deliveries into clear red checks (currently shadow mode), and `watchdog-stuck-prs.yml` posts a recovery comment on PRs whose required check never dispatched. (PR follow-up to #856 CI flake) ## [0.9.1] - 2026-04-22 From ab363b268cca53d1fe0d90d50d7d23e8f98f22e7 Mon Sep 17 00:00:00 2001 From: danielmeppiel Date: Thu, 23 Apr 2026 09:51:47 +0200 Subject: [PATCH 2/4] ci(merge-gate): handle self-bootstrap and use checkout on pull_request Two fixes for the script-fetch step: 1. On 'pull_request' the runner has no secrets and a read-only token, so actions/checkout of PR head is safe -- use it for simplicity. We only need API-fetch under 'pull_request_target' where checkout would be a security risk. 2. On 'pull_request_target', when the script does not yet exist on base (i.e. THIS PR is the one adding it), curl returns 404 and we degrade to a self-bootstrap no-op pass instead of failing. Once the script lands on main, the gate becomes real. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/merge-gate.yml | 48 +++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/.github/workflows/merge-gate.yml b/.github/workflows/merge-gate.yml index 23d03420..4ec43a08 100644 --- a/.github/workflows/merge-gate.yml +++ b/.github/workflows/merge-gate.yml @@ -69,6 +69,42 @@ jobs: runs-on: ubuntu-24.04 timeout-minutes: 35 steps: + # On pull_request we can safely checkout PR head: the runner has no + # secrets and a read-only token. Under pull_request_target we MUST NOT + # checkout PR head -- we fetch from the base branch via API instead. + - name: Checkout PR head (pull_request only) + if: github.event_name == 'pull_request' + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + sparse-checkout: | + .github/scripts/ci + + - name: Fetch script from base (pull_request_target only) + if: github.event_name == 'pull_request_target' + env: + GH_TOKEN: ${{ github.token }} + run: | + mkdir -p .github/scripts/ci + # Self-bootstrap: if the script does not yet exist on base (i.e. this + # is the PR adding the script), degrade to a no-op that passes. Once + # the script lands on main, this branch becomes a real gate. + status=$(curl -fsSL -o .github/scripts/ci/merge_gate_wait.sh \ + -w '%{http_code}' \ + -H "Authorization: Bearer ${GH_TOKEN}" \ + -H "Accept: application/vnd.github.raw" \ + "https://api.github.com/repos/${{ github.repository }}/contents/.github/scripts/ci/merge_gate_wait.sh?ref=${GITHUB_BASE_REF}" \ + || echo "404") + if [ "$status" = "404" ] || [ ! -s .github/scripts/ci/merge_gate_wait.sh ]; then + echo "::warning::merge_gate_wait.sh not found on base ref '${GITHUB_BASE_REF}' yet -- self-bootstrap pass." + cat > .github/scripts/ci/merge_gate_wait.sh <<'BOOTSTRAP' + #!/usr/bin/env bash + echo "[merge-gate] self-bootstrap pass: script not yet on base" + exit 0 + BOOTSTRAP + fi + chmod +x .github/scripts/ci/merge_gate_wait.sh + - name: Wait for Tier 1 (Build & Test Linux) env: GH_TOKEN: ${{ github.token }} @@ -78,13 +114,5 @@ jobs: TIMEOUT_MIN: '30' POLL_SEC: '30' run: | - # The script lives in the repo. We intentionally do NOT actions/checkout - # under pull_request_target -- instead we fetch the script from the base - # branch via the API. This preserves the no-checkout security guarantee. - curl -fsSL \ - -H "Authorization: Bearer ${GH_TOKEN}" \ - -H "Accept: application/vnd.github.raw" \ - "https://api.github.com/repos/${REPO}/contents/.github/scripts/ci/merge_gate_wait.sh?ref=${GITHUB_BASE_REF}" \ - -o /tmp/merge_gate_wait.sh - chmod +x /tmp/merge_gate_wait.sh - /tmp/merge_gate_wait.sh + chmod +x .github/scripts/ci/merge_gate_wait.sh + .github/scripts/ci/merge_gate_wait.sh From 123e1930b153fdb8b70632deff417322b506152d Mon Sep 17 00:00:00 2001 From: danielmeppiel Date: Thu, 23 Apr 2026 10:00:41 +0200 Subject: [PATCH 3/4] ci: address Copilot review feedback on PR #865 - merge_gate_wait.sh: use $EXPECTED_CHECK in failure annotation instead of hardcoded 'Build & Test (Linux)' so the orchestrator stays generic. - merge-gate.yml: add curl --retry/--max-time on the script bootstrap fetch so a transient GitHub API blip does not redden the gate. - watchdog_scan.sh: fail loudly with stderr capture if 'gh pr list' errors out, instead of silently treating it as zero PRs (which would hide auth regressions or rate limiting). - watchdog_scan.sh: paginate the changed-files API call so PRs touching >100 files cannot be misclassified as docs-only and skipped. - CHANGELOG: append (#865) to follow the repo convention. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/scripts/ci/merge_gate_wait.sh | 2 +- .github/scripts/ci/watchdog_scan.sh | 26 +++++++++++++++++--------- .github/workflows/merge-gate.yml | 1 + CHANGELOG.md | 2 +- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/.github/scripts/ci/merge_gate_wait.sh b/.github/scripts/ci/merge_gate_wait.sh index 99098dc6..6a56ddf1 100755 --- a/.github/scripts/ci/merge_gate_wait.sh +++ b/.github/scripts/ci/merge_gate_wait.sh @@ -87,7 +87,7 @@ while [ "$(date +%s)" -lt "$deadline" ]; do exit 0 ;; *) - echo "::error title=Tier 1 failed::Build & Test (Linux) reported '${conclusion}'. See ${url}" + echo "::error title=Tier 1 failed::'${EXPECTED_CHECK}' reported '${conclusion}'. See ${url}" exit 1 ;; esac diff --git a/.github/scripts/ci/watchdog_scan.sh b/.github/scripts/ci/watchdog_scan.sh index 71e28dcc..742227c1 100755 --- a/.github/scripts/ci/watchdog_scan.sh +++ b/.github/scripts/ci/watchdog_scan.sh @@ -43,8 +43,15 @@ now_ts=$(date -u +%s) # Use updated_at as the "last activity" proxy. headRefOid changes on every # push; updated_at changes on every push, comment, label etc. The combination # is fine for this safety-net heuristic. -prs=$(gh pr list --repo "$REPO" --state open --limit 200 \ - --json number,headRefOid,updatedAt,isDraft,title 2>/dev/null || echo '[]') +prs_stderr=$(mktemp) +if ! prs=$(gh pr list --repo "$REPO" --state open --limit 200 \ + --json number,headRefOid,updatedAt,isDraft,title 2>"$prs_stderr"); then + echo "::error title=Watchdog scan failed::Could not list open PRs in ${REPO}. Stderr below." >&2 + cat "$prs_stderr" >&2 + rm -f "$prs_stderr" + exit 1 +fi +rm -f "$prs_stderr" count_total=$(echo "$prs" | jq 'length') count_stuck=0 @@ -103,14 +110,15 @@ while read -r pr; do # 2. If all changed files match ci.yml's paths-ignore, the workflow was # correctly suppressed and no run is expected. # Keep this list in sync with .github/workflows/ci.yml `paths-ignore`. - non_ignored=$(gh api \ + # Use --paginate to handle PRs touching >100 files. + non_ignored=$(gh api --paginate \ -H "Accept: application/vnd.github+json" \ - "repos/${REPO}/pulls/${number}/files?per_page=300" \ - --jq '[.[].filename] | map(select( - (startswith("docs/") | not) - and . != ".gitignore" - and . != "LICENSE" - )) | length' 2>/dev/null || echo "0") + "repos/${REPO}/pulls/${number}/files" \ + --jq '.[].filename' 2>/dev/null \ + | awk '!/^docs\// && $0 != ".gitignore" && $0 != "LICENSE"' \ + | wc -l \ + | tr -d ' ') + case "$non_ignored" in ''|*[!0-9]*) non_ignored=0 ;; esac if [ "$non_ignored" = "0" ]; then continue diff --git a/.github/workflows/merge-gate.yml b/.github/workflows/merge-gate.yml index 4ec43a08..1605f8d4 100644 --- a/.github/workflows/merge-gate.yml +++ b/.github/workflows/merge-gate.yml @@ -90,6 +90,7 @@ jobs: # is the PR adding the script), degrade to a no-op that passes. Once # the script lands on main, this branch becomes a real gate. status=$(curl -fsSL -o .github/scripts/ci/merge_gate_wait.sh \ + --retry 5 --retry-delay 3 --retry-connrefused --max-time 30 \ -w '%{http_code}' \ -H "Authorization: Bearer ${GH_TOKEN}" \ -H "Accept: application/vnd.github.raw" \ diff --git a/CHANGELOG.md b/CHANGELOG.md index 57588548..aeb8f41b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `enterprise/governance-guide.md` documentation page: flagship governance reference for CISO / VPE / Platform Tech Lead audiences, covering enforcement points, bypass contract, failure semantics, air-gapped operation, rollout playbook, and known gaps. Trims duplicated content in `governance.md`, `apm-policy.md`, and `integrations/github-rulesets.md`. Adds `templates/apm-policy-starter.yml`. (#851) - `apm install` now supports Azure DevOps AAD bearer-token auth via `az account get-access-token`, with PAT-first fallback for orgs that disable PAT creation. Closes #852 (#856) -- New CI safety nets: `merge-gate.yml` orchestrator turns dropped `pull_request` webhook deliveries into clear red checks (currently shadow mode), and `watchdog-stuck-prs.yml` posts a recovery comment on PRs whose required check never dispatched. (PR follow-up to #856 CI flake) +- New CI safety nets: `merge-gate.yml` orchestrator turns dropped `pull_request` webhook deliveries into clear red checks (currently shadow mode), and `watchdog-stuck-prs.yml` posts a recovery comment on PRs whose required check never dispatched. (#865) (PR follow-up to #856 CI flake) ## [0.9.1] - 2026-04-22 From eac27d44d678d8508ebcfa397486bb98b628986b Mon Sep 17 00:00:00 2001 From: danielmeppiel Date: Thu, 23 Apr 2026 10:07:22 +0200 Subject: [PATCH 4/4] ci: drop watchdog -- gate's dual-trigger redundancy is sufficient The watchdog (cron every 15min, posts recovery comments on stuck PRs) was originally justified for the shadow-mode transition window. Since we are flipping to required immediately after this PR merges, that justification disappears. The merge-gate workflow already triggers on both 'pull_request' and 'pull_request_target' with concurrency dedup, so a single dropped webhook still fires the gate. The watchdog only added value for the double-drop case (both webhook channels fail for the same PR), which is vanishingly rare. We can add it back later if we ever observe one. Removes: - .github/workflows/watchdog-stuck-prs.yml - .github/scripts/ci/watchdog_scan.sh Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/scripts/ci/watchdog_scan.sh | 172 ----------------------- .github/workflows/watchdog-stuck-prs.yml | 55 -------- CHANGELOG.md | 2 +- 3 files changed, 1 insertion(+), 228 deletions(-) delete mode 100755 .github/scripts/ci/watchdog_scan.sh delete mode 100644 .github/workflows/watchdog-stuck-prs.yml diff --git a/.github/scripts/ci/watchdog_scan.sh b/.github/scripts/ci/watchdog_scan.sh deleted file mode 100755 index 742227c1..00000000 --- a/.github/scripts/ci/watchdog_scan.sh +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env bash -# watchdog_scan.sh -- scan open PRs for the "stuck pull_request webhook" -# failure mode and post a single recovery comment per stuck PR. -# -# Used by .github/workflows/watchdog-stuck-prs.yml on a 15-minute cron. -# Acts as a safety net while merge-gate.yml rolls out (and remains useful -# afterwards as a backstop for any required check that stops dispatching). -# -# Detection rule: -# For each open PR not in draft: -# - take last-update age in minutes (proxy for "time since last push") -# - if age is in the [WATCHDOG_MIN_AGE_MIN, WATCHDOG_MAX_AGE_MIN] window -# - AND the head SHA has zero check-runs named EXPECTED_CHECK -# then post the recovery comment exactly once (idempotency via a marker -# string `WATCHDOG_STUCK_WEBHOOK_MARKER` embedded in the comment body). -# -# Inputs (environment variables): -# GH_TOKEN required. Token with pull-requests:write, checks:read. -# REPO required. owner/repo. -# EXPECTED_CHECK optional. Default: "Build & Test (Linux)". -# WATCHDOG_MIN_AGE_MIN optional. Default: 15. -# WATCHDOG_MAX_AGE_MIN optional. Default: 1440 (1 day). -# WATCHDOG_DRY_RUN optional. If "1", log decisions but post no comment. -# -# Exit code is always 0 unless the script itself errors -- one stuck PR -# should not fail the whole scan. - -set -euo pipefail - -EXPECTED_CHECK="${EXPECTED_CHECK:-Build & Test (Linux)}" -MIN_AGE_MIN="${WATCHDOG_MIN_AGE_MIN:-15}" -MAX_AGE_MIN="${WATCHDOG_MAX_AGE_MIN:-1440}" -DRY_RUN="${WATCHDOG_DRY_RUN:-0}" -MARKER="WATCHDOG_STUCK_WEBHOOK_MARKER" - -if [ -z "${GH_TOKEN:-}" ] || [ -z "${REPO:-}" ]; then - echo "ERROR: GH_TOKEN and REPO are required." >&2 - exit 1 -fi - -now_ts=$(date -u +%s) - -# Use updated_at as the "last activity" proxy. headRefOid changes on every -# push; updated_at changes on every push, comment, label etc. The combination -# is fine for this safety-net heuristic. -prs_stderr=$(mktemp) -if ! prs=$(gh pr list --repo "$REPO" --state open --limit 200 \ - --json number,headRefOid,updatedAt,isDraft,title 2>"$prs_stderr"); then - echo "::error title=Watchdog scan failed::Could not list open PRs in ${REPO}. Stderr below." >&2 - cat "$prs_stderr" >&2 - rm -f "$prs_stderr" - exit 1 -fi -rm -f "$prs_stderr" - -count_total=$(echo "$prs" | jq 'length') -count_stuck=0 -count_commented=0 - -echo "[watchdog] scanning ${count_total} open PRs in ${REPO}" -echo "[watchdog] expected_check='${EXPECTED_CHECK}' window=[${MIN_AGE_MIN}m..${MAX_AGE_MIN}m] dry_run=${DRY_RUN}" - -while read -r pr; do - number=$(echo "$pr" | jq -r '.number') - sha=$(echo "$pr" | jq -r '.headRefOid') - updated=$(echo "$pr" | jq -r '.updatedAt') - is_draft=$(echo "$pr" | jq -r '.isDraft') - title=$(echo "$pr" | jq -r '.title') - - if [ "$is_draft" = "true" ]; then - continue - fi - - # Portable date parse (works on macOS BSD date and GNU date). - if updated_ts=$(date -u -d "$updated" +%s 2>/dev/null); then - : - elif updated_ts=$(date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$updated" +%s 2>/dev/null); then - : - else - echo "[watchdog] PR #${number}: cannot parse updated_at='${updated}', skipping" >&2 - continue - fi - - age_min=$(( (now_ts - updated_ts) / 60 )) - - if [ "$age_min" -lt "$MIN_AGE_MIN" ] || [ "$age_min" -gt "$MAX_AGE_MIN" ]; then - continue - fi - - present=$(gh api \ - -H "Accept: application/vnd.github+json" \ - "repos/${REPO}/commits/${sha}/check-runs?check_name=$(jq -rn --arg n "$EXPECTED_CHECK" '$n|@uri')&per_page=1" \ - --jq '.check_runs | length' 2>/dev/null || echo "0") - - if [ "$present" != "0" ]; then - continue - fi - - # Distinguish "stuck webhook" from "legitimately skipped": - # 1. If ci.yml has ANY workflow run for this SHA, it dispatched (good). - workflow_runs=$(gh api \ - -H "Accept: application/vnd.github+json" \ - "repos/${REPO}/actions/workflows/ci.yml/runs?head_sha=${sha}&per_page=1" \ - --jq '.total_count' 2>/dev/null || echo "0") - - if [ "$workflow_runs" != "0" ]; then - continue - fi - - # 2. If all changed files match ci.yml's paths-ignore, the workflow was - # correctly suppressed and no run is expected. - # Keep this list in sync with .github/workflows/ci.yml `paths-ignore`. - # Use --paginate to handle PRs touching >100 files. - non_ignored=$(gh api --paginate \ - -H "Accept: application/vnd.github+json" \ - "repos/${REPO}/pulls/${number}/files" \ - --jq '.[].filename' 2>/dev/null \ - | awk '!/^docs\// && $0 != ".gitignore" && $0 != "LICENSE"' \ - | wc -l \ - | tr -d ' ') - case "$non_ignored" in ''|*[!0-9]*) non_ignored=0 ;; esac - - if [ "$non_ignored" = "0" ]; then - continue - fi - - count_stuck=$((count_stuck + 1)) - echo "[watchdog] PR #${number} ('${title}') looks stuck: age=${age_min}m sha=${sha} non_ignored_files=${non_ignored}" - - # Idempotency: do not double-comment. - has_comment=$(gh api \ - -H "Accept: application/vnd.github+json" \ - "repos/${REPO}/issues/${number}/comments?per_page=100" \ - --jq "[.[] | select(.body | contains(\"${MARKER}\"))] | length" 2>/dev/null || echo "0") - - if [ "$has_comment" != "0" ]; then - echo "[watchdog] PR #${number}: already commented, skipping" - continue - fi - - if [ "$DRY_RUN" = "1" ]; then - echo "[watchdog] PR #${number}: DRY_RUN, would have commented" - continue - fi - - sha_short="${sha:0:8}" - body_file=$(mktemp) - # shellcheck disable=SC2016 # backticks here are markdown, not command subst - { - printf '\n' "$MARKER" - printf ':warning: **Stuck CI webhook detected**\n\n' - printf 'This PR'\''s head commit (`%s`) has no `%s` check-run after %s minutes. ' "$sha_short" "$EXPECTED_CHECK" "$age_min" - printf 'This usually means a GitHub Actions webhook delivery for the `pull_request` event was dropped and never recovered.\n\n' - printf '**Recovery:** push an empty commit to retrigger:\n\n' - printf '```bash\n' - printf 'git commit --allow-empty -m '\''ci: retrigger'\''\n' - printf 'git push\n' - printf '```\n\n' - printf 'If that does not help, close and reopen the PR.\n\n' - printf 'This comment is posted by `.github/workflows/watchdog-stuck-prs.yml`. It will not be repeated for this PR. See `.github/workflows/merge-gate.yml` for the orchestrator that aims to make this failure mode self-healing.\n' - } > "$body_file" - - if gh pr comment "$number" --repo "$REPO" --body-file "$body_file" >/dev/null 2>&1; then - count_commented=$((count_commented + 1)) - echo "[watchdog] PR #${number}: comment posted" - else - echo "[watchdog] PR #${number}: failed to post comment" >&2 - fi - rm -f "$body_file" -done < <(echo "$prs" | jq -c '.[]') - -echo "[watchdog] done. stuck=${count_stuck} commented=${count_commented}" diff --git a/.github/workflows/watchdog-stuck-prs.yml b/.github/workflows/watchdog-stuck-prs.yml deleted file mode 100644 index 22d46645..00000000 --- a/.github/workflows/watchdog-stuck-prs.yml +++ /dev/null @@ -1,55 +0,0 @@ -# Watchdog -- scheduled scan for "stuck pull_request webhook" PRs. -# -# Why this file exists: -# GitHub Actions occasionally drops a `pull_request` webhook delivery and -# never recovers, leaving the required check `Build & Test (Linux)` in -# "Expected -- Waiting" forever for that PR. Recovery is folklore (push an -# empty commit, or close+reopen). This workflow detects the condition on -# any open PR and posts a single recovery comment so contributors are not -# silently blocked. -# -# It complements `merge-gate.yml`, which makes new PRs self-healing. -# The watchdog is a safety net for: (a) PRs opened before merge-gate is -# required, and (b) any future required check that stops dispatching. - -name: Watchdog Stuck PRs - -on: - schedule: - # Every 15 minutes. GitHub may delay schedule triggers under load; that - # is fine -- the watchdog is a backstop, not a primary control loop. - - cron: '*/15 * * * *' - workflow_dispatch: - inputs: - dry_run: - description: 'Log decisions but do not post comments' - required: false - default: 'false' - -permissions: - contents: read - pull-requests: write - checks: read - -concurrency: - group: watchdog-stuck-prs - cancel-in-progress: false - -jobs: - scan: - runs-on: ubuntu-24.04 - timeout-minutes: 10 - steps: - - uses: actions/checkout@v4 - with: - sparse-checkout: | - .github/scripts/ci - - - name: Scan open PRs - env: - GH_TOKEN: ${{ github.token }} - REPO: ${{ github.repository }} - WATCHDOG_DRY_RUN: ${{ github.event.inputs.dry_run == 'true' && '1' || '0' }} - run: | - chmod +x .github/scripts/ci/watchdog_scan.sh - .github/scripts/ci/watchdog_scan.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index aeb8f41b..d33ac4d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `enterprise/governance-guide.md` documentation page: flagship governance reference for CISO / VPE / Platform Tech Lead audiences, covering enforcement points, bypass contract, failure semantics, air-gapped operation, rollout playbook, and known gaps. Trims duplicated content in `governance.md`, `apm-policy.md`, and `integrations/github-rulesets.md`. Adds `templates/apm-policy-starter.yml`. (#851) - `apm install` now supports Azure DevOps AAD bearer-token auth via `az account get-access-token`, with PAT-first fallback for orgs that disable PAT creation. Closes #852 (#856) -- New CI safety nets: `merge-gate.yml` orchestrator turns dropped `pull_request` webhook deliveries into clear red checks (currently shadow mode), and `watchdog-stuck-prs.yml` posts a recovery comment on PRs whose required check never dispatched. (#865) (PR follow-up to #856 CI flake) +- New CI safety net: `merge-gate.yml` orchestrator turns dropped `pull_request` webhook deliveries into clear red checks instead of stuck `Expected -- Waiting for status to be reported`. Triggers on both `pull_request` and `pull_request_target` for redundancy. (#865) (PR follow-up to #856 CI flake) ## [0.9.1] - 2026-04-22